# Implementación del algoritmo de reinforce en el caso discreto y continuo

# Caso discreto

In [24]:
import math
import sys
import torch
import torch.nn.functional as F
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.utils as utils
import pdb
from torch.autograd import Variable
import gym
from gym import wrappers
import torch.optim as optim
import numpy as np

El primer paso es construir la red de política o policy network, generalmente consta de capas densas en donde se da la dimensión de entrada y la dimesión de salida de la red, calculan probabilidades 

In [3]:
class Pi(nn.Module):
    def __init__(self,hidden_size,num_inputs,action_space):
        super(Pi,self).__init__()
        self.action_space=action_space
        out_dim=action_space.n
        self.hidden_size=hidden_size
        self.num_inputs=num_inputs
        layers=[
            nn.Linear(self.num_inputs,self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size,out_dim),
            nn.Softmax(),  
        ]
        '''
        En el modelo descrito en el libro usan Categorical para sacar las probailidades de las
        acciones, en este caso usamos la función Soft Max
        Pi en este caso representa nuestra Policy Network
        '''
        self.model=nn.Sequential(*layers)
        self.train()
        
    def forward(self,x):
        out=self.model(x)
        return out
        

In [4]:
class REINFORCE:
    def __init__(self,action_space,hidden_size,num_inputs):
        self.action_space=action_space
        self.model=Pi(hidden_size,num_inputs,action_space)
        self.device="cuda:0" if torch.cuda.is_available() else "cpu"
        self.model=self.model.to(self.device) #Mandamos el modelo a cuda para entrenar
        self.model.train()
        self.optimizer=optim.Adam(self.model.parameters(),lr=1e-3) #Estamos optimizando el peso de la política
        #Usamos como optimizador descenso de gradiente estocástico
    def select_action(self,state):
        probs=self.model(Variable(state).to(self.device))
        action=probs.multinomial(1).data
        prob=probs[:,action[0,0]].view(1,-1)
        log_prob=prob.log()
        entropy=-(probs*probs.log()).sum()
        return action[0], log_prob, entropy
    def update_parameters(self,rewards,log_probs,entropy,gamma):
        '''
        En este caso queremos actualizar el Policy Gradient el cual necesita de las recompesas, la log
        probabilidad y el factor de descuento gamma
        '''
        R=torch.zeros(1,1)
        loss=0
        for t in reversed(range(len(rewards))):
            R=rewards[t]+gamma*R
            loss=loss- (log_probs[t]*(Variable(R).expand_as(log_probs[t])).to(self.device)).sum()-(0.00001*entropy[t].to(self.device)).sum()
        loss=loss/len(rewards)
        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(),40)
        self.optimizer.step()
            

Recordemos lo siguiente:

El "Policy gradient", se define como $\nabla_{\theta} J\left(\pi_{\theta}\right)=\sum_{t=0}^{T}R_{t}(\tau)\nabla_{\theta}log \pi_{\theta}(a_{t}|s_{t})$
Sin embargo usando la actualización por muestreo de Monte Carlo, la actualización toma la siguiete forma
$$ \nabla_{\theta} J(\pi_{\theta})= \nabla_{\theta} J(\pi_{\theta})+ R(\tau)\nabla_{\theta}log \pi_{\theta}(a_{t}|s_{t})$$ para algun tiempo $t \in \{0,...,T\}$

# Entrenamiento principal en ambientes de gym, CartPole V0

In [5]:
env_name="CartPole-v0"
gamma=0.99
hidden_size=128
env=gym.make(env_name)
#env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)
#Lo anterior para llevar un monitoreo del entrenamiento de las cosas
num_episodes=1000
num_steps=200
agent=REINFORCE(env.action_space,hidden_size,env.observation_space.shape[0])


In [6]:
if torch.cuda.is_available():
    print("Using CUDA")
else:
    print("Using CPU")

Using CUDA


In [7]:
for episode in range(num_episodes):
    state=torch.Tensor([env.reset()])
    log_probs=[]
    entropys=[]
    rewards=[]
    for t in range(num_steps):
        action,log_prob,entropy=agent.select_action(state)
        action=action.cpu()
        next_state,reward,done,_=env.step(action.numpy()[0]) #El step necesita una acción para llegar al siguiente estado y sacar recompensa
        #Actualizamos la entropia y las recompensas y las log probs
        log_probs.append(log_prob)
        rewards.append(reward)
        entropys.append(entropy)
        state=torch.Tensor([next_state])
        if done:
            break
    agent.update_parameters(rewards,log_probs,entropys,gamma)
    if episode % 10 ==0:
        print("Episode: {} Reward: {} Solved: {}".format(episode,np.sum(rewards),(np.sum(rewards)>195.0)))
    if np.sum(rewards)>195.0:
        print("Solved after {} episodes".format(episode))
        break
env.close()
        

Episode: 0 Reward: 20.0 Solved: False


  input = module(input)
  utils.clip_grad_norm(self.model.parameters(),40)


Episode: 10 Reward: 17.0 Solved: False
Episode: 20 Reward: 35.0 Solved: False
Episode: 30 Reward: 34.0 Solved: False
Episode: 40 Reward: 49.0 Solved: False
Episode: 50 Reward: 64.0 Solved: False
Episode: 60 Reward: 18.0 Solved: False
Episode: 70 Reward: 30.0 Solved: False
Episode: 80 Reward: 34.0 Solved: False
Episode: 90 Reward: 22.0 Solved: False
Episode: 100 Reward: 15.0 Solved: False
Episode: 110 Reward: 56.0 Solved: False
Episode: 120 Reward: 21.0 Solved: False
Episode: 130 Reward: 67.0 Solved: False
Episode: 140 Reward: 13.0 Solved: False
Episode: 150 Reward: 50.0 Solved: False
Episode: 160 Reward: 82.0 Solved: False
Episode: 170 Reward: 31.0 Solved: False
Episode: 180 Reward: 37.0 Solved: False
Episode: 190 Reward: 32.0 Solved: False
Episode: 200 Reward: 74.0 Solved: False
Episode: 210 Reward: 35.0 Solved: False
Episode: 220 Reward: 36.0 Solved: False
Episode: 230 Reward: 58.0 Solved: False
Episode: 240 Reward: 49.0 Solved: False
Episode: 250 Reward: 137.0 Solved: False
Episode:

# Caso del reinforce continuo

In [33]:
class NormalizedActions(gym.ActionWrapper):
    def action(self,action):
        action=(action+1)/2
        action*=(self.action_space.high-self.action_space.low)
        action+=self.action_space.low
        return action
    def reverse_action(self,action):
        '''
        Regresar la acción al estado original en el que se encontraba
        '''
        action-=self.action_space.low
        action/=(self.action_space.high-self.action_space.low)
        action=2*action-1
        return action

In [34]:
device="cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

In [35]:
pi = Variable(torch.FloatTensor([math.pi])).to(device)

def normal(x, mu, sigma_sq):
    a = (-1*(Variable(x)-mu).pow(2)/(2*sigma_sq)).exp()
    b = 1/(2*sigma_sq*pi.expand_as(sigma_sq)).sqrt()
    return a*b


class Pi(nn.Module):
    def __init__(self, hidden_size, num_inputs, action_space):
        super(Pi, self).__init__()
        self.action_space = action_space
        num_outputs = action_space.shape[0]
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_outputs)
        self.linear2_ = nn.Linear(hidden_size, num_outputs)

    def forward(self, inputs):
        x = inputs
        x = F.relu(self.linear1(x))
        mu = self.linear2(x)
        sigma_sq = self.linear2_(x)
        return mu, sigma_sq


class REINFORCE:
    def __init__(self, hidden_size, num_inputs, action_space):
        self.action_space = action_space
        self.device="cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = Pi(hidden_size, num_inputs, action_space)
        self.model = self.model.to(self.device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.model.train()

    def select_action(self, state):
        mu, sigma_sq = self.model(Variable(state).to(self.device))
        sigma_sq = F.softplus(sigma_sq)

        eps = torch.randn(mu.size())
        # Cálculo de la probabilidad
        action = (mu + sigma_sq.sqrt()*Variable(eps).to(self.device)).data
        prob = normal(action, mu, sigma_sq)
        entropy = -0.5*((sigma_sq+2*pi.expand_as(sigma_sq)).log()+1)

        log_prob = prob.log()
        return action, log_prob, entropy

    def update_parameters(self, rewards, log_probs, entropies, gamma):
        R = torch.zeros(1, 1)
        loss = 0
        for i in reversed(range(len(rewards))):
            R = gamma * R + rewards[i]
            loss = loss - (log_probs[i]*(Variable(R).expand_as(log_probs[i])).to(self.device)).sum() - (0.0001*entropies[i].to(self.device)).sum()
        loss = loss / len(rewards)
        
        self.optimizer.zero_grad()
        loss.backward()
        utils.clip_grad_norm(self.model.parameters(), 40)
        self.optimizer.step()

# Entrenamiento en el caso continuo, MountainCar Continuo

In [38]:
env_name="MountainCarContinuous-v0"
gamma=0.99
hidden_size=128
env=NormalizedActions(gym.make(env_name))
#env = wrappers.Monitor(env, '/tmp/{}-experiment'.format(env_name), force=True)
#Lo anterior para llevar un monitoreo del entrenamiento de las cosas
num_episodes=2000
num_steps=300
agent=REINFORCE(hidden_size,env.observation_space.shape[0],env.action_space)

In [39]:
for episode in range(num_episodes):
    state=torch.Tensor([env.reset()])
    log_probs=[]
    entropys=[]
    rewards=[]
    for t in range(num_steps):
        action,log_prob,entropy=agent.select_action(state)
        action=action.cpu()
        next_state,reward,done,_=env.step(action.numpy()[0]) #El step necesita una acción para llegar al siguiente estado y sacar recompensa
        #Actualizamos la entropia y las recompensas y las log probs
        log_probs.append(log_prob)
        rewards.append(reward)
        entropys.append(entropy)
        state=torch.Tensor([next_state])
        if done:
            break
    agent.update_parameters(rewards,log_probs,entropys,gamma)
    if episode % 10 ==0:
        print("Episode: {} Reward: {} Solved: {}".format(episode,np.sum(rewards),(np.sum(rewards)>11.0)))
    if np.sum(rewards)>11.0:
        print("Solved after {} episodes".format(episode))
        break
env.close()

  utils.clip_grad_norm(self.model.parameters(), 40)


Episode: 0 Reward: -21.53030062224301 Solved: False
Episode: 10 Reward: -18.442836828647568 Solved: False
Episode: 20 Reward: -23.25645587279383 Solved: False
Episode: 30 Reward: -18.877414309498697 Solved: False
Episode: 40 Reward: -18.643845774046664 Solved: False
Episode: 50 Reward: -18.061640365677405 Solved: False
Episode: 60 Reward: -17.344878157426674 Solved: False
Episode: 70 Reward: -15.547342176223921 Solved: False
Episode: 80 Reward: -13.197227439207397 Solved: False
Episode: 90 Reward: -14.042457991930837 Solved: False
Episode: 100 Reward: -10.182006949983064 Solved: False
Episode: 110 Reward: -9.806403044352603 Solved: False
Episode: 120 Reward: -12.840906211515104 Solved: False
Episode: 130 Reward: -11.295072663695077 Solved: False
Episode: 140 Reward: -11.493682242199947 Solved: False
Episode: 150 Reward: -11.141050188545345 Solved: False
Episode: 160 Reward: -9.288206109559809 Solved: False
Episode: 170 Reward: -10.000767696831534 Solved: False
Episode: 180 Reward: -9.1

Episode: 1490 Reward: -0.847191070951713 Solved: False
Episode: 1500 Reward: -0.6323860259255594 Solved: False
Episode: 1510 Reward: -1.4765843479124223 Solved: False
Episode: 1520 Reward: -0.8400208729946925 Solved: False
Episode: 1530 Reward: -1.2137844002792404 Solved: False
Episode: 1540 Reward: -0.39234168875051023 Solved: False
Episode: 1550 Reward: -0.17940806536787618 Solved: False
Episode: 1560 Reward: -0.2198252265990327 Solved: False
Episode: 1570 Reward: -0.20198192292178307 Solved: False
Episode: 1580 Reward: -0.1904529071012341 Solved: False
Episode: 1590 Reward: -0.1933106280967795 Solved: False
Episode: 1600 Reward: -0.23789964220708948 Solved: False
Episode: 1610 Reward: -0.30482488483199754 Solved: False
Episode: 1620 Reward: -1.252616409809096 Solved: False
Episode: 1630 Reward: -0.7092384351960124 Solved: False
Episode: 1640 Reward: -0.31732735232961995 Solved: False
Episode: 1650 Reward: -0.8905022929070437 Solved: False
Episode: 1660 Reward: -0.9281521388123125 So