In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

import gym
from tqdm import tqdm_notebook
import numpy as np
from collections import deque
from copy import deepcopy

In [2]:
class ActorCriticNetwork(nn.Module):
    
    def __init__(self, obs_space, action_space):
        super(ActorCriticNetwork, self).__init__()

        self.actor = nn.Sequential(
                            nn.Linear(obs_space, 64),
                            nn.Tanh(),
                            nn.Linear(64, 64),
                            nn.Tanh(),
                            nn.Linear(64, action_space),
                            nn.Softmax(dim=1))


        self.critic = nn.Sequential(
                        nn.Linear(obs_space, 64),
                        nn.Tanh(),
                        nn.Linear(64, 64),
                        nn.Tanh(),
                        nn.Linear(64, 1))
        
    def forward(self):
        raise NotImplementedError
        
    def select_action(self, state):
        ''' Selects an action given current state
        Args:
        - network (Torch NN): network to process state
        - state (Array): Array of action space in an environment

        Return:
        - (int): action that is selected
        - (float): log probability of selecting that action given state and network
        '''
    
        #convert state to float tensor, add 1 dimension, allocate tensor on device
        state = torch.from_numpy(state).float().unsqueeze(0)

        #use network to predict action probabilities
        action_probs = self.actor(state)

        #sample an action using the probability distribution
        m = Categorical(action_probs)
        action = m.sample()

        #return action
        return action.item(), m.log_prob(action)
    
    def evaluate_action(self, states, actions):
        
        #convert state to float tensor, add 1 dimension, allocate tensor on device
        states_tensor = torch.stack([torch.from_numpy(state).float().unsqueeze(0) for state in states]).squeeze(1)

        #use network to predict action probabilities
        action_probs = self.actor(states_tensor)

        #get probability distribution
        m = Categorical(action_probs)

        #return log_prob and entropy
        return m.log_prob(torch.Tensor(actions)), m.entropy()
        


In [65]:
def process_rewards(rewards, terminals):
    ''' Converts our rewards history into cumulative discounted rewards
    Args:
    - rewards (Array): array of rewards 
    
    Returns:
    - G (Array): array of cumulative discounted rewards
    '''
    #Calculate Gt (cumulative discounted rewards)
    G = []
    
    #track cumulative reward
    total_r = 0
    
    #iterate rewards from Gt to G0
    for r, done in zip(reversed(rewards), reversed(terminals)):
        
        #Base case: G(T) = r(T)
        #Recursive: G(t) = r(t) + G(t+1)^DISCOUNT
        total_r = r + total_r * γ
        
        if done:
            total_r = r
        
        #add to front of G
        G.insert(0, total_r)
    
    #whitening rewards
    G = torch.tensor(G)
    G = (G - G.mean())/G.std()
    
    return G

In [85]:
def clipped_update(batch):
    #get items from trajectory
    states = [sample[0] for sample in batch]
    actions = [sample[1] for sample in batch]
    rewards = [sample[2] for sample in batch]
    old_lps = [sample[3] for sample in batch]
    terminals = [sample[4] for sample in batch]
    
    Gt = process_rewards(rewards, terminals)
    
    for epoch in range(K_epoch):
             
        #get ratio
        new_lps, entropies = actor_critic.evaluate_action(states, actions)
        
        ratios = torch.exp(new_lps - torch.Tensor(old_lps))
        
        #compute advantages
        states_tensor = torch.stack([torch.from_numpy(state).float().unsqueeze(0) for state in states]).squeeze(1)
        vals = actor_critic.critic(states_tensor).squeeze(1).detach()
        advantages = Gt - vals
        
        #clip surrogate objective
        surrogate1 = torch.clamp(ratios, min=1 - ϵ, max=1 + ϵ) * advantages
        surrogate2 = ratios * advantages
        
        #loss, flip signs since this is gradient descent
        loss =  -torch.min(surrogate1, surrogate2) + c1 * F.mse_loss(Gt, vals) - c2 * entropies
        
        optimizer.zero_grad()
        loss.mean().backward()
        optimizer.step()
        

In [86]:
#discount factor for future utilities
γ = 0.99

#number of steps to run
TRAIN_STEPS = 100000

#max steps per episode
MAX_STEPS = 500

#score agent needs for environment to be solved
SOLVED_SCORE = 195

#clipped surrogate constraint
ϵ = 0.2

#learning rates
α_θ = 0.0003
αv = 0.001

#K epochs update 
K_epoch = 40

#minibatch size
BATCH_SIZE = 1600

#value loss weight
c1 = 0.5

#entropy weight
c2 = 0.01

#device to run model on 
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [87]:
#Make environment
env = gym.make('CartPole-v1')

#seeds
np.random.seed(0)
env.seed(0)
torch.manual_seed(0)

#environment parameters
obs_space = env.observation_space.shape[0]
action_space = env.action_space.n

#Init network
actor_critic = ActorCriticNetwork(obs_space, action_space)


#Init optimizer
optimizer = torch.optim.Adam([
    {'params': actor_critic.actor.parameters(), 'lr': α_θ},
    {'params': actor_critic.critic.parameters(), 'lr': αv}
])

In [88]:
#track scores
scores = []

#recent 100 scores
recent_scores = deque(maxlen=100)

#minibatch
batch = []

#reset environment, initiable variables
state = env.reset()
curr_step = 0

#run training loop
for step in tqdm_notebook(range(1, TRAIN_STEPS)):
    
    #env.render()
    curr_step += 1

    #select action
    action, lp = actor_critic.select_action(state)

    #execute action
    new_state, reward, done, _ = env.step(action)

    #store into trajectory
    batch.append([state, action, reward, lp, done])

    #optimize surrogate
    if step % BATCH_SIZE == 0:
        clipped_update(batch)
        batch = []

    #end episode
    if done or curr_step >= MAX_STEPS:
        state = env.reset()
        curr_step = 0
        continue

    #move into new state
    state = new_state        

HBox(children=(IntProgress(value=0, max=99999), HTML(value='')))

KeyboardInterrupt: 

In [89]:
done = False
state = env.reset()
scores = []

for _ in tqdm_notebook(range(50)):
    state = env.reset()
    done = False
    score = 0
    while not done:
        #env.render()
        action, lp = actor_critic.select_action(state)
        new_state, reward, done, info = env.step(action)
        score += reward
        state = new_state
    scores.append(score)
env.close()

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [90]:
np.array(scores).mean()

315.72

In [72]:
env.close()