https://www.tensorflow.org/tutorials/reinforcement_learning/actor_critic
actor critic using value function as a baseline.(vanilla actor critic)

https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py


In [125]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.distributions.categorical import Categorical

In [131]:
import gym
render = True
log_interval = 1

In [127]:
class ActorCriticNetwork(nn.Module):
    def __init__(self, beta, input_dims, n_actions, name='actor-critic', fc1_dims=128):
        super().__init__()
        self.beta = beta
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.name = name
        
        # fc1 is the common graph
        self.fc1 = nn.Linear(self.input_dims[0], self.fc1_dims)
        self.val = nn.Linear(self.fc1_dims, 1)
        self.act = nn.Linear(self.fc1_dims, n_actions)
        
        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        
        self.to(self.device)
    
    def forward(self, state):
        common = self.fc1(state)
        common = F.relu(common)
        state_val = self.val(common)
        act_dist = F.softmax(self.act(common), dim=1)
        
        return act_dist, state_val

In [128]:
# agent.py
class Agent:
    def __init__(self, beta, input_dims=[8], env=None, gamma=0.99, n_actions=2, fc1_dims=128):
        self.gamma = gamma
        self.beta = beta
        self.n_actions = n_actions
        
        #self.max_step_size = max_step_size
        
        self.actorcritic = ActorCriticNetwork(beta, input_dims, n_actions, name='actor-critic')
        
        self.actor_losses = [] # policy losses
        self.critic_losses = [] # value losses
        
        
        self.value_buffer = []
        self.action_log_prob = []
        self.reward_buffer = []
        #self.results = [] # true result values
        
        self.eps = np.finfo(np.float32).eps.item()
    
    def init_losses(self):
        self.actor_losses = []
        self.critic_losses = []
    
    def choose_action(self, observation):
        state = T.Tensor([observation]).to(self.actorcritic.device)
        act_dist, state_val = self.actorcritic.forward(state)
        
        m = Categorical(T.Tensor(act_dist))
        action = m.sample()
        action_log_prob = m.log_prob(action)
        
        return action, action_log_prob, state_val
    
    def init_value_action_log_prob_buffer(self):
        self.action_log_prob = []
        self.value_buffer = []
    
    def save_buffer(self, action_log_prob, state_val):
        self.value_buffer.append(state_val)
        self.action_log_prob.append(action_log_prob)
    
    def init_reward_buffer(self):
        self.reward_buffer = []
        
    def save_reward(self, reward):
        self.reward_buffer.append(reward)
    
    def get_expected_returns(self):
        # the true expected returns
        returns = []
        R = 0
        
        for r in self.reward_buffer[::-1]:
            R = r + self.gamma * R
            returns.insert(0,R)
        
        returns = T.Tensor(returns)
        # regularization of returns
        returns = (returns - returns.mean()) / (returns.std() + self.eps)
        
        return returns
    
    def learn(self):
        self.init_losses()
        
        returns = self.get_expected_returns()
        for log_prob, value, R in zip(self.action_log_prob, self.value_buffer, returns):
            advantage = R - value
            
            self.actor_losses.append(-log_prob * advantage)
            criterion_for_val_func = nn.HuberLoss()
            self.critic_losses.append(criterion_for_val_func(value, R))

        loss = T.stack(self.actor_losses).sum() + T.stack(self.critic_losses).sum()

        T.autograd.set_detect_anomaly(True)

        self.actorcritic.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        self.actorcritic.optimizer.step()
        
        self.init_reward_buffer()
        self.init_value_action_log_prob_buffer()
    


In [129]:
class Worker:
    def __init__(self, beta, gamma, max_step_size=300, n_episodes=100000, seed=44, env='CartPole-v0', fc1_dims=128):
        self.env = gym.make(env)
        self.env.seed(seed)
        T.manual_seed(seed)
        self.eps = np.finfo(np.float32).eps.item()
        #print(self.env.observation_space.shape)
        #print(self.env.action_space.n)
        self.agent = Agent(beta, self.env.observation_space.shape, self.env, gamma, self.env.action_space.n, fc1_dims)
        self.n_episodes = n_episodes
        
        self.running_reward = 10
        self.max_steps = max_step_size
    
    
        
    def run_episode(self, init_state):
        self.ep_reward = 0
        
        self.agent.init_value_action_log_prob_buffer()
        self.agent.init_reward_buffer()
        
        state = init_state
        for t in range(self.max_steps):
            action, action_log_prob, state_val = self.agent.choose_action(state)
            self.agent.save_buffer(action_log_prob, state_val)
            
            action = action.cpu().detach().numpy()[0]
            state, reward, done, info = self.env.step(action)
            
            #print(self.env)
            if render:
                self.env.render()
            
            self.agent.save_reward(reward)
            self.ep_reward += reward
            if done:
                break
        # update cumulative reward
        self.running_reward = 0.05 * self.ep_reward + (1-0.05) * self.running_reward
        
        
    def train(self):
        self.running_reward = 10
        
        for i_episode in range(self.n_episodes):
            init_state = self.env.reset()
            
            self.run_episode(init_state)
            self.agent.learn()
            
            # log results
            if i_episode % log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, self.ep_reward, self.running_reward))
            
            if self.running_reward > self.env.spec.reward_threshold:
                print("Solved! Running reward is now {} and \
                       the last episode runs to {} time steps!".format(self.running_reward, t))
                break
            
        
            
        

In [132]:
worker = Worker(0.01, 0.99)

import time
import datetime
start_time = time.time()
worker.train()
consumed = time.time() - start_time

print(str(datetime.timedelta(seconds=consumed)))

Episode 0	Last reward: 14.00	Average reward: 10.20
Episode 1	Last reward: 11.00	Average reward: 10.24
Episode 2	Last reward: 19.00	Average reward: 10.68
Episode 3	Last reward: 24.00	Average reward: 11.34
Episode 4	Last reward: 27.00	Average reward: 12.13
Episode 5	Last reward: 35.00	Average reward: 13.27
Episode 6	Last reward: 36.00	Average reward: 14.41
Episode 7	Last reward: 16.00	Average reward: 14.49
Episode 8	Last reward: 21.00	Average reward: 14.81
Episode 9	Last reward: 17.00	Average reward: 14.92
Episode 10	Last reward: 10.00	Average reward: 14.68
Episode 11	Last reward: 41.00	Average reward: 15.99
Episode 12	Last reward: 15.00	Average reward: 15.94
Episode 13	Last reward: 10.00	Average reward: 15.65
Episode 14	Last reward: 12.00	Average reward: 15.46
Episode 15	Last reward: 9.00	Average reward: 15.14
Episode 16	Last reward: 23.00	Average reward: 15.53
Episode 17	Last reward: 14.00	Average reward: 15.46
Episode 18	Last reward: 17.00	Average reward: 15.53
Episode 19	Last reward:

Episode 155	Last reward: 200.00	Average reward: 146.79
Episode 156	Last reward: 191.00	Average reward: 149.00
Episode 157	Last reward: 119.00	Average reward: 147.50
Episode 158	Last reward: 200.00	Average reward: 150.13
Episode 159	Last reward: 143.00	Average reward: 149.77
Episode 160	Last reward: 26.00	Average reward: 143.58
Episode 161	Last reward: 155.00	Average reward: 144.15
Episode 162	Last reward: 120.00	Average reward: 142.95
Episode 163	Last reward: 107.00	Average reward: 141.15
Episode 164	Last reward: 200.00	Average reward: 144.09
Episode 165	Last reward: 134.00	Average reward: 143.59
Episode 166	Last reward: 153.00	Average reward: 144.06
Episode 167	Last reward: 200.00	Average reward: 146.85
Episode 168	Last reward: 200.00	Average reward: 149.51
Episode 169	Last reward: 188.00	Average reward: 151.44
Episode 170	Last reward: 200.00	Average reward: 153.86
Episode 171	Last reward: 164.00	Average reward: 154.37
Episode 172	Last reward: 166.00	Average reward: 154.95
Episode 173

Episode 305	Last reward: 200.00	Average reward: 165.85
Episode 306	Last reward: 200.00	Average reward: 167.56
Episode 307	Last reward: 186.00	Average reward: 168.48
Episode 308	Last reward: 200.00	Average reward: 170.06
Episode 309	Last reward: 167.00	Average reward: 169.90
Episode 310	Last reward: 200.00	Average reward: 171.41
Episode 311	Last reward: 178.00	Average reward: 171.74
Episode 312	Last reward: 187.00	Average reward: 172.50
Episode 313	Last reward: 166.00	Average reward: 172.18
Episode 314	Last reward: 154.00	Average reward: 171.27
Episode 315	Last reward: 130.00	Average reward: 169.20
Episode 316	Last reward: 136.00	Average reward: 167.54
Episode 317	Last reward: 151.00	Average reward: 166.72
Episode 318	Last reward: 170.00	Average reward: 166.88
Episode 319	Last reward: 157.00	Average reward: 166.39
Episode 320	Last reward: 153.00	Average reward: 165.72
Episode 321	Last reward: 151.00	Average reward: 164.98
Episode 322	Last reward: 181.00	Average reward: 165.78
Episode 32

NameError: name 'running_reward' is not defined

NameError: name 'beta' is not defined