In [1]:
import gym
import os
import sys
import numpy as np
from itertools import count

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.distributions import Categorical
from torch.utils.tensorboard import SummaryWriter

# Choose the tensor constructors once, depending on whether CUDA is available.
# Assign `use_cuda = False` right after the next line to force CPU execution.
use_cuda = torch.cuda.is_available()
print("use_cuda : ", use_cuda)
FloatTensor, LongTensor, ByteTensor = (
    (torch.cuda.FloatTensor, torch.cuda.LongTensor, torch.cuda.ByteTensor)
    if use_cuda
    else (torch.FloatTensor, torch.LongTensor, torch.ByteTensor)
)
# Default tensor type used by the rest of the notebook.
Tensor = FloatTensor

use_cuda :  False


In [2]:
class Actor(nn.Module):
    """Policy network: maps a state vector to log-probabilities over actions.

    Three ReLU hidden layers of width 16 followed by a linear layer whose
    output is normalised with ``log_softmax`` along the last dimension.
    """

    def __init__(self, state_size, num_actions):
        """Create the four linear layers.

        Args:
            state_size: number of input features of ``fc1``.
            num_actions: number of output units of ``fc4``.
        """
        super().__init__()
        width = 16
        # Attribute names and creation order are kept stable: they determine
        # the state_dict keys and the seeded parameter initialisation.
        self.fc1 = nn.Linear(state_size, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, width)
        self.fc4 = nn.Linear(width, num_actions)

    def forward(self, x):
        """Return log-probabilities (log_softmax over the last dimension)."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        logits = self.fc4(hidden)
        return F.log_softmax(logits, dim=-1)

In [3]:
class Critic(nn.Module):
    """State-value network: maps a state vector to a single scalar output.

    Layout: Linear(state_size, 16) -> ReLU -> Dropout(0.5) -> Linear(16, 8)
    -> ReLU -> Linear(8, 1). The dropout layer is only active while the
    module is in training mode.
    """

    def __init__(self, state_size):
        """Create the layers.

        Args:
            state_size: number of input features of ``fc1``.
        """
        super().__init__()
        # Attribute names (fc1, dp1, fc2, fc4) define the state_dict keys,
        # so they are kept as-is even though there is no fc3.
        self.fc1 = nn.Linear(state_size, 16)
        self.dp1 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(16, 8)
        self.fc4 = nn.Linear(8, 1)

    def forward(self, x):
        """Return the value estimate; the last dimension has size 1."""
        first = self.dp1(F.relu(self.fc1(x)))
        second = F.relu(self.fc2(first))
        return self.fc4(second)

In [4]:
class A2C(object):
    """Advantage actor-critic agent trained on whole episodes.

    The agent owns an ``Actor`` (policy) and a ``Critic`` (state value), each
    with its own Adam optimizer. After every episode the critic is regressed
    towards N-step bootstrapped returns and the actor is updated with the
    advantage ``return - value`` as the policy-gradient weight.
    """

    # Raw environment rewards are multiplied by this factor before they enter
    # the N-step return.
    REWARD_SCALE = 1e-2

    def __init__(self, env, gamma, lr_actor, lr_critic, num_episodes, test_episodes, seed, N_steps, test_freq,
                 writer=None, agent_tag=None):
        """Build the networks, optimizers and bookkeeping lists.

        Args:
            env: environment exposing ``observation_space.shape``,
                ``action_space.n``, ``reset()``, ``step(action)`` and ``render()``.
            gamma: discount factor.
            lr_actor: Adam learning rate for the actor.
            lr_critic: Adam learning rate for the critic.
            num_episodes: stored on the instance; ``train`` takes its own count.
            test_episodes: number of episodes per evaluation run.
            seed: accepted for interface compatibility; not used by this class.
            N_steps: number of reward steps summed before bootstrapping.
            test_freq: run an evaluation every ``test_freq`` training episodes.
            writer: optional object with ``add_scalar(tag, value, step)``. When
                omitted, ``train`` falls back to a module-level ``writer`` if
                one is defined.
            agent_tag: optional prefix for logged scalar names. When omitted,
                ``train`` falls back to a module-level ``agent_tag``.
        """
        super(A2C, self).__init__()
        self.env = env
        self.actor = Actor(env.observation_space.shape[0], env.action_space.n)
        self.critic = Critic(env.observation_space.shape[0])
        if use_cuda:
            self.actor.cuda()
            self.critic.cuda()
        self.optimizer_actor = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.optimizer_critic = optim.Adam(self.critic.parameters(), lr=lr_critic)
        self.N_steps = N_steps
        self.num_episodes = num_episodes
        self.test_episodes = test_episodes
        self.gamma = gamma
        self.test_freq = test_freq
        self.writer = writer
        self.agent_tag = agent_tag
        self.train_rewards = []
        self.test_rewards = []
        self.train_steps = []
        self.test_steps = []
        self.losses_actor = []
        self.losses_critic = []

    def select_action(self, state):
        """Sample an action for ``state`` from the current policy.

        Returns:
            A tuple ``(action, log_prob, value)``: the sampled action as a
            NumPy array, the actor's log-probability of that action, and the
            critic's output for the state.
        """
        # Put the observation on whatever device the actor's weights live on.
        device = next(self.actor.parameters()).device
        state = torch.as_tensor(state, dtype=torch.float32, device=device)
        log_probs = self.actor(state)
        value = self.critic(state)
        # The actor already emits log-probabilities; passing them as logits
        # avoids an exp() round-trip.
        action = Categorical(logits=log_probs).sample()
        return action.cpu().numpy(), log_probs[action], value

    def play_episode(self, visualise=False):
        """Run one episode until the environment reports termination.

        Args:
            visualise: when True, render every step and print a summary.

        Returns:
            ``(steps, rewards, log_probs, values)`` where ``rewards`` is a list
            and ``log_probs`` / ``values`` are the per-step tensors stacked
            along a new first dimension.
        """
        state = self.env.reset()
        steps = 0
        rewards = []
        log_probs = []
        values = []
        while True:
            if visualise:
                # Render the agent's own environment, not a notebook global.
                self.env.render()
            action, log_prob, value = self.select_action(state)
            state, reward, is_terminal, _ = self.env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            values.append(value)
            steps += 1
            if is_terminal:
                break
        if visualise:
            print("Reward: {}, Steps: {}".format(sum(rewards), steps))
        return steps, rewards, torch.stack(log_probs), torch.stack(values)

    def optimize(self, rewards, log_probs, values):
        """Perform one actor update and one critic update from an episode.

        For each step ``t`` the target is the N-step return
        ``sum_{k < min(N, T-t)} gamma**k * REWARD_SCALE * rewards[t+k]``
        plus ``gamma**N * values[t+N]`` when ``t + N < T`` (no bootstrap
        otherwise). The bootstrap value is detached from the graph.

        Args:
            rewards: sequence of per-step rewards, length T.
            log_probs: tensor with T elements (log-prob of each taken action).
            values: tensor with T elements (critic output per step); a
                trailing singleton dimension is accepted.
        """
        T = len(rewards)
        N = self.N_steps

        # Flatten to shape (T,). Without this, a (T, 1) `values` tensor
        # broadcasts against the (T,) returns into a (T, T) matrix and both
        # losses are computed over wrong pairs.
        values = values.reshape(-1)
        log_probs = log_probs.reshape(-1)

        # Plain-float copies so the return computation works for CUDA tensors.
        bootstrap = values.detach().cpu().numpy()
        scaled_rewards = np.asarray(rewards, dtype=np.float64) * self.REWARD_SCALE
        discounts = self.gamma ** np.arange(N)

        returns = np.zeros(T, dtype=np.float32)
        for t in range(T):
            horizon = min(N, T - t)
            n_step = float(np.dot(discounts[:horizon], scaled_rewards[t:t + horizon]))
            if t + N < T:
                n_step += self.gamma ** N * float(bootstrap[t + N])
            returns[t] = n_step
        R = torch.as_tensor(returns, dtype=values.dtype, device=values.device)

        # `values` is detached in the actor loss so the policy gradient does
        # not flow into the critic.
        loss_actor = ((R - values.detach()) * -log_probs).mean()
        loss_critic = ((R - values) ** 2).mean()

        self.optimizer_actor.zero_grad()
        self.optimizer_critic.zero_grad()
        loss_actor.backward()
        loss_critic.backward()
        self.optimizer_actor.step()
        self.optimizer_critic.step()

        self.losses_actor.append(loss_actor.item())
        self.losses_critic.append(loss_critic.item())

    def train(self, num_episodes):
        """Train for ``num_episodes`` episodes, logging and testing periodically.

        Every 10th episode prints progress and, if a writer is available,
        logs the episode reward. Every ``test_freq`` episodes ``test`` is run.
        """
        print("Going to be training for a total of {} episodes".format(num_episodes))
        # Prefer the explicitly injected writer/tag; otherwise keep the legacy
        # behaviour of picking up notebook-level globals when they exist.
        summary_writer = self.writer if self.writer is not None else globals().get("writer")
        tag = self.agent_tag if self.agent_tag is not None else globals().get("agent_tag", "a2c")
        for e in range(num_episodes):
            steps, rewards, log_probs, values = self.play_episode()
            episode_reward = sum(rewards)
            self.train_rewards.append(episode_reward)
            self.train_steps.append(steps)
            self.optimize(rewards, log_probs, values)

            if (e+1) % 10 == 0:
                if (e+1) % 100 == 0:
                    print("Episode: {}, reward: {}, steps: {}".format(e+1, episode_reward, steps))
                else:
                    print("{}%".format((e+1) % 100))
                if summary_writer is not None:
                    summary_writer.add_scalar(f'{tag}/Reward/Train', episode_reward, e+1)

            # Evaluate the current policy over `test_episodes` episodes.
            if (e+1) % self.test_freq == 0:
                print("-"*10 + " testing now " + "-"*10)
                self.test(self.test_episodes, e)

    def test(self, num_episodes, e_train):
        """Play ``num_episodes`` episodes without learning and report the mean reward.

        Episode rewards and step counts are appended to ``test_rewards`` and
        ``test_steps``. ``e_train`` is accepted but not used.
        """
        testing_rewards = []
        testing_steps = []
        # No parameters are updated here: skip autograd bookkeeping and switch
        # the networks to eval mode, restoring the previous mode afterwards.
        previous_modes = (self.actor.training, self.critic.training)
        self.actor.eval()
        self.critic.eval()
        try:
            with torch.no_grad():
                for _ in range(num_episodes):
                    steps, rewards, _log_probs, _values = self.play_episode()
                    episode_reward = sum(rewards)
                    self.test_rewards.append(episode_reward)
                    self.test_steps.append(steps)
                    testing_rewards.append(episode_reward)
                    testing_steps.append(steps)
        finally:
            self.actor.train(previous_modes[0])
            self.critic.train(previous_modes[1])

        mean_reward = np.mean(testing_rewards)
        print("Mean reward achieved : {} ".format(mean_reward))
        print("-"*50)
        if mean_reward >= 200:
            print("-"*10 + " Solved! " + "-"*10)
            print("Mean reward achieved : {} in {} steps".format(mean_reward, np.mean(testing_steps)))
            print("-"*50)

    def visualise(self):
        """Play a single rendered episode and print its reward and length."""
        self.play_episode(visualise=True)


In [5]:
# --- Hyper-parameters ---
gamma = 0.99                   # discount factor
lr_actor = lr_critic = 5e-4    # Adam learning rates (actor / critic)
num_episodes = 1500            # training episodes
test_episodes = 100            # episodes per evaluation run
N_steps = 100                  # reward steps summed before bootstrapping
test_freq = 500                # evaluate every `test_freq` training episodes
seed = 123

# --- Logging ---
agent_tag = 'lunar-lander-a2c'
writer = SummaryWriter()

# --- Environment, seeded before the networks are created ---
env = gym.make("LunarLander-v2")
env.seed(seed)
torch.manual_seed(seed)

# --- A2C agent ---
agent = A2C(
    env=env,
    gamma=gamma,
    lr_actor=lr_actor,
    lr_critic=lr_critic,
    num_episodes=num_episodes,
    test_episodes=test_episodes,
    seed=seed,
    N_steps=N_steps,
    test_freq=test_freq,
)

In [12]:
# Restore previously saved actor / critic weights into the agent.
# map_location lets a checkpoint written on a CUDA machine load on a CPU-only
# host (and vice versa).
# NOTE: torch.load unpickles arbitrary objects; only load checkpoint files
# you trust.
# NOTE(review): these checkpoints also hold a pickled module under 'model',
# so a `weights_only=True` load would presumably reject them; confirm before
# changing the load mode.
_load_device = torch.device("cuda" if use_cuda else "cpu")

buffer_actor = torch.load(f'agents/trained-agent-actor-{agent_tag}.pt', map_location=_load_device)
agent.actor.load_state_dict(buffer_actor['state_dict'])

buffer_critic = torch.load(f'agents/trained-agent-critic-{agent_tag}.pt', map_location=_load_device)
agent.critic.load_state_dict(buffer_critic['state_dict'])
    

<All keys matched successfully>

In [11]:
# Training is long and often interrupted by hand; the finally block makes sure
# the environment is released and logged scalars reach disk in that case too.
try:
    agent.train(num_episodes)
finally:
    env.close()
    writer.flush()

Going to be training for a total of 1500 episodes
10%
20%
30%
40%
50%
60%
70%
80%
90%
Episode: 100, reward: -3.7339435433399615, steps: 1000
10%
20%
30%
40%
50%
60%
70%
80%
90%
Episode: 200, reward: 6.335143317819209, steps: 1000
10%
20%
30%
40%


KeyboardInterrupt: 

In [10]:
# Play one rendered episode; close the environment even if rendering or the
# episode raises.
try:
    agent.visualise()
finally:
    env.close()

Reward: 24.524316500318943, Steps: 1000


In [59]:
# Persist the actor and the critic, one file per network.
# Each file keeps the original layout: the module under 'model' and its
# weights under 'state_dict' (the load cell only reads 'state_dict').
os.makedirs('agents', exist_ok=True)  # torch.save does not create directories
for _role, _network in (('actor', agent.actor), ('critic', agent.critic)):
    checkpoint = {
        'model': _network,
        'state_dict': _network.state_dict()
    }
    torch.save(checkpoint, f'agents/trained-agent-{_role}-{agent_tag}.pt')