In [1]:
%matplotlib notebook

import torch
import torch.nn.functional as F
from torch import nn
from torch import optim
from torch.distributions import Categorical

import numpy as np
import cv2
import matplotlib.pyplot as plt

from statistics import stdev, mean

import gym
import gym_2048

In [2]:
def body_layer(dim1, dim2):
    return nn.Sequential(
        nn.Linear(dim1, dim2),
        nn.ReLU(),
    )

In [3]:
def network_body(obs_dim, hidden_dim=64, n_layers=2):
    if isinstance(hidden_dim, int):
        layer_dims = [(obs_dim, hidden_dim)] + [(hidden_dim, hidden_dim)]*(n_layers - 1)
    else:
        dims = [obs_dim] + hidden_dims
        layer_dims = [(dim[i], dim[i+1]) for i in range(len(dims)-1)]
    return [body_layer(dim1, dim2) for dim1, dim2 in layer_dims]

In [4]:
class Network(nn.Module):
    def __init__(self, obs_dim, n_actions, hidden_dim=64, n_layers=2):
        super(Network, self).__init__()
        self.body = nn.Sequential(*network_body(obs_dim, hidden_dim, n_layers))

        self.actor = nn.Linear(hidden_dim, n_actions)
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = self.body(x)
        logits = self.actor(x)
        state_value = self.critic(x)
        return F.softmax(logits, dim=-1), state_value

In [5]:
def plot(avg_returns, stds, fig, ax):
    episode = np.arange(len(avg_returns))
    avg_returns = np.array(avg_returns)
    stds = np.array(stds)
    
    ax.clear()
    ax.set_xlabel('Episode')
    ax.set_ylabel('Returns')
    # plot returns
    ax.plot(episode, avg_returns, label='Avg Returns')
    # plot standard deviations
    ax.fill_between(episode, avg_returns-stds, avg_returns+stds, 
                    facecolor='blue', alpha=0.1)   
    ax.set_title('Returns')
    ax.legend()
    fig.tight_layout()        
    fig.canvas.draw()
    plt.show()    

In [6]:
def ac_update(model, state_value, log_prob, reward, std, gamma=0.9, lam=0.95):
    reward = torch.tensor(reward)
    advantage = reward - state_value.item()
    # critic
    critic_loss = F.smooth_l1_loss(state_value, torch.tensor([reward]).unsqueeze(0).type(torch.FloatTensor))
    # policy gradient
    policy_loss = -log_prob * advantage
    
    variance = torch.tensor([std*0.5]).unsqueeze(0).type(torch.FloatTensor)
    
    loss = torch.mul((policy_loss + critic_loss), variance)

    model['optim'].zero_grad()
    loss.backward()
    model['optim'].step()
    model['scheduler'].step()

In [7]:
def select_action(action_probs):
    # multinomial over actions
    m = Categorical(action_probs)
    action = m.sample()
    return action.item(), m.log_prob(action) 

In [8]:
def train(env, model, n_episodes=400, max_timesteps=2000):
    fig, ax = plt.subplots(1,1, figsize=(9,4))
    
    avg_returns = []
    returns = []
    stds = [0]
    
    max_values = []
    n_zeros = []

    for episode in range(1, n_episodes+1):
        state_values, rewards, log_probs = [], [], []
        mv, zeros = [], []

        obs = env.reset()

        for ts in range(max_timesteps):
            obs = np.ravel(obs)    # flatten state for network
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            
            mv.append(torch.max(obs).item())
            zeros.append(list(obs[obs==0].size())[0])
            
            action_probs, state_value = model['net'](obs)
            action, log_prob = select_action(action_probs)
            obs, reward, done, _ = env.step(action)

            state_values.append(state_value)
            rewards.append(reward)
            log_probs.append(log_prob)
            
            ac_update(model, state_value, log_prob, reward, stds[-1])

            if done:
                break
                
        returns.append(sum(rewards))
        avg_returns.append(mean(returns))
        if episode > 1:
            stds.append(stdev(returns))
            
        max_values.append(max(mv))
        n_zeros.append(mean(zeros))
        
        if episode % 10 == 0:
            print('Episode: {} - Episode Return: {} - Average Returns: {} - Max Value: {} - Average Zeros: {:.2f}'
                  .format(
                      episode, sum(rewards), mean(returns), max_values[-1], mean(n_zeros)
            ))

        plot(avg_returns, stds, fig, ax)
        
#         if episode % 25 == 0:
# #             max_timesteps = 5000
#             for param_group in model['optim'].param_groups:
#                 param_group['lr'] = 1e-3
#         elif episode > 25:
#             for param_group in model['optim'].param_groups:
#                 param_group['lr'] *= 0.99

In [9]:
def set_seed(env, seed=0):
    env.seed(0)
    torch.manual_seed(0)
    np.random.seed(0)

In [10]:
def main(environment='2048-v0', n_episodes=100, max_timesteps=2000):
    env = gym.make('CartPole-v0')
    
    set_seed(env, 0)

    obs_shape = env.observation_space.shape
    obs_dim = obs_shape[0]# * obs_shape[1]
    n_actions = env.action_space.n

    model = {}

    model['net'] = Network(obs_dim, n_actions)
    model['optim'] = optim.Adam(model['net'].parameters(), lr=1e-2)

    T_max = 400#n_episodes
    eta_min = 1e-4
    model['scheduler'] = optim.lr_scheduler.CosineAnnealingLR(model['optim'], T_max, eta_min)

    train(env, model, n_episodes, max_timesteps)

In [11]:
if __name__ == '__main__':
    main()

<IPython.core.display.Javascript object>

Episode: 10 - Episode Return: 9.0 - Average Returns: 10.4 - Max Value: 2.409444570541382 - Average Zeros: 0.00
Episode: 20 - Episode Return: 9.0 - Average Returns: 10.0 - Max Value: 2.1896564960479736 - Average Zeros: 0.00
Episode: 30 - Episode Return: 9.0 - Average Returns: 10.033333333333333 - Max Value: 2.5400490760803223 - Average Zeros: 0.00
Episode: 40 - Episode Return: 9.0 - Average Returns: 9.95 - Max Value: 2.439547300338745 - Average Zeros: 0.00
Episode: 50 - Episode Return: 8.0 - Average Returns: 9.86 - Max Value: 2.2531774044036865 - Average Zeros: 0.00
Episode: 60 - Episode Return: 10.0 - Average Returns: 9.816666666666666 - Max Value: 2.1690516471862793 - Average Zeros: 0.00


KeyboardInterrupt: 