In [1]:
import os
import json
import gym
from gym import wrappers
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count 
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
torch.manual_seed(0)
np.random.seed(12345)

In [3]:
%matplotlib inline

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
env = wrappers.Monitor(gym.make("CartPole-v0"), "./video", force=True)

In [6]:
env.observation_space.high

array([4.8000002e+00, 3.4028235e+38, 4.1887903e-01, 3.4028235e+38],
      dtype=float32)

In [7]:
env.observation_space.low

array([-4.8000002e+00, -3.4028235e+38, -4.1887903e-01, -3.4028235e+38],
      dtype=float32)

In [8]:
env.action_space

Discrete(2)

## Approach

We are trying to learn the state-action value function, $Q(s, a)$ using **vanilla Q-learning**.

Under Bellman's optimality condition, if we were functioning under some optimal policy, $\pi^*$, the regardless of which state we are in, our actions, $a=\pi^*(s)$, would lead us to the maximum expected value. This means that the equation below holds.

$$
Q_{\pi^*} (s, a) = r + \gamma \underset{a}\max Q_{\pi^*}(s^\prime, \pi^*(s^\prime))
$$

The above can thus be an update equation. Suppose our policy is now to choose the action that maximises our expected value given our current state (i.e. $\pi(s) = \underset{a}{\text{argmax}} Q(s, a)$), our aim would be to minimise the difference between the left and right side of the above equation.

Our cost metric, also known as temporal difference,  is thus

$$
\delta = Q(s, a) - \left(r + \gamma \underset{a}\max Q(s^\prime, \pi^*(s^\prime))\right)
$$

Our overall simulation method will be that of **continuous every visit Monte Carlo**.

In [9]:
class QNet(nn.Module):
    
    def __init__(self, n_inputs, n_outputs,
                 hidden_layer_sizes=[16, 32, 16],
                 max_memory=10000):
        
        super().__init__()
        
        self.n_inputs = n_inputs
        self.n_outputs = n_outputs
        
        layer_sizes = [n_inputs] + hidden_layer_sizes + [n_outputs]
        
        self.layers = []
        
        for s_in, s_out in zip(layer_sizes[:-1], layer_sizes[1:]):
            
            self.layers.append(nn.Linear(s_in, s_out))
            
        self.params = []
        
        for l in self.layers:
            
            self.params.extend(l.parameters())
            
        self.params = nn.ParameterList(self.params)
        
        return
    
    def forward(self, x):
        
        for l in self.layers:
            
            x = F.relu(l(x))
            
        return x
    
    
class ReplayMemory(object):
    
    def __init__(self, max_memory, batch_size):
        
        self.memory = []
        self.max_memory = max_memory
        self.batch_size = batch_size
        
        return
    
    def remember(self, cur_state, next_state, action, reward, is_done):
        
        self.memory.append((cur_state, next_state, action, reward, is_done))
            
        if len(self.memory) >= self.max_memory:
            
            self.memory = self.memory[-self.max_memory:]
            
        return
    
    def sample(self):
        
        return random.sample(self.memory, self.batch_size)
    
    def __len__(self):
        
        return len(self.memory)
    
def to_torch(x, device):
    
    return torch.tensor(x, dtype=torch.float32, device=device)

In [10]:
n_inputs = env.observation_space.shape[0]
n_outputs = env.action_space.n

episode_length = 200
max_episodes = 5000

memory_capacity = 5000

batch_size = 64

episode_score_history = []

epsilon_max = 0.99
epsilon_decay = 0.9999
epsilon_min = 0.05

gamma = 0.99
best_loss = np.inf
best_model = None
min_cost_improvement = 1e-3

report_every = 50
score_threshold = 195

output_dir = "E01-model"

if not os.path.isdir(output_dir):
    
    os.makedirs(output_dir)

In [11]:
net = QNet(n_inputs, n_outputs, hidden_layer_sizes=[8, 8]).to(device)

In [12]:
memory = ReplayMemory(memory_capacity, batch_size)

In [13]:
epsilon = epsilon_max
optimizer = optim.RMSprop(net.parameters())
criteria = nn.MSELoss()

In [14]:
for e in range(max_episodes):
    
    cur_state = torch.from_numpy(env.reset()).type(torch.float32).to(device)
    
    episode_score = 0
    
    for i in count():
        
        env.render()
        
        state_value = net(cur_state)
        
        if np.random.rand() <= epsilon:
            
            action = np.random.randint(n_outputs)
            
        else:
            
            with torch.no_grad():
            
                action = state_value.argmax().item()
            
        epsilon = np.max([epsilon * epsilon_decay, epsilon_min])
        
        next_state, reward, is_done, info = env.step(action)
        
        episode_score += reward
        
        memory.remember(cur_state,
                            to_torch(next_state, device),
                            to_torch([action], device),
                            to_torch([reward], device),
                            is_done)
        
        if is_done:
            
            episode_score_history.append(episode_score)
            
            break
              
        cur_state = to_torch(next_state, device)
            
        if len(memory) >= 3 * batch_size:
            
            cur_states_, next_states_, actions_, rewards_, is_done_ = zip(*memory.sample())
            
            action_batch = torch.cat(actions_).type(torch.long).view(-1, 1)
            
            state_value_batch = net(torch.cat(cur_states_).view(-1, n_inputs)).gather(1, action_batch)
            
            reward_batch = torch.cat(rewards_).view(-1, 1)
            
            is_done_batch = torch.tensor(is_done_, dtype=torch.bool, device=device)
            
            with torch.no_grad():
                
                next_state_value_batch = net(torch.cat(next_states_).view(-1, n_inputs)).max(1, keepdim=True).values.detach()
                
                next_state_value_batch[is_done_batch] = 0
            
            loss = criteria(state_value_batch,
                            reward_batch + (gamma * next_state_value_batch))
            
            optimizer.zero_grad()
            
            loss.backward()
            
            optimizer.step()
            
    if np.mean(episode_score_history[-100:]) >= score_threshold:
        
        print("Game solved!")
        
        break
    
    if (e + 1) % report_every == 0:
        
        with torch.no_grad():
            
            print("Episode: {} done. Loss: {}. Ave. episode score (last {}): {}. Last epsilon: {}"
                  .format(e + 1,
                          np.round(loss.item(), 4),
                          report_every,
                          np.mean(episode_score_history[-100:]),
                          np.round(epsilon, 4)))



Episode: 50 done. Loss: 1.0. Ave. episode score (last 50): 21.48. Last epsilon: 0.8892
Episode: 100 done. Loss: 124028.5078. Ave. episode score (last 50): 20.11. Last epsilon: 0.8096
Episode: 150 done. Loss: 2768.8. Ave. episode score (last 50): 18.25. Last epsilon: 0.7408
Episode: 200 done. Loss: 1.4518. Ave. episode score (last 50): 20.85. Last epsilon: 0.6573
Episode: 250 done. Loss: 63.6109. Ave. episode score (last 50): 25.61. Last epsilon: 0.5735
Episode: 300 done. Loss: 30.5557. Ave. episode score (last 50): 21.45. Last epsilon: 0.5304
Episode: 350 done. Loss: 34.9268. Ave. episode score (last 50): 14.8. Last epsilon: 0.4946
Episode: 400 done. Loss: 39.7656. Ave. episode score (last 50): 13.38. Last epsilon: 0.4639
Episode: 450 done. Loss: 40.8212. Ave. episode score (last 50): 13.24. Last epsilon: 0.4332
Episode: 500 done. Loss: 31.9179. Ave. episode score (last 50): 13.03. Last epsilon: 0.4073
Episode: 550 done. Loss: 22.3551. Ave. episode score (last 50): 12.23. Last epsilon:

KeyboardInterrupt: 

In [15]:
env.close()

In [None]:
import pandas as pd

In [None]:
scores = pd.Series(episode_score_history)

In [None]:
scores.plot()
scores.rolling(100).mean().plot()

In [None]:
scores.rolling(100).mean().max()

In [None]:
torch.save(net.state_dict(), os.path.join(output_dir, "best_model.pt"))

In [None]:
net.eval()

In [None]:
n_eval_episodes = 100
eval_scores = []

In [None]:
for e in range(n_eval_episodes):
    
    cur_state = to_torch(env.reset(), device)

    for i in range(200):

        env.render()

        next_state, reward, is_done, info = env.step(net(cur_state).argmax().item())

        cur_state = to_torch(next_state, device)

        if is_done:

            break
    
    if e % 10 == 9:
        
        print("Evaluated {} episodes".format(e))
            
    eval_scores.append(i)

In [None]:
env.close()

In [None]:
np.mean(eval_scores)