In [4]:
import numpy as np
import torch
from torch import nn
import gym
import cv2
import copy
import matplotlib.pyplot as plt
from IPython import display
import seaborn as sns

%matplotlib inline

In [6]:
env = gym.make('Breakout-v0')
print('Action space:', env.action_space)
print('Observation space:', env.observation_space)

Action space: Discrete(4)
Observation space: Box(210, 160, 3)


In [7]:
N_FRAMES = 4

def filter_obs(obs, resize_shape=(84, 110), crop_shape=None):
    assert(type(obs) == np.ndarray), "The observation must be a numpy array!"
    assert(len(obs.shape) == 3), "The observation must be a 3D array!"

    obs = cv2.resize(obs, resize_shape, interpolation=cv2.INTER_LINEAR)
    obs = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
    obs = obs / 255.
    if crop_shape:
        crop_x_margin = (resize_shape[1] - crop_shape[1]) // 2
        crop_y_margin = (resize_shape[0] - crop_shape[0]) // 2
        
        x_start, x_end = crop_x_margin, resize_shape[1] - crop_x_margin
        y_start, y_end = crop_y_margin, resize_shape[0] - crop_y_margin
        
        obs = obs[x_start:x_end, y_start:y_end]
    
    return obs

def get_stacked_obs(obs, prev_frames):
    if not prev_frames:
        prev_frames = [obs] * (N_FRAMES - 1)
        
    prev_frames.append(obs)
    stacked_frames = np.stack(prev_frames)
    prev_frames = prev_frames[-(N_FRAMES-1):]
    
    return stacked_frames, prev_frames

def preprocess_obs(obs, prev_frames):
    filtered_obs = filter_obs(obs)
    stacked_obs, prev_frames = get_stacked_obs(filtered_obs, prev_frames)
    return stacked_obs, prev_frames

def format_reward(reward):
    if reward > 0:
        return 1
    elif reward < 0:
        return -1
    return 0

In [8]:
# Original Paper
class DQN(nn.Module):
    def __init__(self, n_acts):
        super(DQN, self).__init__()
        
        self.layer1 = nn.Sequential(
            nn.Conv2d(N_FRAMES, 32, kernel_size=8, stride=4, padding=0),
            nn.ReLU())
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=2, padding=0),
            nn.ReLU())
        self.layer3 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=0),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Linear(64 * 12 * 9, 512),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Linear(512, n_acts))
        
    def forward(self, obs):
        q_values = self.layer1(obs)
        q_values = self.layer2(q_values)
        q_values = self.layer3(q_values)
        q_values = q_values.view(-1, 64 * 10 * 7)
        q_values = self.layer4(q_values)
        q_values = self.layer5(q_values)
        
        return q_values
    
    def train_on_batch(self, target_model, optimizer, obs, acts, rewards, next_obs, terminals, gamma=0.99):
        next_q_values = self.forward(next_obs)
        max_next_acts = torch.max(next_q_values, dim=1)[1].detach()
        
        target_next_q_values = target_model.forward(next_obs)
        max_next_q_values = target_next_q_values.gather(index=max_next_acts.view(-1, 1), dim=1)
        
        terminal_mods = 1 - terminals
        actual_qs = rewards + terminal_mods * gamma * max_next_q_values
            
        pred_qs = self.forward(obs)
        pred_qs = pred_qs.gather(index=acts.view(-1, 1), dim=1).view(-1)
        
        loss = torch.mean((actual_qs - pred_qs) ** 2)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [9]:
class ExperienceReplay():
    def __init__(self, capacity):
        self.capacity = capacity
        self.data = []
        
    def add_step(self, step_data):
        self.data.append(step_data)
        if len(self.data) > self.capacity:
            self.data = self.data[-self.capacity:]
            
    def sample(self, n):
        n = min(n, len(self.data))
        indices = np.random.choice(range(len(self.data)), n, replace=False)
        samples = np.asarray(self.data)[indices]
        
        state_data = torch.tensor(np.stack(samples[:, 0])).float().cuda()
        act_data = torch.tensor(np.stack(samples[:, 1])).long().cuda()
        reward_data = torch.tensor(np.stack(samples[:, 2])).float().cuda()
        next_state_data = torch.tensor(np.stack(samples[:, 3])).float().cuda()
        terminal_data = torch.tensor(np.stack(samples[:, 4])).float().cuda()
        
        return state_data, act_data, reward_data, next_state_data, terminal_data

# DQN Algorithm

<img src='imgs/dqn_algorithm.png' width=80% align='left' />

In [10]:
n_episodes = 100000
max_steps = 1000
er_capacity = 150000 # 1m in paper
n_acts = env.action_space.n # 0: no-op, 1: start game, 2: right, 3: left
train_batch_size = 32
learning_rate = 2.5e-4
update_freq = 100
frame_skip = 3
n_anneal_steps = 1e5 # Anneal over 1m steps in paper
target_update_delay = 10000 # How many episodes in between target model update
epsilon = lambda step: np.clip(1 - 0.9 * (step/n_anneal_steps), 0.1, 1) # Anneal over 1m steps in paper, 100k here

In [11]:
er = ExperienceReplay(er_capacity)
model = DQN(n_acts=env.action_space.n).cuda()
target_model = copy.deepcopy(model)
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate, eps=1e-6)
all_rewards = []
global_step = 0

for episode in range(n_episodes):
    prev_frames = []
    obs, prev_frames = preprocess_obs(env.reset(), prev_frames)
    
    episode_reward = 0
    step = 0
    while step < max_steps:

        ### Enact a step ###
        
        if np.random.rand() < epsilon(global_step):
            act = np.random.choice(range(n_acts))
        else:
            obs_tensor = torch.tensor([obs]).float().cuda()
            q_values = model(obs_tensor)[0]
            q_values = q_values.cpu().detach().numpy()
            act = np.argmax(q_values)
        
        cumulative_reward = 0
        for _ in range(frame_skip):
            next_obs, reward, done, _ = env.step(act)
            cumulative_reward += reward
            if done or step >= max_steps:
                break
        episode_reward += cumulative_reward
        reward = format_reward(cumulative_reward)

        next_obs, prev_frames = preprocess_obs(next_obs, prev_frames)
        er.add_step([obs, act, reward, next_obs, int(done)])
        obs = next_obs
        
        ### Display a sample episode ###
        
#         if episode % update_freq == 0:
#             img.set_data(env.render(mode='rgb_array'))
#             display.display(plt.gcf())
#             display.clear_output(wait=True)
        
        ### Train on a minibatch ###
            
        obs_data, act_data, reward_data, next_obs_data, terminal_data = er.sample(train_batch_size)
        model.train_on_batch(target_model, optimizer, obs_data, act_data,
                             reward_data, next_obs_data, terminal_data)
        
        ### Update target network ###
        
        if global_step and global_step % target_update_delay == 0:
            target_model = copy.deepcopy(model)
        
        ### Finish the step ###
        
        step += 1
        global_step += 1
        
        if done:
            break
            swapon -a
    all_rewards.append(episode_reward)
    
    if episode % update_freq == 0:
        print('Episode #{} | Step #{} | Epsilon {:.2f} | Avg. Reward {:.2f}'.format(
            episode, global_step, epsilon(global_step), np.mean(all_rewards[-update_freq:])))

RuntimeError: shape '[-1, 6912]' is invalid for input of size 4480

In [None]:
# torch.save(model, 'models/dqn_attempt_3.pt')

In [None]:
smoothed_rewards = []
smooth_window = 100
for i in range(smooth_window, len(all_rewards)-smooth_window):
    smoothed_rewards.append(np.mean(all_rewards[i-smooth_window:i+smooth_window]))
    
plt.plot(range(len(smoothed_rewards)), smoothed_rewards)