In [1]:
import random
from collections import deque

import gym
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Run on the first CUDA GPU when PyTorch reports one, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
class Memory:
    """Bounded replay buffer of experience tuples.

    Each stored tuple is expected to have five fields, in this order:
    (state, action, next_state, reward, done).
    """

    def __init__(self, size=100000):
        # A deque with maxlen silently drops the oldest entry once it is full.
        self.buffer = deque(maxlen=size)

    def add(self, experience_tuple):
        """Append one experience tuple to the buffer."""
        self.buffer.append(experience_tuple)

    def sample(self, size=64):
        """Draw ``size`` distinct experiences and return them as batched tensors.

        Returns:
            (states, actions, next_states, rewards, dones), all moved to the
            module-level ``device``. Actions are int64; every other tensor is
            float32, with ``dones`` encoded numerically.
        """
        drawn = random.sample(self.buffer, size)
        # Transpose the list of tuples into one stacked array per field.
        columns = [np.stack(column, axis=0) for column in zip(*drawn)]
        states, actions, next_states, rewards, dones = columns

        def as_float(array):
            return torch.from_numpy(array).float().to(device)

        return (
            as_float(states),
            torch.from_numpy(actions).long().to(device),
            as_float(next_states),
            as_float(rewards),
            # Multiplying by 1 turns a boolean array into a numeric one.
            as_float(dones * 1),
        )

In [4]:
def compute_cov_out_size(dim_shape, padding, dilation, kernal_size, stride):
    """Return the output length of one spatial dimension of a conv/pool layer.

    Implements the size formula documented for ``nn.Conv2d`` / ``nn.MaxPool2d``:

        floor((dim + 2*padding - dilation*(kernel - 1) - 1) / stride) + 1

    Args:
        dim_shape: Input length along the dimension (height or width).
        padding: Padding added to both sides of the dimension.
        dilation: Spacing between kernel elements.
        kernal_size: Kernel length along the dimension.
        stride: Step of the sliding window.

    Returns:
        The output length as an ``int``.
    """
    span = dim_shape + 2 * padding - dilation * (kernal_size - 1) - 1
    # Floor division keeps integer inputs exact (no float round-trip) and
    # floors like the reference formula instead of truncating toward zero.
    return int(span // stride) + 1

In [5]:
class DQN(nn.Module):
    """Convolutional Q-network that also owns its Adam optimizer.

    Layout: conv1 -> LeakyReLU -> max-pool -> conv2 -> LeakyReLU -> max-pool
    -> flatten -> three LeakyReLU-activated linear layers -> a linear layer
    producing ``action_space`` values.

    Args:
        state_space: Indexed as (height, width, channels).
        action_space: Number of outputs of the final linear layer.
        learning_rate: Learning rate passed to Adam.
    """

    def __init__(self, state_space, action_space, learning_rate=.0005):
        super().__init__()
        # Note: this re-seeds torch's global RNG every time a DQN is built.
        self.seed = torch.manual_seed(0)
        self.action_space = action_space
        self.height, self.width = state_space[0], state_space[1]

        def spatial_out(in_shape, padding, dilation, kernal_size, stride):
            # The same 1-D size formula applies to height and width.
            return tuple(
                compute_cov_out_size(extent, padding, dilation, kernal_size, stride)
                for extent in (in_shape[0], in_shape[1])
            )

        # --- first convolution -------------------------------------------
        self.cov1_in_channels = state_space[2]
        self.cov1_out_channels = 16
        self.cov1_kernal_size = 3
        self.cov1_stride = 1
        self.cov1_dilation = 1
        self.cov1_padding = 0
        self.cov1_out_shape = spatial_out(
            (self.height, self.width),
            self.cov1_padding, self.cov1_dilation,
            self.cov1_kernal_size, self.cov1_stride)

        # --- max-pool (one configuration, applied after each convolution) --
        self.pool_kernal_size = 3
        self.pool_stride = 1
        self.pool_dilation = 1
        self.pool_padding = 0
        pool_config = (self.pool_padding, self.pool_dilation,
                       self.pool_kernal_size, self.pool_stride)
        self.pool_out_shape = spatial_out(self.cov1_out_shape, *pool_config)

        # --- second convolution ------------------------------------------
        self.cov2_in_channels = self.cov1_out_channels
        self.cov2_out_channels = 6
        self.cov2_kernal_size = 3
        self.cov2_stride = 1
        self.cov2_dilation = 1
        self.cov2_padding = 0
        self.cov2_out_shape = spatial_out(
            self.pool_out_shape,
            self.cov2_padding, self.cov2_dilation,
            self.cov2_kernal_size, self.cov2_stride)

        self.pool2_out_shape = spatial_out(self.cov2_out_shape, *pool_config)

        # Flattened size fed to the first linear layer: channels * H * W.
        pooled_height, pooled_width = self.pool2_out_shape
        self.linear1_in_features = self.cov2_out_channels * pooled_height * pooled_width

        # in_channels  = channels of the incoming image (3 for RGB)
        # out_channels = feature maps produced by the convolution
        # kernel_size  = height and width of the sliding window
        self.conv1 = nn.Conv2d(
            in_channels=self.cov1_in_channels,
            out_channels=self.cov1_out_channels,
            kernel_size=self.cov1_kernal_size,
            padding=self.cov1_padding,
            stride=self.cov1_stride,
            dilation=self.cov1_dilation,
        )
        self.pool = nn.MaxPool2d(
            kernel_size=self.pool_kernal_size,
            padding=self.pool_padding,
            stride=self.pool_stride,
            dilation=self.pool_dilation,
        )
        self.conv2 = nn.Conv2d(
            in_channels=self.cov2_in_channels,
            out_channels=self.cov2_out_channels,
            kernel_size=self.cov2_kernal_size,
            padding=self.cov2_padding,
            stride=self.cov2_stride,
            dilation=self.cov2_dilation,
        )

        self.relu = nn.LeakyReLU()
        self.fc1 = nn.Linear(self.linear1_in_features, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, self.action_space)

        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Map a batch of channel-first images to one value per action."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(self.relu(conv(x)))
        x = x.view(-1, self.linear1_in_features)
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.relu(hidden(x))
        return self.fc4(x)

    def optimize(self, expected, actual):
        """Take one Adam step on the MSE between ``expected`` and ``actual``."""
        self.optimizer.zero_grad()
        F.mse_loss(expected, actual).backward()
        self.optimizer.step()

In [6]:
class Agent:
    """DQN agent: epsilon-greedy policy, replay memory and a soft-updated target net.

    Args:
        env: Environment exposing ``action_space.n``, ``action_space.sample()``
            and ``observation_space.shape``.
    """

    def __init__(self, env):
        self.env = env
        self.action_space = env.action_space.n
        self.state_space = env.observation_space.shape

        self.memory = Memory()
        # Online network (trained) and target network (bootstrapping only).
        # DQN.__init__ re-seeds torch, so both start from the same weights.
        self.model = self.create_model()
        self.target = self.create_model()
        self.gamma = .5          # discount factor applied to the bootstrapped value

        self.epsilon_decay = .995  # multiplicative decay applied by decay_exploration()
        self.epsilon = 1.0         # current probability of taking a random action
        self.epsilon_min = .01     # floor for epsilon
        self.tau = 0.001           # interpolation weight of the soft target update

    def create_model(self):
        """Build a fresh DQN for this environment on the module-level device."""
        return DQN(self.state_space, self.action_space).to(device)

    def is_ready(self, threshold):
        """Return True once the replay memory holds more than ``threshold`` items."""
        return len(self.memory.buffer) > threshold

    def get_max_action(self, state):
        """Return the index of the highest-valued action for a single state."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()
        return action_values.max(1)[1].item()

    def decay_exploration(self):
        """Shrink epsilon by ``epsilon_decay``, never going below ``epsilon_min``."""
        self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)

    def get_action(self, state):
        """Epsilon-greedy choice: random with probability epsilon, else greedy."""
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()

        return self.get_max_action(state)

    def train(self, batch_size=64):
        """Run one TD-learning step on a sampled batch, then soft-update the target.

        The bootstrapped target is computed without autograd so that the
        backward pass neither traverses the target network nor accumulates
        unused gradients on its parameters.
        """
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.sample(size=batch_size)
        with torch.no_grad():
            Q_next = self.target(batch_next_state).max(1)[0]
            # (1 - done) removes the bootstrap term for terminal transitions.
            Q_actual = batch_reward + (self.gamma * Q_next * (1 - batch_done))
        Q_expected = self.model(batch_state).gather(dim=1, index=batch_action.unsqueeze(1)).squeeze(1)
        self.model.optimize(Q_expected, Q_actual)
        self.update_target()

    def remember(self, experience_tuple):
        """Store one (state, action, next_state, reward, done) tuple."""
        self.memory.add(experience_tuple)

    def update_target(self):
        """Soft update: target <- tau * model + (1 - tau) * target."""
        with torch.no_grad():
            for target_param, local_param in zip(self.target.parameters(), self.model.parameters()):
                target_param.copy_(self.tau * local_param + (1.0 - self.tau) * target_param)

In [None]:
def to_channels_first(frame):
    """Reorder a (height, width, channels) frame to (channels, height, width).

    A transpose moves the channel axis while keeping every pixel intact;
    ``reshape`` would only reinterpret the flat buffer and scramble the image.
    The result is copied into contiguous memory before it is stored or fed
    to the network.
    """
    return np.ascontiguousarray(np.transpose(frame, (2, 0, 1)))


env = gym.make("Breakout-v0")
env.seed(0)
# Seed the generators behind replay sampling and the epsilon-greedy coin flip.
random.seed(0)
np.random.seed(0)

agent = Agent(env=env)

total_episodes = 2000
max_steps = 1000
batch_size = 32

scores = []
last_100_scores = deque(maxlen=100)  # rolling window for the average score
average_rewards = []
epsilons = []

for episode in tqdm(range(1, total_episodes + 1)):
    state = to_channels_first(env.reset())
    score = 0

    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        next_state = to_channels_first(next_state)
        score += reward
        agent.remember((state, action, next_state, reward, done))

        if done:
            break

        # Learn on every 4th step, once enough experience has been collected.
        if step % 4 == 0 and agent.is_ready(batch_size):
            agent.train(batch_size=batch_size)

        state = next_state

    last_100_scores.append(score)
    scores.append(score)
    average_score = np.mean(last_100_scores)
    average_rewards.append(average_score)
    epsilons.append(agent.epsilon)

    agent.decay_exploration()

    if episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
    if average_score >= 200:
        print("Goal Reached")
        torch.save(agent.model.state_dict(), 'checkpoint.pth')
        break

  0%|          | 5/2000 [12:39<79:57:55, 144.30s/it]