In [1]:
import random
from collections import deque

import gym
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class Memory:
    def __init__(self, size=100000):
        self.buffer = deque(maxlen=size)

    def add(self, experience_tuple):
        self.buffer.append(experience_tuple)

    def sample(self, size=64):
        sample = random.sample(self.buffer, size)
        
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = map(lambda x: np.stack(x, axis=0), zip(*sample))
        
        batch_state = torch.from_numpy(batch_state).float().to(device)
        batch_next_state = torch.from_numpy(batch_next_state).float().to(device)
        
        batch_action = torch.from_numpy(batch_action).long().to(device)
        batch_reward = torch.from_numpy(batch_reward).float().to(device)
        batch_done = torch.from_numpy(batch_done * 1).float().to(device)

        return batch_state, batch_action, batch_next_state, batch_reward, batch_done

In [4]:
class DQN(nn.Module):
    def __init__(self, state_space, action_space, learning_rate=.0005):
        super(DQN, self).__init__()
        self.seed = torch.manual_seed(0)
        self.action_space = action_space
        
        # in_channels = color channels => rgb = 3
        # out_channels = channels produced by the convolution
        # kernal_size = height and width of convolution window
        self.conv1 = torch.nn.Conv2d(in_channels=3, out_channels=6, kernel_size=3, stride=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=3, stride=1)
        
        # out_channels * kernel_size * kernel_size, 64 out nodes of the previous node
        self.fc1 = nn.Linear(64, 64)
#         self.fc2 = nn.Linear(64, 64)
#         self.action_space
        self.fc3 = nn.LogSoftmax(dim=1)
        
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(31008, -1)
        x = F.relu(self.fc1(x))
#         x = F.relu(self.fc2(x))
        return self.fc3(x)# log_softmax(x, dim=1)

    def backward(self, expected, actual):
        loss = F.mse_loss(expected, actual)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

In [5]:
class Agent:
    def __init__(self, env):
        self.env = env
        self.action_space = env.action_space.n
        self.state_space = env.observation_space.shape[0]

        self.memory = Memory()
        self.model = self.create_model()
        self.target = self.create_model()
        self.gamma = .99

        self.epsilon_decay = .995
        self.epsilon = 1
        self.epsilon_min = .01
        self.tau = 0.001

    def create_model(self):
        return DQN(self.state_space, self.action_space).to(device)

    def is_ready(self, threshold):
        return len(self.memory.buffer) > threshold

    def get_max_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.model.eval()
        with torch.no_grad():
            action_values = self.model(state)
        self.model.train()
        return np.argmax(action_values.cpu().data.numpy())

    def decay_exploration(self):
        self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)
    
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return self.env.action_space.sample()
        
        return self.get_max_action(state)

    def train(self, batch_size=64):
        batch_state, batch_action, batch_next_state, batch_reward, batch_done = self.memory.sample(size=batch_size)
        Q_next = self.target(batch_next_state).detach().max(1)[0].unsqueeze(1)
        Q_actual = batch_reward + (self.gamma * Q_next * (1 - batch_done))
        Q_expected = self.model(batch_state) #.gather(dim=1, index=batch_action)

        self.model.backward(Q_expected, Q_actual)
        self.update_target()

    def remember(self, experience_tuple):
        self.memory.add(experience_tuple)

    def update_target(self):
        for target_param, local_param in zip(self.target.parameters(), self.model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

In [6]:
env = gym.make("Breakout-v0")
env.seed(0)

agent = Agent(env=env)

total_episodes = 2000
max_steps = 1000
batch_size = 64

scores = []
last_100_scores = deque(maxlen=100)
average_rewards = []
epsilons = []

for episode in tqdm(range(1, total_episodes + 1)):
    state = env.reset()
    state = state.reshape(3, 210, 160)
    score = 0
    
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        
        next_state = next_state.reshape(3, 210, 160)
        agent.remember((state, action, next_state, reward, done))
        score += reward

        if step % 4 == 0:
            if agent.is_ready(batch_size):
                agent.train(batch_size=batch_size)

        state = next_state
        if done:
            break

    last_100_scores.append(score)
    scores.append(score)
    average_rewards.append(np.mean(last_100_scores))
    epsilons.append(agent.epsilon)

    agent.decay_exploration()

    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(last_100_scores)), end="")
    if episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(last_100_scores)))
    if np.mean(last_100_scores) >= 200:
        print("Goal Reached")
        torch.save(agent.model.state_dict(), 'checkpoint.pth')
        break

  0%|          | 1/2000 [00:16<9:16:43, 16.71s/it]

Episode 1	Average Score: 0.00

RuntimeError: size mismatch, m1: [31008 x 1], m2: [64 x 64] at /Users/distiller/project/conda/conda-bld/pytorch_1579022036889/work/aten/src/TH/generic/THTensorMath.cpp:136