In [1]:
import os
import gym
import time
import copy
import random
import numpy as np

import torch
import torchvision
import torch.nn as nn

from tqdm import tqdm
from collections import deque
from skimage.color import rgb2grey
from matplotlib import pyplot as plt

In [2]:
class ReplayMemory:
    def __init__(self, length=int(1e5)):
        self.memory = deque(maxlen=length)
    
    def remember(self, state, action, reward, terminal, next_state):
        self.memory.append([state, action, reward, terminal, next_state])
    
    def retrieve(self, batch_size):
        if batch_size > self.depth:
            batch_size = self.depth
        
        return random.sample(self.memory, batch_size)
    
    @property
    def depth(self):
        return len(self.memory)

In [3]:
class DeepQNetwork(nn.Module):
    def __init__(self, num_frames, num_actions):
        super(DeepQNetwork, self).__init__()
        
        # Layers
        self.conv1 = nn.Conv2d(
            in_channels=num_frames,
            out_channels=32,
            kernel_size=3,
            stride=2,
            padding=1
            )
        self.conv2 = nn.Conv2d(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=2,
            padding=1
            )
        self.conv3 = nn.Conv2d(
            in_channels=64,
            out_channels=128,
            kernel_size=3,
            stride=2,
            padding=1
            )
        self.conv4 = nn.Conv2d(
            in_channels=128,
            out_channels=256,
            kernel_size=3,
            stride=2,
            padding=1
            )
        self.fc1 = nn.Linear(
            in_features=25600,
            out_features=512,
            )
        self.fc2 = nn.Linear(
            in_features=512,
            out_features=num_actions
            )
        
        # Activations
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
    
    def forward(self, x):
        
        # Forward pass
        x = self.relu(self.conv1(x))
        x = self.relu(self.conv2(x))
        x = self.relu(self.conv3(x))
        x = self.relu(self.conv4(x))
        x = self.relu(self.fc1(x))
        x = self.softmax(self.fc2(x))
        
        return x

In [4]:
class Agent:
    def __init__(self, DQN, memory_depth):
        self.DQN = DQN
        self.memory_depth = memory_depth
        self.memory = deque(maxlen=memory_depth)
    
    def clone(self, model):
        return copy.deepcopy(model)
    
    def remember(self, state, action, reward, terminal, next_state):
        self.memory.append([state, action, reward, terminal, next_state])
    
    def retrieve(self, batch_size):
        if batch_size > self.memories:
            batch_size = self.memories
        
        return random.sample(self.memory, batch_size)
    
    @property
    def memories(self):
        return len(self.memory)
    
    def act(self, state):
        q_values = self.DQN(state)
        action = np.argmax(q_values.numpy())
        return action

In [5]:
def process(state):
    state = rgb2grey(state[35:195, :, :])
    state = state[np.newaxis, :, :, np.newaxis]
    return to_tensor(state)

In [6]:
def epsilon_schedule(t):
    if t < anneal_time:
        return epsilon_i - t*(epsilon_i - epsilon_f)/anneal_time
    elif t >= anneal_time:
        return epsilon_f

In [7]:
# Hyperparameters

update_interval = 40
num_frames = 4
num_actions = 4
episodes = 100
memory_depth = int(1e5)
epsilon_i = 1.0
epsilon_f = 0.1
anneal_time = 10000
gamma = 0.9

In [8]:
model = DeepQNetwork(num_frames, num_actions)

In [9]:
agent = Agent(model, memory_depth)

In [10]:
cuda = True if torch.cuda.is_available() else False
to_tensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor

In [11]:
env = gym.make('Breakout-v0')

In [37]:
def q_iteration(episodes):
    
    for episode in range(episodes):
        
        state = env.reset()
        state = process(state)
        
        done = False
        t = 0

        while not done:
            
            env.render()
            
            while state.size()[-1] < num_frames:
                action = np.random.choice(num_actions)
                
                new_frame, reward, done, info = env.step(action)
                new_frame = process(new_frame)
                
                state = torch.cat([state, new_frame], -1)
                
            if np.random.uniform < epsilon_schedule(t):
                action = np.random.choice(num_actions)

            else:
                action = agent.act(state)

            new_frame, reward, done, info = env.step(action)
            new_frame = process(new_frame)
            
            new_state = torch.cat([state, new_frame], -1)
            new_state = new_state[:, :, :, 1:]

            agent.remember(state, action, reward, done, new_state)

            state = new_state
            t += 1
            
            if t % update_interval == 0:
                pass

            if done:
                print("Episode {}: Episode completed after {} timesteps".format(episode, t))

In [38]:
q_iteration(100)

TypeError: '<' not supported between instances of 'builtin_function_or_method' and 'float'