## Libraries

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from collections import deque
import skimage.measure
import numpy as np
import gym
from gym import wrappers

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

## Preprocessing

In [3]:
def preprocess(rgb_tensor):
    '''
    Reduce a raw 3D RGB numpy frame to a small 2D grayscale image.

    Steps: keep rows 30..193 of the frame, collapse the first three channels into
    a single luminance channel, then take the max over non-overlapping 2x2 blocks.

    Returns the downsampled numpy array; no PyTorch conversion happens here.
    '''
    luma_weights = [0.2989, 0.5870, 0.1140]  ## using Matlab's formula
    playfield = rgb_tensor[30:194, :, :]
    gray = np.dot(playfield[..., :3], luma_weights)
    return skimage.measure.block_reduce(gray, (2, 2), np.max)

## Agent Select Action

In [4]:
class Action:
    '''Epsilon-greedy action selector; action() returns an int action id in range [0,3].

    At call time action() reads the notebook-level globals `eg` (for epsilon_),
    `env` (to sample random actions), and `cnn` / `initial_seq` (for greedy actions).
    '''

    def __init__(self):
        self.time_ = 0
        # Default to "fire" so action() is well-defined even when update_time()
        # has been called before the first action() call.
        self.action_ = 1

    def update_time(self):
        '''Advance the internal step counter by one.'''
        self.time_ += 1

    def action(self):
        '''Return the action for the current time step.

        Step 0 always fires the ball. On every 4th step afterwards a fresh
        epsilon-greedy action is chosen; on all other steps the previously
        chosen action is repeated without consulting the network.
        '''
        if self.time_ == 0:
            self.action_ = 1  ## start game by firing ball
        elif self.time_ % 4 == 0:
            if np.random.binomial(n=1, p=eg.epsilon_, size=1):
                self.action_ = env.action_space.sample()  ## take random action
            else:
                # add batch and channel dimensions before the forward pass
                state = Variable(torch.Tensor(initial_seq).unsqueeze(0).unsqueeze(0))
                q_values = cnn(state).data
                # index of the largest Q-value, coerced to a plain Python int
                self.action_ = int(q_values.max(1)[1][0])
        return self.action_

## Experience Replay

In [6]:
class ExperienceReplay:
    '''Fixed-capacity FIFO buffer of experience tuples with uniform random sampling.'''

    def __init__(self, C):
        '''C is the maximum number of experiences kept (None means unbounded).'''
        self.capacity_ = C
        # Per-instance buffer bounded by C; once full, appending evicts the oldest entry.
        self.dq_ = deque(maxlen=C)

    def add_experience(self, experience_tuple):
        '''add new experience to experience replay'''
        self.dq_.append(experience_tuple)

    def sample(self, capacity=32):
        '''sample from experience replay

        Returns up to `capacity` stored experiences, drawn without replacement
        in random order. Returns an empty list when nothing has been stored.
        '''
        nb_items = len(self.dq_)
        if nb_items == 0:
            return []
        idx = np.random.choice(nb_items, size=min(nb_items, capacity), replace=False)
        return [self.dq_[i] for i in idx]

## Epsilon Generator

In [7]:
class EpsilonGenerator():
    '''Linear epsilon schedule: epsilon_ moves from `start` toward `stop` in equal decrements.'''

    def __init__(self, start, stop, steps):
        self.count_ = 1
        self.steps_ = steps
        self.stop_ = stop
        self.epsilon_ = start
        # size of one decrement so that `steps` decrements span start -> stop
        self.step_size_ = (start - stop) / steps

    def epsilon_update(self):
        '''generate next epsilon value'''
        # decide on the pre-increment counter, exactly one decision per call
        still_annealing = self.epsilon_ >= self.stop_ and self.count_ < self.steps_
        self.count_ += 1
        if still_annealing:
            self.epsilon_ = self.epsilon_ - self.step_size_
        else:
            # schedule exhausted (or epsilon dipped below the floor): pin to stop
            self.epsilon_ = self.stop_

## CNN Architecture

In [8]:
class CNN(nn.Module):
    '''Two conv layers followed by two fully connected layers, producing 4 outputs per input.

    The flatten step assumes the second conv layer yields 32 feature maps of size 4x4.
    '''

    def __init__(self,):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=4, stride=4)
        self.fc1 = nn.Linear(in_features=32 * 4 * 4, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=4)

    def forward(self, x):
        '''Run the network; the final layer is linear (no activation).'''
        feature_maps = F.relu(self.conv2(F.relu(self.conv1(x))))
        flat = feature_maps.view(-1, 32 * 4 * 4)  ## one row per sample
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

## Create Dataset

In [9]:
class Dataset:
    '''Builds network inputs and Q-learning targets from the notebook-level `minibatch`.

    Each experience is indexed as (state, action, reward, next_state, status), where
    status is the string 'terminal' or 'nonterminal'.
    '''

    def __init__(self):
        self.replay_size_ = None
        self.data_ = None
        self.target_ = None

    def get_data(self):
        '''Populate replay_size_, data_ and target_ from the current global `minibatch`.

        data_ stacks the stored states with a channel axis inserted at position 1.
        target_ holds `reward` for terminal transitions and
        `reward + discount * max_a Q(next_state, a)` otherwise, using the global
        `cnn` and `discount`.
        '''
        self.replay_size_ = len(minibatch)
        # create tensor of initial observations
        states = np.array([experience[0] for experience in minibatch])
        self.data_ = Variable(torch.Tensor(states).unsqueeze(1))
        # create tensor of corresponding target variable values
        target_list = []
        for experience in minibatch:
            reward, next_state, status = experience[2], experience[3], experience[4]
            if status == 'terminal':
                target_list.append(reward)
            else:
                observed = Variable(torch.Tensor(next_state))
                q_next = cnn(observed.unsqueeze(0).unsqueeze(0)).data
                # max(1)[0] is the largest Q-value itself; max(1)[1] would be its index
                target_list.append(reward + discount * float(q_next.max(1)[0][0]))
        self.target_ = Variable(torch.Tensor(target_list))

## Training

In [10]:
# Run configuration read by the training cell.
num_games = 2  ## number of games to play
time_steps = 500  ## max number of time steps per game
record = 0  ## truthy -> wrap the environment in wrappers.Monitor
view = 1  ## truthy -> call env.render() on every time step

In [11]:
# Atari emulator
env = gym.make('Breakout-v0')
# whether to record training
if record:
    env = wrappers.Monitor(env, 
                           directory='/Users/davidziganto/Data_Science/PyTorch/OpenAI_vids/breakout-experiment-1', 
                           video_callable=None, ## takes video when episode number is perfect cube
                           force=True)

# instantiate key classes
cnn = CNN()
er = ExperienceReplay(C=100)
eg = EpsilonGenerator(start=1, stop=0.1, steps=1000)
agent = Action()
dataset = Dataset()

# setup variables
discount = 0.9  
learning_rate = 0.01

# CNN setup
criterion = nn.MSELoss()
optimizer = optim.RMSprop(cnn.parameters(), 
                          lr=learning_rate, 
                          alpha=0.99, 
                          eps=1e-08, 
                          weight_decay=0, 
                          momentum=0, 
                          centered=False)

# play game; the finally clause releases the emulator even if training is interrupted
try:
    for episode in range(num_games):

        ## start/reset environment + store observation
        initial_seq = preprocess(env.reset())

        ## restart the agent clock so every game (not only the first) begins with "fire"
        agent.time_ = 0

        for t in range(time_steps):

            ## show game in real-time
            if view:
                env.render()

            # take action (0=do nothing; 1=fire ball; 2=move right; 3=move left)
            action = agent.action()
            agent.update_time()

            # update epsilon for epsilon-greedy implementation
            eg.epsilon_update()

            # get feedback from emulator
            observation, reward, done, info = env.step(action)

            # preprocess new observation post action
            final_seq = preprocess(observation)

            # stop if no more moves, else continue and update
            if done:
                er.add_experience((initial_seq, action, reward, final_seq, 'terminal'))
                break
            else:
                er.add_experience((initial_seq, action, reward, final_seq, 'nonterminal'))

            # get mini-batch sample from experience replay (fyi - randomizes index)
            minibatch = er.sample()

            # get data for updating policy network
            dataset.get_data()

            # Update CNN
            optimizer.zero_grad()  ## zero the parameter gradients
            ## Q(s, a) for the action actually taken in each sampled transition;
            ## regressing max_a Q(s, a) instead would update the wrong action's value
            taken = Variable(torch.LongTensor([int(experience[1]) for experience in minibatch]))
            q_all = cnn(dataset.data_)  ## feedforward pass
            outputs = q_all.gather(1, taken.unsqueeze(1)).squeeze(1)
            loss = criterion(outputs, dataset.target_)  ## calculate loss
            loss.backward()  ## backpropagation
            optimizer.step()  ## update network weights

            # set new observation as initial observation
            initial_seq = final_seq
finally:
    env.close()

[2017-08-30 15:31:26,321] Making new env: Breakout-v0


# To Do...

1) normalize pixel values (preprocessing)  
2) track and plot the loss to show learning over time  
3) add GPU functionality

## Stats for 1000 games taking random actions

In [None]:
env = gym.make('Breakout-v0')

rewards = []  ## total reward of each finished game
steps = []  ## zero-based index of the step on which each finished game ended

for game in range(1000):
    
    myreward = 0
    
    # reset game
    env.reset()
    
    for t in range(1000):
        
        # show in real-time
        #env.render()
        
        # fire on the first step, then take random actions
        action = 1 if t == 0 else env.action_space.sample()
        
        # get feedback
        observation, reward, done, info = env.step(action)
        myreward += reward
        
        # end game when out of balls (games still running after 1000 steps are not recorded)
        if done:
            rewards.append(myreward)
            steps.append(t)
            break

# close the emulator once, after all games, so later resets never hit a closed env
env.close()

# materialise the pairs so `stats` can be read more than once
stats = list(zip(steps, rewards))

In [None]:
# largest value currently stored in `rewards`
max(rewards)

In [None]:
import seaborn as sns
# Plot the distribution of the values in `rewards`; the trailing ';' suppresses the returned object's repr.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 — consider histplot/displot when upgrading.
sns.distplot(rewards);

#### Save Model

In [None]:
# Persist only the network's parameters (its state_dict), not the whole module object.
# NOTE(review): the path is absolute and user-specific, and the "500e_10000t" suffix does not
# match num_games = 2 / time_steps = 500 set in the training configuration — confirm before reuse.
torch.save(cnn.state_dict(), '/Users/davidziganto/Data_Science/PyTorch/DL_models/DL_RL_Atari_breakout_500e_10000t')

#### Load Model

In [None]:
#cnn = CNN()
#cnn.load_state_dict(torch.load('/Users/davidziganto/Data_Science/PyTorch/DL_models/DL_RL_Atari_breakout'))

# EXAMPLE

### Get Frames

In [None]:
frames = []  ## after the loop: the most recently completed group of preprocessed frames
rewards = []
nb_frames = 500
env = gym.make('Breakout-v0')
env.reset()
pending = []  ## frames collected for the group currently being built
for t in range(nb_frames):
    env.render()
    action = env.action_space.sample() # take a random action
    observation, reward, done, info = env.step(action)
    pending.append(preprocess(observation))
    if t%4 == 3 or done:
        frameTensor = np.stack(pending)
        minibatch = Variable(torch.Tensor(frameTensor))  ## convert to torch Variable data type
        print('t:', t, '\n', minibatch)
        # keep the finished group so the cells below have frames to display,
        # then start a fresh buffer for the next group
        frames = pending
        pending = []
    if done:
        break
env.close()

### Show Preprocessed Data Frames

In [None]:
# render each preprocessed frame as its own grayscale figure
gray_cmap = plt.get_cmap('gray')
for frame in frames:
    plt.imshow(frame, cmap=gray_cmap)
    plt.show()

### Frame Dimensions

In [None]:
# shape of the last frame bound by the display loop above (requires `frames` to have been non-empty)
frame.shape