In [1]:
import copy

import PIL.Image

import gym

from IPython import display

import numpy as np

import torch
from torch import nn, optim
from torch.autograd import Variable

### Image Processing

In [2]:
# TODO: Process only one image
def phi_map(image_list):
    """Preprocess a sequence of raw frames into one stacked network input.

    Every frame is converted to 8-bit luminance ('L' mode) and resized to
    84x84, then the processed frames are combined by ``tuple_to_numpy``.

    Args:
        image_list: Sequence of arrays accepted by ``PIL.Image.fromarray``
            (presumably the raw emulator frames -- confirm against caller).
            Its length is the number of stacked frames.

    Returns:
        The array built by ``tuple_to_numpy`` from the processed frames.
    """
    processed = []
    for frame in image_list:
        # Load single image as PIL and convert to Luminance
        im = PIL.Image.fromarray(frame).convert('L')
        # Resize image. ANTIALIAS was removed in Pillow 10; LANCZOS is the
        # same filter under its current name.
        im = im.resize((84, 84), PIL.Image.LANCZOS)
        # Transform to numpy array and keep it for stacking
        processed.append(np.array(im))

    # Return tensor of processed images
    return tuple_to_numpy(tuple(processed))

def tuple_to_numpy(im_tuple):
    """Stack images depth-wise and return them with a leading batch axis.

    The images are joined with ``np.dstack``, the depth axis is moved to the
    front, and a new axis of length 1 is prepended. For 2D inputs of shape
    (height, width) the result has shape (1, depth, height, width).
    """
    # (height, width, depth)
    stacked = np.dstack(im_tuple)
    # (depth, height, width)
    depth_first = np.moveaxis(stacked, source=-1, destination=0)
    # (1, depth, height, width)
    return depth_first[np.newaxis, ...]

### Model

In [3]:
class DeepQNetwork(nn.Module):
    """Convolutional action-value network: three conv layers and two linear layers.

    The first convolution expects 4 input channels and the first linear layer
    expects a 64 x 7 x 7 feature map, which is what the convolution stack
    produces for 84x84 inputs (84 -> 20 -> 9 -> 7).

    Args:
        num_actions: Number of outputs, one Q-value per action.
    """

    def __init__(self, num_actions):
        super(DeepQNetwork, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(4, 32, kernel_size=8, stride=4),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU()
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )
        self.hidden = nn.Sequential(
            nn.Linear(64 * 7 * 7, 512),
            nn.ReLU()
        )
        # The output layer must be linear: Q-values are estimates of returns
        # and can be negative, so a trailing ReLU would clamp them at zero.
        # Kept inside nn.Sequential so parameter names stay `out.0.*`.
        self.out = nn.Sequential(
            nn.Linear(512, num_actions)
        )

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a batch of inputs."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # Flatten everything except the batch dimension
        x = x.view(x.size(0), -1)
        x = self.hidden(x)
        return self.out(x)

In [4]:
class ReplayMemory():
    """Fixed-capacity ring buffer of (phi_t, action, reward, phi_t_plus1, terminates).

    Each tuple element is stored in its own pre-allocated numpy array. Once
    the buffer is full, new experiences overwrite the oldest ones.

    Args:
        N: Maximum number of experiences held.
        image_shape: Shape of a single stored state.
        frame_dtype: dtype of the two state arrays. Defaults to float64 (the
            previous behaviour); a narrower dtype such as ``np.uint8`` reduces
            memory use proportionally when the states fit in it.
    """

    def __init__(self, N, image_shape=(4, 84, 84), frame_dtype=np.float64):
        self.N = N
        # Next position in arrays to be used
        self.index = 0

        # One np array for each tuple element
        state_shape = (N, ) + tuple(image_shape)
        self.phi_t = np.zeros(state_shape, dtype=frame_dtype)
        self.action = np.zeros(N)
        self.reward = np.zeros(N)
        self.phi_t_plus1 = np.zeros(state_shape, dtype=frame_dtype)
        self.terminates = np.zeros(N)

        # True once every slot has been written at least once
        self.full = False

    def __len__(self):
        """Number of experiences currently stored."""
        return self.N if self.full else self.index

    def add(self, experience):
        '''
        This operation adds a new experience e, replacing the earliest experience if full.

        experience: indexable as (phi_t, action, reward, phi_t_plus1, terminates).
        '''
        self.phi_t[self.index] = experience[0]
        self.action[self.index] = experience[1]
        self.reward[self.index] = experience[2]
        self.phi_t_plus1[self.index] = experience[3]
        self.terminates[self.index] = experience[4]

        # Update value of next index
        self.index = (self.index + 1) % self.N

        # Update 'full' when array is full (the write position wrapped around)
        if not self.full and self.index == 0:
            self.full = True

    def sample(self, size):
        """Draw `size` stored experiences uniformly at random, with replacement.

        Returns:
            Tuple of arrays (phi_t, action, reward, phi_t_plus1, terminates),
            each with leading dimension `size`.
        """
        # Use the instance capacity, not a module-level global named N
        num_items = len(self)
        idxs = np.random.choice(num_items, size)
        return (
            self.phi_t[idxs],
            self.action[idxs],
            self.reward[idxs],
            self.phi_t_plus1[idxs],
            self.terminates[idxs],
        )
        
        
class History():
    """Sliding window over the most recent `max_size` (4) items."""

    def __init__(self):
        self.max_size = 4
        self.list = []

    def add(self, ex):
        """Append `ex`, discarding the oldest item when the window is full."""
        window_is_full = len(self.list) >= self.max_size
        if window_is_full:
            # Drop the oldest entry so the remaining ones shift left by one
            del self.list[0]
        # The newest entry always ends up in the last position
        self.list.append(ex)

    def get(self):
        """Return the underlying list, ordered oldest to newest."""
        return self.list
        

In [39]:
def to_variable(arr):
    """Wrap a numpy array as a float32 torch ``Variable``."""
    tensor = torch.from_numpy(arr)
    as_float = tensor.float()
    return Variable(as_float)

def initial_history(env):
    """Reset `env` and return a History filled with copies of the first frame.

    Args:
        env: Environment exposing ``reset()``.

    Returns:
        A ``History`` whose ``max_size`` slots all hold the initial observation.
    """
    result = env.reset()
    # Newer gym versions return (observation, info) from reset(); older ones
    # return the bare observation. Blindly taking [0] of a bare observation
    # would keep only its first row, so unpack only the 2-tuple form.
    is_obs_info_pair = (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[1], dict)
    )
    s = result[0] if is_obs_info_pair else result

    H = History()
    for _ in range(H.max_size):
        H.add(s)
    return H

def e_greedy_action(Q, phi, env, frame_count,
                    initial_epsilon=1.0, final_epsilon=.1, max_frames=1e7):
    """Pick an action epsilon-greedily with a linearly annealed epsilon.

    Epsilon decreases linearly from `initial_epsilon` to `final_epsilon` over
    `max_frames` frames and stays at `final_epsilon` afterwards.

    Args:
        Q: Callable mapping a state tensor to Q-values of shape (1, num_actions).
        phi: Current state as a numpy array.
        env: Environment exposing ``action_space.sample()``.
        frame_count: Number of frames seen so far.
        initial_epsilon: Epsilon at frame 0.
        final_epsilon: Lower bound for epsilon.
        max_frames: Number of frames over which epsilon is annealed.

    Returns:
        The random branch returns whatever ``env.action_space.sample()`` gives;
        the greedy branch returns the argmax action as a Python int.
    """
    # Calculate annealed epsilon
    decay_per_frame = (initial_epsilon - final_epsilon) / float(max_frames)
    epsilon = max(final_epsilon, initial_epsilon - frame_count * decay_per_frame)
    print('Epsilon: {}'.format(epsilon))
    # Obtain a random value in range [0,1)
    rand = np.random.uniform()
    # With probability e select random action a_t
    if rand < epsilon:
        return env.action_space.sample()
    # Otherwise select a_t = argmax_a Q(phi, a)
    best_action = Q(to_variable(phi)).max(1)[1].data
    # Return a plain int rather than a tensor so both branches hand the
    # caller the same kind of value.
    return int(best_action.view(-1)[0])
    
def update_target_network(Q):
    """Return an independent deep copy of `Q` to serve as the target network."""
    target = copy.deepcopy(Q)
    return target

def approximate_targets(phi_plus1_mb, r_mb, done_mb, Q_, gamma=0.99):
    '''
    Compute the targets y_j = r_j + gamma * max_a Q_(phi_(j+1), a) * (1 - done_j).

    phi_plus1_mb: minibatch of next states (numpy array)
    r_mb: minibatch of rewards (numpy array)
    done_mb: minibatch of termination flags, 1 where the episode ended
    Q_: target network, called on the next states
    gamma: future reward discount factor
    '''
    # Target-network Q-values, cut off from the autograd graph
    next_q = Q_(to_variable(phi_plus1_mb)).detach()
    best_next_q = next_q.max(1)[0]
    # 0 if ep. terminates at step j+1, 1 otherwise
    not_done = to_variable(1 - done_mb)
    rewards = to_variable(r_mb)
    return rewards + (gamma * best_next_q) * not_done

def gradient_descent(optimizer, loss_func, y, Q, phi_mb, action_mb, mb_size):
    """Run one optimisation step on loss_func(Q(phi_j)[a_j], y_j).

    Args:
        optimizer: Optimizer holding the parameters of `Q`.
        loss_func: Callable ``loss_func(prediction, target)`` returning a scalar.
        y: Target values, one per minibatch entry.
        Q: Network being trained.
        phi_mb: Minibatch of states (numpy array).
        action_mb: Minibatch of action indices; may be a float array.
        mb_size: Number of entries in the minibatch.
    """
    # Index tensors must be integral; actions may arrive as floats, so cast
    # them before using them to select one Q-value per row.
    rows = torch.from_numpy(np.arange(mb_size, dtype=np.int64))
    cols = torch.from_numpy(np.asarray(action_mb).astype(np.int64))

    # Calculate Q(phi) of actions in [action_mb]
    q_phi = Q(to_variable(phi_mb))[rows, cols]
    # NOTE: clipping the error to [-1, 1] is currently disabled.

    # Clear previous gradients before backward pass
    optimizer.zero_grad()

    # Run backward pass; the target is treated as a constant
    error = loss_func(q_phi, y.detach())
    error.backward()

    # Perform the update
    optimizer.step()

In [40]:
# Create the 'Pong-v0' gym environment; later cells reference it as `env`.
env = gym.make('Pong-v0')

In [None]:
# --- Hyperparameters ---
NUM_EPISODES = 500
MINIBATCH_SIZE = 32
T = 10000000  # Upper bound on steps per episode
N = int(1e6)  # Replay Memory size: 1M
C = 10000  # Target network update frequency (in frames)
k = 4  # Agent History Length
frame_count = 0
ep_reward_list = []
# Sum of squared errors over the minibatch
loss_func = torch.nn.MSELoss(reduction='sum')

# Initialize replay memory D to capacity N
D = ReplayMemory(N)
# Initialize action-value function Q with random weights, one output per action
Q = DeepQNetwork(env.action_space.n)
optimizer = optim.RMSprop(
    Q.parameters(), lr=0.00025, momentum=0.95, alpha=0.95, eps=.01
)
# Kept as an alias so any cell that refers to `loss` still works
loss = loss_func
# Initialize target action-value function Q^ with weights
Q_ = update_target_network(Q)

for ep in range(NUM_EPISODES):
    # Initialize sequence s1 = {x1} and preprocessed sequence phi = phi(s1)
    H = initial_history(env)
    phi = phi_map(H.get())

    ep_reward = 0.0
    ep_num_rewards = 0.0

    for t in range(T):
        env.render(mode='human')
        # Select action
        action = e_greedy_action(Q, phi, env, frame_count)
        # Execute action a_t in emulator and observe reward r_t and image x_(t+1)
        image, reward, done, _ = env.step(action)
        if reward != 0:
            ep_num_rewards += 1
            ep_reward += reward
        frame_count += 1
        # Set s_(t+1) = s_t, a_t, x_(t+1) and preprocess phi_(t+1) =  phi_map( s_(t+1) )
        H.add(image)
        phi_prev, phi = phi, phi_map(H.get())
        # Store transition (phi_t, a_t, r_t, phi_(t+1), done) in D. This must
        # happen before leaving the loop on `done`, otherwise terminal
        # transitions never reach the replay memory and the (1 - done) mask
        # in the targets is always 1.
        D.add((phi_prev, action, reward, phi, done))
        if t % 4 == 0:
            # Sample random minibatch of transitions ( phi_j, a_j, r_j, phi_(j+1)) from D
            phi_mb, a_mb, r_mb, phi_plus1_mb, done_mb = D.sample(MINIBATCH_SIZE)
            # Set y_j
            y = approximate_targets(phi_plus1_mb, r_mb, done_mb, Q_)
            # Perform a gradient descent step on ( y_j - Q(phi_j, a_j) )^2
            gradient_descent(optimizer, loss_func, y, Q, phi_mb, a_mb, MINIBATCH_SIZE)
        # Reset Q_ every C frames. The global frame counter is used because
        # the per-episode step counter `t` restarts at 0 each episode, which
        # would re-sync the target network at every episode start.
        if frame_count % C == 0:
            Q_ = update_target_network(Q)
        # -- LOGS
        display.clear_output(True)
        print('Frame #: {}'.format(frame_count))
        print('Ep. {} Reward: {}'.format(ep, ep_reward))
        print('Eps. Rewards: {}'.format(ep_reward_list))
        # -- \LOGS
        # Restart game if done
        if done:
            break
    ep_reward_list.append(ep_reward)

Frame #: 15436
Ep. 12 Reward: -13.0
Eps. Rewards: [-19.0, -20.0, -21.0, -21.0, -20.0, -21.0, -19.0, -21.0, -21.0, -21.0, -20.0, -21.0]
Epsilon: 0.99861076
