In [1]:
from skimage.transform import resize
import random
import numpy as np
import matplotlib.pyplot as plt
import math
import random
import pandas as pd
from sklearn.model_selection import ParameterGrid

# Catch Environment (from the assignment code)
from Catch import CatchEnv

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.nn import Module
from torch.nn import Conv2d
from torch.nn import Linear
from torch.nn import MaxPool2d
from torch.nn import ReLU
from torch.nn import LogSoftmax
from torch import flatten

# Pick the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Memory buffer, to store the trajectories

In [2]:
from collections import namedtuple, deque

# One replay-memory record: a state, the action taken in it, the state that
# followed, and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class MemoryBuffer:
    """Fixed-capacity replay memory holding `Transition` records.

    Once `capacity` records are stored, every new record evicts the oldest one.
    """

    def __init__(self, capacity):
        # A bounded deque silently drops its oldest item when it overflows.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a `Transition` from `args` and append it to the memory."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions, chosen at random.

        Raises ValueError (from `random.sample`) when fewer than `batch_size`
        transitions are stored.
        """
        return random.sample(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

### Make a Deep Q-Network

In [12]:
class DQN(nn.Module):
    """Convolutional Q-network: two conv + max-pool stages, then two dense layers.

    The first convolution expects 4 input channels and the first dense layer
    expects 576 flattened features, so the spatial size of the input must make
    the second pooling stage emit exactly 576 values per sample.
    `forward` returns one value per action for every sample in the batch.
    """

    def __init__(self, number_of_actions):
        super().__init__()
        # Convolutional feature extractor.
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=32, kernel_size=5, stride=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=2)
        # Dense head mapping the flattened features to one output per action.
        self.fc1 = nn.Linear(in_features=576, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=number_of_actions)

    def forward(self, x):
        # Each stage: convolution -> ReLU -> 2x2 max pooling.
        features = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        features = F.max_pool2d(F.relu(self.conv2(features)), kernel_size=2)
        # Collapse everything except the batch dimension.
        flat = features.view(features.size(0), -1)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)






# A model that is tiny, but runs much faster (for testing the code without waiting to train a good model)
class speedyDQN(nn.Module):
    """Deliberately tiny Q-network for quickly smoke-testing the training code.

    A single strided convolution (4 input channels, 1 output channel) and one
    2x2 max pool feed a single dense layer that expects 16 flattened features
    and returns one value per action.
    """

    def __init__(self, number_of_actions):
        super().__init__()
        # The large stride shrinks the input aggressively to keep the model cheap.
        self.conv1 = nn.Conv2d(in_channels=4, out_channels=1, kernel_size=3, stride=10)
        self.fc2 = nn.Linear(in_features=16, out_features=number_of_actions)

    def forward(self, x):
        pooled = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        # Collapse everything except the batch dimension before the dense layer.
        flat = pooled.view(pooled.size(0), -1)
        return self.fc2(flat)


In [4]:
class Agent(object):
    """Epsilon-greedy DQN agent trained from a replay memory with a target network.

    Two `DQN` instances are held: `model`, which is optimised, and
    `target_network`, a copy used to compute the TD targets. The copy is
    overwritten with `model`'s weights every `target_network_update_rate`
    calls to `optimize_model`.

    Args:
        num_moves: number of discrete actions.
        eps_start: initial exploration probability.
        eps_min: lower bound epsilon is clamped to while it decays.
        eps_decay: decay constant; epsilon = eps_start * exp(-episode / eps_decay).
        memory: replay memory offering `sample(batch_size)` and `len()`.
        batch_size: number of transitions used per optimisation step.
        learning_rate: learning rate of the Adam optimiser.
        amsgrad: passed to Adam's `amsgrad` flag.
        gamma: discount factor of the TD target.
        target_network_update_rate: steps between target-network overwrites.
    """

    def __init__(self, num_moves, eps_start, eps_min, eps_decay, memory, batch_size, learning_rate, amsgrad, gamma, target_network_update_rate):
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = memory
        self.num_possible_moves = num_moves
        self.epsilon = eps_start
        self.eps_start = eps_start
        self.eps_min = eps_min
        self.eps_decay = eps_decay

        # `episode` is not advanced in this class; whoever drives training must
        # increment it, since the epsilon schedule is a function of it.
        self.episode = 0
        # `steps` counts calls to optimize_model (including the early returns).
        self.steps = 0
        self.target_network_update_rate = target_network_update_rate

        self.model = DQN(self.num_possible_moves).to(device)
        self.target_network = DQN(self.num_possible_moves).to(device)
        self.target_network.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate, amsgrad=amsgrad)

    def select_action(self, state, testing=False):
        """Choose an action for `state` with an epsilon-greedy policy.

        Args:
            state: input batch for the model holding a single state.
            testing: when True the agent always exploits (never explores).

        Returns:
            A long tensor of shape (1, 1) holding the chosen action index.
        """
        if np.random.rand() <= self.epsilon and not testing:
            return torch.tensor([[random.randrange(self.num_possible_moves)]], device=device, dtype=torch.long)

        # Acting needs no gradients; skipping autograd avoids building a
        # computation graph on every environment step.
        with torch.no_grad():
            q_values = self.model(state)
        return q_values.max(1)[1].view(1, 1)

    def optimize_model(self):
        """Run one optimisation step on a batch sampled from the replay memory.

        Does nothing (apart from counting the step) until the memory holds at
        least `batch_size` transitions. Otherwise it minimises the Huber loss
        between Q(s, a) and `reward + gamma * max_a' Q_target(s', a')`, where
        the second term is 0 for terminal transitions, then updates the target
        network (when due) and epsilon.
        """
        self.steps += 1
        if len(self.memory) < self.batch_size:
            return

        transitions = self.memory.sample(self.batch_size)
        # Turn a list of Transitions into one Transition of tuples.
        batch = Transition(*zip(*transitions))

        # Terminal transitions are stored with next_state=None; mask them out.
        non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                      device=device, dtype=torch.bool)

        batch_of_states = torch.cat(batch.state)
        batch_of_actions = torch.cat(batch.action)
        batch_of_rewards = torch.cat(batch.reward)

        # Q(s_t, a_t) for the actions that were actually taken.
        state_action_values = self.model(batch_of_states).gather(1, batch_of_actions)

        # max_a Q_target(s_{t+1}, a), left at 0 for terminal transitions.
        next_state_values = torch.zeros(self.batch_size, device=device)
        # torch.cat raises on an empty list, so skip the target-network pass
        # when every sampled transition happens to be terminal.
        if non_final_mask.any():
            non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
            with torch.no_grad():
                next_state_values[non_final_mask] = self.target_network(non_final_next_states).max(1)[0]

        # Temporal-difference learning target.
        TD_targets = (batch_of_rewards + self.gamma * next_state_values).unsqueeze(1)

        # Huber loss between predictions and targets.
        criterion = nn.SmoothL1Loss()
        TD_loss = criterion(state_action_values, TD_targets)

        self.optimizer.zero_grad()
        TD_loss.backward()
        # Clip every gradient element to [-100, 100] before the update.
        torch.nn.utils.clip_grad_value_(self.model.parameters(), 100)
        self.optimizer.step()

        # Refresh the target network every `target_network_update_rate` steps.
        if self.steps % self.target_network_update_rate == 0:
            self.overwrite_target_network()

        self.update_eps()

    def update_eps(self):
        """Decay epsilon exponentially in the episode count, clamped at `eps_min`."""
        if self.epsilon > self.eps_min:
            self.epsilon = self.eps_start * np.exp(-self.episode / self.eps_decay)
            # keep the epsilon value from going below the minimum
            if self.epsilon < self.eps_min:
                self.epsilon = self.eps_min

    def overwrite_target_network(self):
        """Copy the current model weights into the target network."""
        self.target_network.load_state_dict(self.model.state_dict())

    def load(self, name):
        """Load model weights from the file `name`.

        The weights are mapped onto the current `device`, so a checkpoint saved
        on a GPU can be loaded on a CPU-only machine, and the target network is
        synchronised with the loaded weights.
        """
        self.model.load_state_dict(torch.load(name, map_location=device))
        self.overwrite_target_network()

    def save(self, name):
        """Save the model weights (its state dict) to the file `name`."""
        torch.save(self.model.state_dict(), name)

# GRID SEARCH
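The first cell below lists the candidate values for each hyperparameter, followed by the run settings (fraction of the grid to sample, output file, number of episodes):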

In [5]:
# Candidate values for each hyperparameter of the search.
BATCH_SIZE = [32, 128]
GAMMA = [0.8, 0.99]
EPS_START = [0.8, 1]
EPS_END = [0, 0.05, 0.1]
EPS_DECAY = [1000, 2000, 3000]
LR = [1e-3, 1e-4]
MEMORYBUFFER = [1000, 5000]
AMSGRAD = [True, False]
TARGETNET_UPDATE_RATE = [10, 50]

# The keyword names below become the keys of every parameter dict in the grid.
hyper_grid = dict(
    batch_size=BATCH_SIZE,
    gamma=GAMMA,
    eps_start=EPS_START,
    eps_end=EPS_END,
    eps_decay=EPS_DECAY,
    learning_rate=LR,
    memory_buffer=MEMORYBUFFER,
    ams_grad=AMSGRAD,
    targetnet_update_rate=TARGETNET_UPDATE_RATE,
)

# Expand to every combination, then shuffle so that a run which is cut short
# has still covered a varied part of the grid.
grid = list(ParameterGrid(hyper_grid))
random.shuffle(grid)
print(len(grid))

# AMOUNT OF GRID TO SAMPLE
# (the search loop compares this against random.random() to decide whether a
# combination is run or skipped)
gridSampleSize = 0.6

# Identifier written to the results for the network architecture in use; the
# current architecture is number 1 in case more are designed later.
DQN_model = 1
output_filename = "Grid_search_0.xlsx"

# Long runs only when a GPU is available; a short run otherwise.
num_episodes = 5000 if torch.cuda.is_available() else 50

1152


In [13]:
# GRID SEARCH LOOP
#
# For every sampled hyperparameter combination: build a fresh agent, train it
# for up to `num_episodes` episodes, summarise the final-step rewards in blocks
# of 10 episodes, and append one row of results to RESULTS_DATAFRAME (which is
# re-saved to `output_filename` after every trial).

env = CatchEnv()
num_moves = env.get_num_actions()


RESULTS_DATAFRAME = pd.DataFrame(columns=["DQN_model",'batch_size', 'gamma', 'eps_start', 'eps_end', 'eps_decay', 'learning_rate', 'memory_buffer', 'ams_grad', 'targetnet_update_rate',
                                          "avgRewards", "average_last_100_episodes", "best_average_100_episodes", "time_of_peak", "time_to_convergence"])


idx = 0
for params in grid:

    # random sampling: each combination is skipped when the draw exceeds gridSampleSize
    if random.random() > gridSampleSize:
        continue

    # make the agent and memory buffer using the parameters
    memoryBuffer = MemoryBuffer(params['memory_buffer'])
    agent = Agent(num_moves, params['eps_start'], params['eps_end'], params['eps_decay'], memoryBuffer, params['batch_size'], params['learning_rate'], params['ams_grad'], params['gamma'], params['targetnet_update_rate'])

    # Report the number of trainable weights. This must not be stored in
    # `params`: that name holds the hyperparameter dict needed for the results
    # row below, and rebinding it made the row construction fail.
    num_trainable_params = sum(p.numel() for p in agent.model.parameters() if p.requires_grad)
    print(num_trainable_params)

    # bookkeeping for this trial (one set of parameters)
    idx += 1
    RewardsList = []         # one entry per block of 10 episodes: mean final reward
    tempReward = []          # final rewards of the episodes in the current block
    running_avg = None       # mean of the last (up to) 10 blocks, i.e. up to 100 episodes
    time_to_convergence = None
    best_average = 0
    best_episode = None

    for episode in range(num_episodes):
        agent.episode += 1

        state = env.reset()
        # reorder the axes (2, 0, 1) and add a leading batch dimension for the network
        state = torch.tensor(state, dtype=torch.float32, device=device).permute(2, 0, 1).unsqueeze(0)

        terminal = False
        while not terminal:
            # agent interacts with the environment
            action = agent.select_action(state)
            next_state, reward, terminal = env.step(action.item())

            # turn everything into tensors here, before putting in memory
            reward = torch.tensor([reward], device=device)
            if terminal:
                # terminal transitions are stored without a next state
                next_state = None
                tempReward.append(reward.item())
            else:
                next_state = torch.tensor(next_state, dtype=torch.float32, device=device).permute(2, 0, 1).unsqueeze(0)

            # add transition to memory buffer and move to the next state
            agent.memory.push(state, action, next_state, reward)
            state = next_state

            # optimise the DQN model
            agent.optimize_model()

        # Summarise after every 10 completed episodes. Counting completed
        # episodes (episode + 1) keeps every block at exactly 10 episodes;
        # testing the 0-based index put 11 episodes in the first block.
        episodes_done = episode + 1
        if episodes_done % 10 == 0:
            # store the rewards of the last 10 training episodes (no separate testing here, saves some running time)
            RewardsList.append(sum(tempReward) / len(tempReward))
            tempReward = []

            # Mean over the last (up to) 10 blocks. Dividing by the real window
            # length avoids understating the average early in a run.
            window = RewardsList[-10:]
            running_avg = sum(window) / len(window)
            if running_avg > best_average:
                best_average = running_avg
                best_episode = episodes_done

            # early stopping: once a full 100-episode window averages above
            # 0.9 we have probably converged, so stop to save time
            if len(window) == 10 and running_avg > 0.9:
                time_to_convergence = episodes_done
                break

    # store the results in a dataframe, making a new row for this trial here
    tempDict = {"DQN_model" : DQN_model,
                "avgRewards" : RewardsList,
                "average_last_100_episodes" : running_avg,
                "best_average_100_episodes" : best_average,
                "time_of_peak" : best_episode,
                "time_to_convergence" : time_to_convergence}

    resultsDict = {**params, **tempDict}  # hyperparameters + outcomes for this trial

    RESULTS_DATAFRAME.loc[idx] = resultsDict

    RESULTS_DATAFRAME.to_excel(output_filename) # saves on every iteration (in case this takes long, or crashes, we can still pull the results out)




27544739


KeyboardInterrupt: 