<a href="https://colab.research.google.com/github/darshan-hindocha/lab/blob/main/deep_reinforcement_learning_on_atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Introduction

In this implementation I use a Deep Q-Network to learn the best policy for the Atari game 'freeway', through OpenAI's Gym library.

I supplement the Deep Q-Network with Experience Replay and Batch Sampling. I use an epsilon-greedy type explore-exploit policy.

I use TensorFlow (Keras in particular) to build the function approximator that replaces the Q-table used in tabular (non-deep) reinforcement learning.

## Run on Colab

In [None]:
# @title Installing ATARI
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1
!pip install gym[atari] > /dev/null 2>&1

In [None]:
#@title Imports
import time
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only
import tensorflow as tf
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import math
import glob
import io
import base64
from IPython.display import HTML
from skimage.transform import resize
from skimage.color import rgb2gray
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()



import skimage.transform
import skimage.measure
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import RMSprop
import keras


In [None]:
def wrap_env(env,ep_num='sample'):
  """Wrap `env` in a gym `Monitor` that records into `./video<ep_num>`.

  `force=True` is passed to `Monitor` for the target directory.
  Returns the wrapped environment.
  """
  video_dir = f'./video{ep_num}'
  return Monitor(env, video_dir, force=True)

### Pre-Processing

In [None]:
def grayscale_resize(image):
  """Turn a raw RGB frame into a 60x60x1 grey-scale array.

  Only rows 0..194 of the frame are kept; the crop is converted to
  grey-scale, resized to 60x60 and given a trailing channel axis.
  """
  cropped = image[:195, :, :]
  gray = rgb2gray(cropped)
  shrunk = resize(gray, (60, 60), mode='constant')
  return np.reshape(shrunk, (60, 60, 1))

In [None]:
def build_cnn():
  """Build and compile the CNN used to estimate one action's value.

  Input is a 60x240x1 image. Three Conv2D(3x3, relu) + MaxPooling2D(2)
  stages with 16, 32 and 64 filters feed a 512-unit relu Dense layer,
  a Dropout(0.5) and a single linear output unit. The model is compiled
  with mean-absolute-error loss and Adam at learning rate 0.001.
  """
  frames_in = layers.Input(shape=(60, 240, 1))

  # Convolutional feature extractor: filters double at every stage.
  features = frames_in
  for n_filters in (16, 32, 64):
    features = layers.Conv2D(n_filters, 3, activation='relu')(features)
    features = layers.MaxPooling2D(2)(features)

  # Fully connected head producing a single scalar.
  hidden = layers.Flatten()(features)
  hidden = layers.Dense(512, activation='relu')(hidden)
  hidden = layers.Dropout(0.5)(hidden)
  value_out = layers.Dense(1)(hidden)

  network = Model(frames_in, value_out)
  network.compile(
    loss='mean_absolute_error',
    optimizer=tf.keras.optimizers.Adam(0.001),
  )
  return network

### Function Approximator CNN
This is the function approximator that maps state-action pairs to values. It needs three main parts: initialisation, prediction, and an update method.

- Initialise: set up the function arbitrarily.
- Predict: take in a state (a stack of four frames) and return the estimated value of each action.
- Update: take in states, actions, and actual values. Each actual value is compared to the predicted value for that state and action to create a numeric loss, which is used to update the weights.

In [None]:
class value_function():
  """Action-value approximator that keeps one CNN per action.

  `models[i]` is the network trained on experiences whose action was `i`.
  `untrainable_models[i]` is a replica reloaded from the copy of
  `models[i]` saved to disk; predictions are served from the replicas.
  """

  def __init__(self, n_actions=3):
    '''
    Build one CNN per action.

    n_actions - number of discrete actions (defaults to 3).
    '''
    self.n_actions = n_actions
    self.models = np.empty(n_actions,dtype=object)
    self.untrainable_models = np.empty(n_actions,dtype=object)
    for i in range(n_actions):
      self.models[i] = build_cnn()

  @staticmethod
  def _stack_frames(frames):
    '''Place four 60x60x1 frames side by side as one 60x240x1 image.'''
    return np.hstack(frames).reshape(60,240,1)

  def _refresh_replica(self,i):
    '''Save model `i` to the directory "model<i>" and reload it as the prediction replica.'''
    self.models[i].save(f"model{i}")
    # The networks are tensorflow.keras models, so reload them through the
    # same API rather than through the standalone keras package.
    self.untrainable_models[i] = tf.keras.models.load_model(f"model{i}")

  def init_models(self,states):
    '''
    Fit every model once on `states` with a target of 0 and create the
    prediction replicas.

    states - sequence of four 60x60x1 frames.
    '''
    input_states = self._stack_frames(states)[np.newaxis]
    for i in range(self.n_actions):
      self.models[i].fit(x=input_states,y=np.array([0]),epochs=1,verbose=0)
      self._refresh_replica(i)

  def update(self,memory_box):
    '''
    Memory Box - the list of experiences [obs0,obs1,obs2,obs3,action,td]

    Each experience trains only the model of the action it contains;
    models whose action does not occur in the batch are left untouched.
    '''
    ## Decide which experiences should update which model
    all_actions = np.array([exp[4] for exp in memory_box])

    for i in range(self.n_actions):
      idx = np.where(all_actions==i)[0]
      if len(idx) == 0:
        continue

      ## Formatting to get experiences input ready
      model_input = np.array([self._stack_frames(memory_box[j][:4]) for j in idx])
      model_target = np.array([memory_box[j][5] for j in idx])

      ## Update the keras model, then refresh its prediction replica
      self.models[i].fit(model_input,model_target)
      self._refresh_replica(i)

  def predict_best(self,states):
    '''
    Take in an experience (four 60x60x1 frames) and return a 1-D float
    array holding the predicted value of each action.
    '''
    # Build the network input once; it is the same for every model.
    model_input = self._stack_frames(states)[np.newaxis]
    # Each model returns a (1, 1) array: pull out the scalar explicitly
    # instead of relying on implicit array-to-scalar conversion, which
    # NumPy has deprecated.
    return np.array([
      float(np.ravel(model.predict(model_input, verbose=0))[0])
      for model in self.untrainable_models
    ])


### Agent

$V = \text{Reward} + \gamma \max_{a} Q(S', a)$




In [None]:
class Q_agent():
  """Epsilon-greedy Q-learning agent with sampled experience replay.

  The agent keeps a sliding stack of the four most recent pre-processed
  frames as its state, chooses actions with an epsilon-greedy policy over
  the value function's predictions, and randomly stores experiences
  [frame0, frame1, frame2, frame3, action, target] in `memory_box`. Once
  `BATCH_SIZE` experiences are collected they are handed to
  `value_function.update` and the box is emptied.
  """

  N_ACTIONS = 3             ## actions are drawn from 0..N_ACTIONS-1
  WARMUP_STEPS = 4          ## random steps used to fill the frame stack
  SAMPLE_PROBABILITY = 0.1  ## chance that a step is stored for replay
  BATCH_SIZE = 32           ## stored experiences that trigger an update
  MAX_CROSSINGS = 5         ## rewarded steps after which an episode ends
  MAX_STEPS = 1000          ## step budget of an episode

  def __init__(self, environment,value_func, gamma = 0.9, epsilon = 0.05, preprocess = None):
    '''
    initialise hyperparameters

    input (environment, value_function_approximator, gamma, epsilon)

    environment - gym-style environment (reset/step/close); when None a
                  "Freeway-v0" environment is created.
    value_func  - object providing init_models, predict_best and update.
    gamma       - discount factor applied to the bootstrapped value.
    epsilon     - probability of taking a random action.
    preprocess  - callable applied to every raw observation; defaults to
                  grayscale_resize.
    '''
    ## Set the hyperparameters
    self.epsilon = epsilon
    self.gamma = gamma
    self.value_function = value_func
    self.preprocess = grayscale_resize if preprocess is None else preprocess

    ## Use the environment handed in by the caller instead of creating
    ## (and never closing) a second one.
    self.environment = environment if environment is not None else gym.make("Freeway-v0")

    ##Initialise the function approximators
    state = self.preprocess(self.environment.reset())
    self.value_function.init_models([state,state,state,state])

    ## Initialise the memory where the sampled experiences are stored
    self.memory_box = []

  def run_episode(self,ep = 2000):
    '''
    Play one episode and return its cumulative (reshaped) reward.

    ep - episode index; accepted for the caller's convenience, not used.

    The episode ends when the environment reports termination, after
    MAX_CROSSINGS rewarded steps, or once the step budget is exhausted.
    '''
    ##initialise the environment
    state = self.preprocess(self.environment.reset())

    ## state compilations are the experiences
    state_compilations = np.array([state,state,state,state])

    ## First WARMUP_STEPS frames of an episode are not used for prediction or updates
    initial = 0
    cumulative_return = 0
    ## end counts the crossings, count the steps taken so far
    end = 0
    count = 0
    terminal = False

    while not terminal:

      ## Choosing the action: random while the frame stack is still being
      ## filled and epsilon % of the time afterwards, greedy otherwise
      warmed_up = initial >= self.WARMUP_STEPS
      if not warmed_up or np.random.uniform()<self.epsilon:
        action = np.random.choice(self.N_ACTIONS)
      else:
        state_action_values = self.value_function.predict_best(state_compilations)
        action = np.argmax(state_action_values)

      ## Remember the state the action was chosen in: the replay target
      ## belongs to this state, not to the one reached after the step.
      previous_frames = state_compilations

      ## Taking the action in the environment
      obs, reward, done, _ = self.environment.step(action)

      ## Altered the rewards: -1 for an unrewarded step, 100 for a crossing
      if reward == 0:
        reward = -1
      else:
        reward = 100
        end += 1
      cumulative_return+=reward

      ##adding the observation to our temporary experience memory
      obs = self.preprocess(obs)
      if warmed_up:
        ## A fresh array is built every step, so frames already stored in
        ## the memory box are never overwritten by later observations.
        state_compilations = np.concatenate((state_compilations[1:], obs[np.newaxis]))
      else:
        state_compilations[initial] = obs
        initial+=1

      ## Taking a sample for replay experience SAMPLE_PROBABILITY of the time
      if warmed_up and np.random.uniform()<self.SAMPLE_PROBABILITY:
        if done:
          ## nothing follows a terminal step, so there is no value to bootstrap
          V = reward
        else:
          ## V = reward + gamma * max_a Q(S', a)
          V = reward + self.gamma*np.max(self.value_function.predict_best(state_compilations))
        self.memory_box.append([previous_frames[0],previous_frames[1],previous_frames[2],previous_frames[3],action,V])

        ## if the memory box has enough samples then do an update
        if len(self.memory_box) >= self.BATCH_SIZE:
          self.value_function.update(self.memory_box)
          self.memory_box = []

      ## For early termination
      terminal = done or end >= self.MAX_CROSSINGS or count > self.MAX_STEPS
      count+=1

    return cumulative_return

  ## Run a simulation
  def run_simulation(self, n_episodes=2):
    '''
    Run n_episodes episodes back to back.

    Returns (episodic_rewards, times): the cumulative reward and the
    wall-clock duration in seconds of every episode. Both arrays are also
    stored on the agent.
    '''
    self.episodic_rewards = np.zeros(n_episodes)
    self.times = np.zeros(n_episodes)

    for ep in range(n_episodes):
      start_time = time.time()
      self.episodic_rewards[ep] = self.run_episode(ep)
      self.times[ep] = time.time() - start_time

    return self.episodic_rewards,self.times

  def close(self):
    '''Close the underlying environment once the agent is no longer needed.'''
    self.environment.close()


In [None]:
## Build the agent: a Freeway environment plus a fresh value function
freeway_env = gym.make("Freeway-v0")
q_ag = Q_agent(freeway_env, value_function())

## Time the whole training run (agent construction is not included)
base_start_time = time.time()

## Train for 75 episodes, keeping per-episode rewards and durations
episodic_rewards, time_list = q_ag.run_simulation(n_episodes=75)

elapsed_seconds = time.time() - base_start_time
print("--- %s seconds ---" % elapsed_seconds)

state action value: [ 0.86908972 -1.70917666  1.40470862]
state action value: [ 0.8838442  -1.72082281  1.41324961]
state action value: [ 0.88593596 -1.72375035  1.4165839 ]
state action value: [ 0.88692456 -1.72196805  1.41555119]
state action value: [ 0.87754732 -1.71324265  1.40702248]
state action value: [ 0.88763243 -1.7206738   1.41852558]
state action value: [ 0.88198644 -1.72149849  1.41450953]
state action value: [ 0.89082551 -1.72452903  1.41651821]
INFO:tensorflow:Assets written to: model0/assets
INFO:tensorflow:Assets written to: model1/assets
INFO:tensorflow:Assets written to: model2/assets
state action value: [ 0.44418833 -1.14612627  0.52087927]
state action value: [ 0.4426547  -1.15056312  0.5232228 ]
state action value: [ 0.44144359 -1.15191913  0.52299297]
state action value: [ 0.4491834  -1.15501249  0.52332515]
INFO:tensorflow:Assets written to: model0/assets
INFO:tensorflow:Assets written to: model1/assets
INFO:tensorflow:Assets written to: model2/assets
state acti

KeyboardInterrupt: ignored