## **DQN**

In [None]:
# Refresh the apt package index, then install the native build toolchain
# (compiler, autotools, pkg-config, flex/bison, bzip2 headers) together with
# the Python headers, pip, NumPy and git.
# NOTE(review): presumably these are the build prerequisites for the `nle`
# package installed in a later cell -- confirm against the NLE install docs.
!sudo apt update
!sudo apt install -y build-essential autoconf libtool pkg-config python3-dev \
    python3-pip python3-numpy git flex bison libbz2-dev



In [None]:
# Register the Kitware apt repository (signing key + source line) and install
# CMake and the Kitware keyring package from it.
# Both halves of the `&&` chain run under sudo so the install step also works
# when the notebook user is not root.
# NOTE(review): the source line targets the Ubuntu "bionic" release -- confirm
# it matches the release of the runtime this notebook is executed on.
!wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | sudo apt-key add -
!sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
!sudo apt-get update && sudo apt-get --allow-unauthenticated install -y \
    cmake \
    kitware-archive-keyring

In [None]:
# Delete the cmake executable that is currently first on PATH, then print the
# version of whichever cmake is resolved afterwards.
# NOTE(review): presumably this removes a shadowing cmake so that the one
# installed from the Kitware repository is picked up -- confirm.
# Not idempotent: every re-run deletes the next cmake found on PATH.
!sudo rm $(which cmake)
!$(which cmake) --version

In [None]:
# Install (or upgrade, -U) the `nle` package with verbose pip output (-v).
# NOTE(review): the version is not pinned, so the resolved release depends on
# when this cell is run.
!pip3 install -Uv nle

In [None]:
# Install sox, ffmpeg and the cairo library plus its development headers.
# -y pre-answers apt's confirmation prompt, which a notebook cell cannot
# answer interactively.
!apt-get install -y sox ffmpeg libcairo2 libcairo2-dev

In [None]:
# Install the remaining Python packages: manimlib, pygame, opencv-python and
# minihack.
# NOTE(review): versions are not pinned, so the resolved releases depend on
# when this cell is run.
!pip install manimlib pygame opencv-python minihack

In [1]:
#Appropriate imports needed
import tensorflow as tf
import numpy as np
import gym
from nle import nethack
import minihack
import math
from PIL import Image
import pygame, sys
from pygame.locals import *
from tensorflow import keras
import cv2
import random
from gym import spaces
from collections import deque
import torch
from torch import nn, optim,tensor, from_numpy, zeros, no_grad, device, cuda, save
import torch.nn.functional as F

# Switch off OpenCV's OpenCL usage for this process.
cv2.ocl.setUseOpenCL(False)

# Pick the compute device once: CUDA when a GPU is available, otherwise the CPU.
# This rebinds the imported name `device` to a concrete torch.device instance,
# which the rest of the notebook passes to `.to(device)`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Optional, Google Colab only: mount Google Drive at /content/drive so that
# outputs can be recorded there.
# NOTE(review): the import only resolves inside a Colab runtime; skip this cell
# elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Replay Buffer

In [3]:
#from template in DQN lab
class ReplayBuffer:
    """
    Fixed-capacity ring buffer of environment transitions for experience replay.

    Each entry is a ``(state, action, reward, next_state, done)`` tuple. Once
    ``size`` transitions have been stored, every new transition overwrites the
    oldest one.
    """

    def __init__(self, size):
        """
        Initialise a buffer of a given size for storing transitions
        :param size: the maximum number of transitions that can be stored
        """
        self._storage = []
        self._maxsize = size
        self._next_idx = 0  # slot the next transition will be written to

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self._storage)

    def add(self, state, action, reward, next_state, done):
        """
        Add a transition to the buffer. Old transitions will be overwritten if the buffer is full.
        :param state: the agent's initial state
        :param action: the action taken by the agent
        :param reward: the reward the agent received
        :param next_state: the subsequent state
        :param done: whether the episode terminated
        """
        data = (state, action, reward, next_state, done)

        if self._next_idx >= len(self._storage):
            # Still filling up: grow the list.
            self._storage.append(data)
        else:
            # Full: overwrite the oldest slot.
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, indices):
        """
        Gather the transitions at ``indices`` into five stacked arrays.
        :param indices: iterable of positions in the internal storage
        :return: tuple ``(states, actions, rewards, next_states, dones)`` of numpy arrays
        """
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in indices:
            state, action, reward, next_state, done = self._storage[i]
            # asarray avoids a copy when possible and, unlike
            # np.array(..., copy=False), never raises when a copy is required.
            states.append(np.asarray(state))
            actions.append(action)
            rewards.append(reward)
            next_states.append(np.asarray(next_state))
            dones.append(done)
        return (
            np.array(states),
            np.array(actions),
            np.array(rewards),
            np.array(next_states),
            np.array(dones),
        )

    def sample(self, batch_size):
        """
        Randomly sample a batch of transitions from the buffer (with replacement).
        :param batch_size: the number of transitions to sample
        :return: a mini-batch of sampled transitions
        :raises ValueError: if the buffer is empty
        """
        if not self._storage:
            raise ValueError("Cannot sample from an empty replay buffer")
        # randint's upper bound is exclusive, so passing len(storage) makes
        # every stored transition (including the newest) eligible.
        indices = np.random.randint(0, len(self._storage), size=batch_size)
        return self._encode_sample(indices)

DQN Agent

In [4]:
from torch.nn import (BatchNorm2d, Conv2d, CrossEntropyLoss, Dropout, Linear,
                      MaxPool2d, Module, ReLU, Sequential, Softmax)
class DQNAgent:
    """
    (Double) DQN agent operating on flattened 9x9 glyph crops (81 features).

    Holds an online Q-network, a target Q-network with the same architecture,
    an Adam optimiser for the online network, and a reference to the replay
    buffer that mini-batches are drawn from.
    """

    def __init__(
        self,
        observation_space: "spaces.Box",
        action_space: "spaces.Discrete",
        replay_buffer: "ReplayBuffer",
        use_double_dqn,
        lr,
        batch_size,
        gamma,
    ):
        """
        Initialise the DQN algorithm using the Adam optimiser
        :param observation_space: the state space of the environment (passed
            through to the network constructor, which does not use it)
        :param action_space: the action space of the environment; ``action_space.n``
            is the number of network outputs
        :param replay_buffer: storage for experience replay
        :param use_double_dqn: if truthy, use the double-DQN target
        :param lr: the learning rate for Adam
        :param batch_size: the batch size
        :param gamma: the discount factor
        """

        class DQN(nn.Module):  # adapted from https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html
            """Four-layer MLP mapping an 81-feature state to one Q-value per action."""

            def __init__(self, n_observations, n_actions):
                # n_observations is accepted for signature compatibility but unused:
                # the input width is fixed at 81 (glyphs are cropped to a 9x9 area).
                super().__init__()
                self.fc1 = nn.Linear(in_features=81, out_features=64)
                self.fc2 = nn.Linear(in_features=64, out_features=32)
                self.fc3 = nn.Linear(in_features=32, out_features=16)
                self.fc4 = nn.Linear(in_features=16, out_features=n_actions)

            def forward(self, x):
                x = F.relu(self.fc1(x))
                x = F.relu(self.fc2(x))
                x = F.relu(self.fc3(x))
                return self.fc4(x)  # raw Q-values, no activation

        self.q_network = DQN(observation_space, action_space.n).to(device)
        self.target_network = DQN(observation_space, action_space.n).to(device)
        # Start both networks from identical weights so that the first TD targets
        # are not produced by an unrelated, randomly initialised network.
        self.target_network.load_state_dict(self.q_network.state_dict())

        # initialise
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.gamma = gamma
        self.use_double_dqn = use_double_dqn

    def optimise_td_loss(self):
        """
        Optimise the TD-error over a single minibatch of transitions
        sampled from the replay buffer.
        :return: the loss (Huber / smooth-L1) as a Python float
        """
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(self.batch_size)

        # Same /255.0 scaling that act() applies; the two must stay in sync.
        states = from_numpy(np.array(states) / 255.0).float().to(device)
        next_states = from_numpy(np.array(next_states) / 255.0).float().to(device)
        actions = from_numpy(np.array(actions).astype(int)).long().to(device)
        rewards = from_numpy(np.asarray(rewards)).float().to(device)
        dones = from_numpy(np.asarray(dones)).float().to(device)

        with no_grad():  # targets are constants: no gradient tracking needed
            if self.use_double_dqn:
                # Double DQN: the online network picks the action, the target network scores it.
                best_next_actions = self.q_network(next_states).max(1)[1]
                next_q = self.target_network(next_states).gather(
                    1, best_next_actions.unsqueeze(1)
                ).squeeze(1)
            else:
                next_q = self.target_network(next_states).max(1)[0]
            # `dones` is used as a float mask so terminal transitions do not bootstrap.
            target_q_values = rewards + (1 - dones) * self.gamma * next_q

        # squeeze(1) (not a bare squeeze) keeps the batch dimension when batch_size == 1.
        predicted_q = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Compute Huber loss
        criterion = nn.SmoothL1Loss()
        loss = criterion(predicted_q, target_q_values)

        # optimising params
        self.optimizer.zero_grad()
        loss.backward()
        # Clip every gradient element to [-1, 1] to guard against exploding gradients.
        nn.utils.clip_grad_value_(self.q_network.parameters(), 1.0)
        self.optimizer.step()

        return loss.item()

    def update_target_network(self):
        """
        Update the target Q-network by copying the weights from the current Q-network
        """
        self.target_network.load_state_dict(self.q_network.state_dict())

    def act(self, state: np.ndarray):
        """
        Select an action greedily from the Q-network given the state
        :param state: the current state (a single, un-batched observation)
        :return: the action to take, as a Python int
        """
        state = from_numpy(np.array(state) / 255.0).float()
        state = state.unsqueeze(0).to(device)  # add the batch dimension
        with no_grad():
            return self.q_network(state).max(1)[1].item()

    def save_model(self, path=""):
        """
        Save the weights of both networks.
        :param path: prefix prepended verbatim to the two fixed file names
            (include a trailing separator for a directory); defaults to the
            current working directory
        """
        save(self.q_network.state_dict(), path + "maze_final5_dqn_q_weights_final.pth")
        save(self.target_network.state_dict(), path + "maze_final5_dqn_target_weights_final.pth")


In [5]:
#function to train the DQN

def TrainDQN(env, agent, max_timesteps, hyper_params, replay_buffer, save_path=""):
    """
    Run the epsilon-greedy DQN training loop and save the agent's weights at the end.

    :param env: environment whose observations are dicts containing a
        'glyphs_crop' array; it is flattened before being stored or fed to the agent
    :param agent: object providing act(state), optimise_td_loss(),
        update_target_network() and save_model(path)
    :param max_timesteps: number of environment steps to run
    :param hyper_params: dict with the keys "eps-fraction", "num-steps",
        "eps-start", "eps-end", "learning-starts", "learning-freq",
        "target-update-freq" and "print-freq"
    :param replay_buffer: buffer that receives every transition via add(...)
    :param save_path: prefix handed to agent.save_model(); the file names
        themselves are chosen by the agent
    :return: tuple ``(rewards_arr, ave_rewards_arr, loss_arr)`` with one entry
        per finished episode: the episode return, the mean return over the last
        100 finished episodes, and the most recent loss value
    """
    # Number of steps over which epsilon is annealed linearly.
    eps_timesteps = hyper_params["eps-fraction"] * float(hyper_params["num-steps"])
    episode_rewards = [0.0]  # last entry is the episode currently in progress
    loss_arr = []
    loss = 0  # stays 0 until the first optimisation step has run
    rewards_arr = []
    ave_rewards_arr = []
    mean_100ep_reward = 0.0

    state = env.reset()['glyphs_crop'].flatten()  # flatten 9x9
    for t in range(max_timesteps):
        # Guard against eps-fraction == 0: treat annealing as already finished.
        fraction = min(1.0, float(t) / eps_timesteps) if eps_timesteps > 0 else 1.0
        eps_threshold = hyper_params["eps-start"] + fraction * (
            hyper_params["eps-end"] - hyper_params["eps-start"]
        )

        # Epsilon-greedy action selection.
        sample = random.random()
        if sample <= eps_threshold:
            action = env.action_space.sample()
        else:
            action = agent.act(state)

        next_s, reward, done, _ = env.step(action)
        next_state = next_s['glyphs_crop'].flatten()
        replay_buffer.add(state, action, reward, next_state, float(done))
        episode_rewards[-1] += reward
        state = next_state

        if done:
            state = env.reset()['glyphs_crop'].flatten()

            # Mean over the last 100 *finished* episodes, including the one that
            # just ended (the slot for the next episode is appended below).
            mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)

            # Per-episode metrics; they are returned so the caller can persist them.
            rewards_arr.append(episode_rewards[-1])
            ave_rewards_arr.append(mean_100ep_reward)
            loss_arr.append(loss)

            episode_rewards.append(0.0)

        if (
            t > hyper_params["learning-starts"]
            and t % hyper_params["learning-freq"] == 0
        ):
            loss = agent.optimise_td_loss()

        if (
            t > hyper_params["learning-starts"]
            and t % hyper_params["target-update-freq"] == 0
        ):
            agent.update_target_network()

        num_episodes = len(episode_rewards)

        if (
            done
            and hyper_params["print-freq"] is not None
            and num_episodes % hyper_params["print-freq"] == 0
        ):
            print("********************************************************")
            print("steps: {}".format(t))
            print("episodes: {}".format(num_episodes))
            print("mean 100 episode reward: {}".format(mean_100ep_reward))
            print("% time spent exploring: {}".format(int(100 * eps_threshold)))
            print("********************************************************")

    # save_model needs a path prefix; calling it without one raised a TypeError
    # only after the whole training run had finished.
    agent.save_model(save_path)
    return rewards_arr, ave_rewards_arr, loss_arr


Sub-Task 1: MiniHack-Room-5x5-v0

In [7]:
def RunDQN1(seed=42):
    """
    Train a double-DQN agent on MiniHack-Room-5x5-v0 for 500000 steps.

    The action set is restricted to the compass directions. The environment is
    always closed when the run ends, including when training raises.

    :param seed: seed applied to numpy, `random` and the environment
    :return: whatever TrainDQN returns
    """
    hyper_params = {
        "seed": seed,  # which seed to use
        "env": "MiniHack-Room-5x5-v0",  # name of the game
        "replay-buffer-size": int(5e4),  # replay buffer size
        "learning-rate": 1e-3,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": 500000,  # total number of steps to run the environment for
        "batch-size": 64,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 2,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 100,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.3,  # fraction of num-steps
        "print-freq": 10,
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # Actions restricted to the ones sufficient for this environment.
    actions = tuple(nethack.CompassDirection)

    env = gym.make(
        hyper_params["env"],
        observation_keys=("glyphs", "glyphs_crop", "chars", "colors", "pixel", "message", "blstats", "pixel_crop"),
        actions=actions,
    )
    try:
        env.seed(hyper_params["seed"])

        replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

        print(device)

        agent = DQNAgent(
            observation_space=env.observation_space,
            action_space=env.action_space,
            replay_buffer=replay_buffer,
            use_double_dqn=hyper_params["use-double-dqn"],
            lr=hyper_params['learning-rate'],
            batch_size=hyper_params['batch-size'],
            gamma=hyper_params['discount-factor'],
        )

        # Train the agent
        return TrainDQN(
            env,
            agent,
            max_timesteps=hyper_params['num-steps'],
            hyper_params=hyper_params,
            replay_buffer=replay_buffer,
        )
    finally:
        # Release the environment; this function is called repeatedly in a loop.
        env.close()

In [None]:
# Repeat the full training run on this environment several times.
num_runs = 5
for i in range(num_runs):
    print(f"run: {i}")
    RunDQN1()

Sub-Task 2: MiniHack-MazeWalk-9x9-v0

In [None]:
def RunDQN2(seed=42):
    """
    Train a double-DQN agent on MiniHack-MazeWalk-9x9-v0 for 500000 steps.

    The action set is restricted to the compass directions. The environment is
    always closed when the run ends, including when training raises.

    :param seed: seed applied to numpy, `random` and the environment
    :return: whatever TrainDQN returns
    """
    hyper_params = {
        "seed": seed,  # which seed to use
        "env": "MiniHack-MazeWalk-9x9-v0",  # name of the game
        "replay-buffer-size": int(5e4),  # replay buffer size
        "learning-rate": 1e-3,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": 500000,  # total number of steps to run the environment for
        "batch-size": 64,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 2,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 100,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.3,  # fraction of num-steps
        "print-freq": 10,
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # Actions restricted to the ones sufficient for this environment.
    actions = tuple(nethack.CompassDirection)

    env = gym.make(
        hyper_params["env"],
        observation_keys=("glyphs", "glyphs_crop", "chars", "colors", "pixel", "message", "blstats", "pixel_crop"),
        actions=actions,
    )
    try:
        env.seed(hyper_params["seed"])

        replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

        print(device)

        agent = DQNAgent(
            observation_space=env.observation_space,
            action_space=env.action_space,
            replay_buffer=replay_buffer,
            use_double_dqn=hyper_params["use-double-dqn"],
            lr=hyper_params['learning-rate'],
            batch_size=hyper_params['batch-size'],
            gamma=hyper_params['discount-factor'],
        )

        # Train agent
        return TrainDQN(
            env,
            agent,
            max_timesteps=hyper_params['num-steps'],
            hyper_params=hyper_params,
            replay_buffer=replay_buffer,
        )
    finally:
        # Release the environment; this function is called repeatedly in a loop.
        env.close()

In [None]:
# Repeat the full training run on this environment several times.
num_runs = 5
for i in range(num_runs):
    print(f"run: {i}")
    RunDQN2()

Sub-Task 3: MiniHack-LockedDoor-v0

In [None]:
def RunDQN3(seed=42):
    """
    Train a double-DQN agent on MiniHack-LockedDoor-v0 for 500000 steps.

    The action set is the compass directions plus KICK. The environment is
    always closed when the run ends, including when training raises.

    :param seed: seed applied to numpy, `random` and the environment
    :return: whatever TrainDQN returns
    """
    hyper_params = {
        "seed": seed,  # which seed to use
        "env": "MiniHack-LockedDoor-v0",  # name of the game
        "replay-buffer-size": int(5e4),  # replay buffer size
        "learning-rate": 1e-3,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": 500000,  # total number of steps to run the environment for
        "batch-size": 64,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 2,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 100,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.3,  # fraction of num-steps
        "print-freq": 10,
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # Actions restricted to the ones sufficient for this environment.
    actions = tuple(nethack.CompassDirection)
    actions += (nethack.Command.KICK,)

    env = gym.make(
        hyper_params["env"],
        observation_keys=("glyphs", "glyphs_crop", "chars", "colors", "pixel", "message", "blstats", "pixel_crop"),
        actions=actions,
    )
    try:
        env.seed(hyper_params["seed"])

        replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

        print(device)

        agent = DQNAgent(
            observation_space=env.observation_space,
            action_space=env.action_space,
            replay_buffer=replay_buffer,
            use_double_dqn=hyper_params["use-double-dqn"],
            lr=hyper_params['learning-rate'],
            batch_size=hyper_params['batch-size'],
            gamma=hyper_params['discount-factor'],
        )

        # Train agent
        return TrainDQN(
            env,
            agent,
            max_timesteps=hyper_params['num-steps'],
            hyper_params=hyper_params,
            replay_buffer=replay_buffer,
        )
    finally:
        # Release the environment; this function is called repeatedly in a loop.
        env.close()

In [None]:
# Repeat the full training run on this environment several times.
num_runs = 5
for i in range(num_runs):
    print(f"run: {i}")
    RunDQN3()

Main Task: MiniHack-Quest-Hard-v0

In [None]:
def RunDQN4(seed=42):
    """
    Train a double-DQN agent on MiniHack-Quest-Hard-v0 for 500000 steps.

    The action set is the compass directions plus a fixed list of item and
    interaction commands. The environment is always closed when the run ends,
    including when training raises.

    :param seed: seed applied to numpy, `random` and the environment
    :return: whatever TrainDQN returns
    """
    hyper_params = {
        "seed": seed,  # which seed to use
        "env": "MiniHack-Quest-Hard-v0",  # name of the game
        "replay-buffer-size": int(5e4),  # replay buffer size
        "learning-rate": 1e-3,  # learning rate for Adam optimizer
        "discount-factor": 0.99,  # discount factor
        "num-steps": 500000,  # total number of steps to run the environment for
        "batch-size": 64,  # number of transitions to optimize at the same time
        "learning-starts": 5000,  # number of steps before learning starts
        "learning-freq": 2,  # number of iterations between every optimization step
        "use-double-dqn": True,  # use double deep Q-learning
        "target-update-freq": 100,  # number of iterations between every target network update
        "eps-start": 1.0,  # e-greedy start threshold
        "eps-end": 0.01,  # e-greedy end threshold
        "eps-fraction": 0.3,  # fraction of num-steps
        "print-freq": 10,
    }

    np.random.seed(hyper_params["seed"])
    random.seed(hyper_params["seed"])

    # Actions restricted to the ones sufficient for this environment.
    actions = tuple(nethack.CompassDirection)
    actions = actions + (
        nethack.Command.PICKUP,
        nethack.Command.APPLY,
        nethack.Command.FIRE,
        nethack.Command.RUSH,
        nethack.Command.ZAP,
        nethack.Command.PUTON,
        nethack.Command.READ,
        nethack.Command.WEAR,
        nethack.Command.QUAFF,
        nethack.Command.PRAY,
        nethack.Command.KICK,
    )

    env = gym.make(
        hyper_params["env"],
        observation_keys=("glyphs", "glyphs_crop", "chars", "colors", "pixel", "message", "blstats", "pixel_crop"),
        actions=actions,
    )
    try:
        env.seed(hyper_params["seed"])

        replay_buffer = ReplayBuffer(hyper_params["replay-buffer-size"])

        print(device)

        agent = DQNAgent(
            observation_space=env.observation_space,
            action_space=env.action_space,
            replay_buffer=replay_buffer,
            use_double_dqn=hyper_params["use-double-dqn"],
            lr=hyper_params['learning-rate'],
            batch_size=hyper_params['batch-size'],
            gamma=hyper_params['discount-factor'],
        )

        # Train agent
        return TrainDQN(
            env,
            agent,
            max_timesteps=hyper_params['num-steps'],
            hyper_params=hyper_params,
            replay_buffer=replay_buffer,
        )
    finally:
        # Release the environment; this function is called repeatedly in a loop.
        env.close()

In [None]:
# Repeat the full training run on this environment several times.
num_runs = 5
for i in range(num_runs):
    print(f"run: {i}")
    RunDQN4()