# LIBRARIES

In [2]:
# Install Gymnasium plus the extras requested below: the Atari extra (with ROM
# licence acceptance) and the Box2D extra.
# `swig` is installed right before the Box2D extra — presumably it is a build
# requirement of that extra; TODO confirm.
# NOTE(review): the recorded output of this cell shows `apt-get` failing with
# "Permission denied ... are you root?", so the swig step did not succeed on the
# machine that produced it; it needs root privileges or a pre-installed swig.
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [1]:
import os, sys
import random
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

# BUILD AI

In [2]:
class Network(nn.Module):
    '''
    Fully connected network mapping a state vector to one output per action.

    Layout: state_size -> 64 -> 64 -> action_size, with ReLU after each of the
    two hidden layers and no activation on the output layer.
    '''

    def __init__(self, state_size, action_size, seed=42):
        '''
        Build the three linear layers.

        state_size:  number of input features.
        action_size: number of outputs.
        seed:        value passed to torch.manual_seed before the layers are
                     created; the returned generator is kept on `self.seed`.
        '''
        super().__init__()
        # Seeding happens before any layer is constructed, so the layer
        # initialisation draws from the freshly seeded global RNG.
        self.seed = torch.manual_seed(seed)

        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        '''
        Run `state` through both hidden layers (ReLU-activated) and return the
        raw output of the final linear layer.
        '''
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

# TRAIN AI

## Setup Environment

In [3]:
import gymnasium as gym

# Create the environment and read the dimensions the agent is built from.
env = gym.make('LunarLander-v3')

state_shape = env.observation_space.shape  # full shape tuple of one observation
state_size = state_shape[0]                # length of the observation vector
no_actions = env.action_space.n            # size of the discrete action set

# Report the values so the reader can check them against the recorded output.
print(f'State shape: {state_shape}')
print(f'State size: {state_size}')
print(f'Number of Actions: {no_actions}')

State shape: (8,)
State size: 8
Number of Actions: 4


## Hyperparameters

In [4]:
# Step size handed to the Adam optimiser when the agent is built.
learning_rate = 0.0005

# Replay memory must hold more than this many events before the agent learns.
minibatch_size = 100

# Weight applied to the target network's next-state value in the Q-target.
discount_factor = 0.99

# Maximum number of events kept in the replay memory.
# Remember to change this to int(1e5) when run on Google Colab
replay_buffer_size = 20_000

# Fraction of the local network blended into the target network per soft update.
interpolation_param = 0.001

## Experience Replay

In [5]:
class ReplayMemory(object):
    '''
    Bounded FIFO store of events with uniform random minibatch sampling.

    Each event is indexed positionally by `sample` as
    (state, action, reward, next_state, done).
    '''

    def __init__(self, capacity):
        '''
        capacity: maximum number of events retained; older ones are discarded.
        '''
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.capacity = capacity
        self.memory = []

    def push(self, event):
        '''
        Append one event, evicting the oldest entry if capacity is exceeded.
        '''
        self.memory.append(event)

        # At most one element over capacity after a single append, so a single
        # removal from the front restores the bound.
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def _column(self, batch, index):
        '''
        Stack field `index` of every non-None event in `batch` row-wise.
        '''
        return np.vstack([event[index] for event in batch if event is not None])

    def _to_float_tensor(self, array):
        '''
        Convert a numpy array to a float tensor on this memory's device.
        '''
        return torch.from_numpy(array).float().to(self.device)

    def sample(self, batch_size):
        '''
        Draw `batch_size` distinct events at random and return them as tensors.

        Returns a tuple in the order
        (states, next_states, actions, rewards, dones) — note that next_states
        comes second. Actions are long tensors; everything else is float.
        random.sample raises ValueError if fewer than `batch_size` events are
        stored.
        '''
        batch = random.sample(self.memory, k=batch_size)

        states = self._to_float_tensor(self._column(batch, 0))
        actions = torch.from_numpy(self._column(batch, 1)).long().to(self.device)
        rewards = self._to_float_tensor(self._column(batch, 2))
        next_states = self._to_float_tensor(self._column(batch, 3))
        # Done flags go through uint8 first so they end up as 0.0 / 1.0 floats.
        dones = self._to_float_tensor(self._column(batch, 4).astype(np.uint8))

        return states, next_states, actions, rewards, dones

## DQN Class

In [6]:
class Agent():
    '''
    Deep Q-Network agent: a local (trained) network, a target network, an Adam
    optimiser over the local network, and a replay memory.

    Reads the notebook-level hyperparameters `learning_rate`, `minibatch_size`,
    `discount_factor`, `replay_buffer_size` and `interpolation_param`.
    '''

    # Number of calls to `step` between learning updates.
    update_every = 4

    def __init__(self, state_size, action_size):
        '''
        state_size:  number of features in a state vector.
        action_size: number of discrete actions.
        '''
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.state_size = state_size
        self.action_size = action_size
        self.local_qnetwork = Network(state_size, action_size).to(self.device)
        self.target_qnetwork = Network(state_size, action_size).to(self.device)
        self.optimiser = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(replay_buffer_size)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        '''
        Store one transition and, every `update_every` calls, learn from a
        random minibatch once the memory holds more than `minibatch_size`
        events.
        '''
        self.memory.push((state, action, reward, next_state, done))
        self.t_step = (self.t_step + 1) % self.update_every

        if self.t_step == 0 and len(self.memory.memory) > minibatch_size:
            # Sample exactly the configured minibatch size so the guard above
            # and the sample size cannot drift apart when the hyperparameter
            # is changed.
            experiences = self.memory.sample(minibatch_size)
            self.learn(experiences, discount_factor)

    def act(self, state, epsilon=.0):
        '''
        Choose an action for `state` with an epsilon-greedy policy.

        state:   numpy array holding a single state (converted with
                 torch.from_numpy).
        epsilon: probability threshold for taking a uniformly random action
                 instead of the greedy one; the default 0.0 is always greedy.

        Returns the index of the chosen action.
        '''
        # unsqueeze(0) adds a leading batch dimension so the single state is
        # passed to the network as a batch of one.
        state = torch.from_numpy(state).float()\
            .unsqueeze(0)\
            .to(self.device)

        # Evaluate without tracking gradients, then restore training mode.
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()

        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, discount_factor):
        '''
        Run one gradient step on the local network from a sampled minibatch,
        then soft-update the target network.

        experiences:     tuple (states, next_states, actions, rewards, dones)
                         in the order produced by ReplayMemory.sample.
        discount_factor: weight applied to the target network's best
                         next-state value.
        '''
        states, next_states, actions, rewards, dones = experiences

        # Highest target-network value per next state, detached so no gradient
        # flows into the target network.
        next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
        # (1 - dones) removes the bootstrapped term where done == 1.
        q_targets = rewards + (discount_factor * next_q_targets * (1 - dones))
        # Local-network value of the action that was actually taken.
        q_expected = self.local_qnetwork(states).gather(1, actions)

        loss = F.mse_loss(q_expected, q_targets)
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

        self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_param)

    def soft_update(self, local_model, target_model, interpolation_param):
        '''
        Blend the local parameters into the target parameters in place:
        target = interpolation_param * local + (1 - interpolation_param) * target.
        '''
        # The blend is pure bookkeeping, so do it without autograd tracking;
        # otherwise every update builds a throwaway graph through the local
        # parameters.
        with torch.no_grad():
            for p_target, p_local in zip(target_model.parameters(), local_model.parameters()):
                blended = interpolation_param * p_local.data + (1.0 - interpolation_param) * p_target.data
                p_target.data.copy_(blended)

## Initialise DQN Agent

# VISUALISATION