<a href="https://colab.research.google.com/github/cypherics/RL/blob/3.2/assignment_3/assignment3_dqn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym[atari,accept-rom-license]==0.25.2
import sys, os
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ale-py~=0.7.5
  Downloading ale_py-0.7.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autorom[accept-rom-license]~=0.4.2
  Downloading AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting AutoROM.accept-rom-license
  Downloading AutoROM.accept-rom-license-0.5.4.tar.gz (12 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting libtorrent
  Using cached libtorrent-2.0.7-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (8.6 MB)
Building wheels for collected packages: AutoROM.accept-rom-license
  Building wheel for AutoROM.accept-rom-license (pyproject.toml) ... [?25l[?25hdone
  Created wheel

In [2]:
import torch
import torchvision
import numpy as np
import random
from gym.spaces import Box
from collections import deque


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for i in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info


class GrayScaleObservation(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        obs_shape = self.observation_space.shape[:2]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        observation = np.transpose(observation, (2, 0, 1))
        observation = torch.tensor(observation.copy(), dtype=torch.float)
        transform = torchvision.transforms.Grayscale()
        observation = transform(observation)
        return observation


class ResizeObservation(gym.ObservationWrapper):
    def __init__(self, env, shape):
        super().__init__(env)
        self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.uint8)

    def observation(self, observation):
        transforms = torchvision.transforms.Compose([torchvision.transforms.Resize(self.shape),
                                                     torchvision.transforms.Normalize(0, 255)])
        return transforms(observation).squeeze(0)


class ExperienceReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def __len__(self):
        return len(self.memory)

    def store(self, state, next_state, action, reward, done):
        state = state.__array__()
        next_state = next_state.__array__()
        self.memory.append((state, next_state, action, reward, done))

    def sample(self, batch_size):
        # TODO: uniformly sample batches of Tensors for: state, next_state, action, reward, done
        # ...


        # uniformly get batch with batch_size
        sampled_batch = random.sample(self.memory, batch_size)

        states = []
        next_states = []
        actions = []
        rewards = []
        dones = []

        # save to arrays
        for (curr_state, next_state, action, reward, done) in sampled_batch:
            states.append(curr_state)
            next_states.append(next_state)
            actions.append(action)
            rewards.append(reward)
            dones.append(done)


        return  torch.tensor(np.array(states)), \
                torch.tensor(np.array(next_states)), \
                torch.tensor(np.array(actions)), \
                torch.tensor(np.array(rewards)), \
                torch.tensor(np.array(dones))


In [29]:
import torch
import gym
import numpy as np
import copy
from gym.wrappers import FrameStack
from sklearn.metrics import mean_squared_error


env_rendering = False    # Set to False while training your model on Colab
testing_mode = True
test_model_directory = '/content/sample_data/ddqn.pth'

# Create and preprocess the Space Invaders environment
if env_rendering:
    env = gym.make("ALE/SpaceInvaders-v5", full_action_space=False, render_mode="human")
else:
    env = gym.make("ALE/SpaceInvaders-v5", full_action_space=False)
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env)
env = ResizeObservation(env, shape=84)
env = FrameStack(env, num_stack=4)
image_stack, h, w = env.observation_space.shape
num_actions = env.action_space.n
print('Number of stacked frames: ', image_stack)
print('Resized observation space dimensionality: ', h, w)
print('Number of available actions by the agent: ', num_actions)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

seed = 61
env.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.backends.cudnn.enabled:
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Hyperparameters (to be modified)
batch_size = 32
alpha = 0.00025
gamma = 0.95
eps, eps_decay, min_eps = 1.0, 0.999, 0.05
buffer = ExperienceReplayMemory(20000)
burn_in_phase = 20000
sync_target = 30000
max_train_frames = 10000
max_train_episodes = 100000
max_test_episodes = 1000
curr_step = 0
learning_rate = 0.001


Number of stacked frames:  4
Resized observation space dimensionality:  84 84
Number of available actions by the agent:  6
cuda


  deprecation(
  deprecation(


In [4]:
import torch.nn as nn

def convert(x):
    return torch.tensor(x.__array__()).float()


class DeepQNet(torch.nn.Module):
    def __init__(self, h, w, image_stack, num_actions):
        super(DeepQNet, self).__init__()
        # TODO: create a convolutional neural network
        # ...

        # self.conv = torch.nn.Sequential(
        #     torch.nn.Conv2d(4, 6, 5),
        #     torch.nn.ReLU(),
        #     torch.nn.MaxPool2d(2,2),
        #     torch.nn.Conv2d(6,16,5),
        #     torch.nn.ReLU(),
        #     torch.nn.MaxPool2d(2,2)
        # )

        # self.out_size = self.get_out(h,w)
        # self.fully_connected_layers = torch.nn.Sequential(
        #     torch.nn.Linear(self.out_size, 128),
        #     torch.nn.Linear(128, 64),
        #     torch.nn.Linear(64, num_actions)
        # )

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(8, 8), stride=4),
            nn.ReLU())
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(4, 4), stride=2),
            nn.ReLU())
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3), stride=1),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=7*7*64, out_features=256),
            nn.ReLU())
        self.fc2 = nn.Linear(in_features=256, out_features=num_actions)

    def get_out(self, h,w):
        out = self.conv(torch.zeros(1, 4, h, w))
        return int(np.prod(out.size()))

    def forward(self, x):
        # TODO: forward pass from the neural network
        # ...
        out1 = self.conv1(x)
        out2 = self.conv2(out1)        
        out3 = self.conv3(out2)
        out4 = self.fc1(out3.view(-1, 7*7*64))        
        out = self.fc2(out4)
        return out




# TODO: create an online and target DQN (Hint: Use copy.deepcopy() and requires_grad utilities!)
# ...
online_dqn = DeepQNet(h,w,image_stack, num_actions)
target_dqn = copy.deepcopy(online_dqn)
online_dqn.to(device)
target_dqn.to(device)


# TODO: create the appropriate MSE criterion and Adam optimizer
# ...
optimizer = torch.optim.Adam(online_dqn.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()


In [27]:
def policy(state, is_training):
    global eps
    state = convert(state).unsqueeze(0).to(device)
    # state = (1, batch, h, w)

    #TODO: Implement an epsilon-greedy policy
    #...
    # state_c = torch.from_numpy(state).float()/255.0
    # state = Variable(state).cuda()
    
    # online_dqn.eval()
    # estimate = online_dqn.forward(state).max(dim=1)
    
    # # with epsilon prob to choose random action else choose argmax Q estimate action
    # if random.random() < self.epsilon:
    #     return random.randint(0, self.action_number-1)
    # else:
    #     return estimate[1].data[0]

    with torch.no_grad():
        if is_training:
            p = online_dqn(state)
            # P = (1, 6)
            if np.random.rand() < eps:
                a = random_action()
            else:
                a = torch.argmax(p, dim=1).tolist()

        else:
            p = online_dqn(state)
            a = torch.argmax(p, dim=1).tolist()

    return convert(np.array(a)).to(device)

def random_action():
    return np.random.randint(0, num_actions)

def compute_loss(state, action, reward, next_state, done):
    state = convert(state).to(device)
    next_state = convert(next_state).to(device)
    action = action.view(-1, 1).to(device)
    reward = reward.view(-1, 1).to(device)
    done = done.view(-1, 1).to(device)

    # TODO: Compute the DQN (or DDQN) loss based on the criterion
    # ...

    # mse loss
    online_dqn.eval()
    target_dqn.eval()

    # action_new = online_dqn.forward(next_state).max(dim=1)[1].cpu().data.view(-1, 1).to(device)
    action_new = torch.argmax(online_dqn.forward(next_state), dim=1).view(-1, 1)
    target = target_dqn.forward(next_state)
    y_target =  torch.gather(target, dim=1, index=action_new)
    y = reward + torch.mul((y_target * (~done)), gamma)


    online_dqn.train()
    Q = (torch.gather(online_dqn.forward(state), dim=1, index=action))

    loss = criterion(input=Q.float(), target=y.float().detach())

    return loss


def run_episode(curr_step, buffer, is_training):
    global eps
    global target_dqn
    global online_dqn
    episode_reward, episode_loss = 0, 0.
    state = env.reset()
    
    for t in range(max_train_frames):
        action = policy(state, is_training)
        curr_step += 1
        next_state, reward, done, _ = env.step(int(action.item()))
        episode_reward += reward

        if is_training:
            buffer.store(state, next_state, int(action.item()), reward, done)

            if curr_step > burn_in_phase:
                state_batch, next_state_batch, action_batch, reward_batch, done_batch = buffer.sample(batch_size)

                if curr_step % sync_target == 0:
                    # TODO: Periodically update your target_dqn at each sync_target frames
                    # ...
                     target_dqn.load_state_dict(online_dqn.state_dict())

                loss = compute_loss(state_batch, action_batch, reward_batch, next_state_batch, done_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                episode_loss += loss.item()

        else:
            with torch.no_grad():
                episode_loss += compute_loss(state, action.type(torch.int64), torch.tensor(np.array(reward)), next_state, torch.tensor(np.array(done))).item()

        state = next_state

        if done:
            break
    

    return dict(reward=episode_reward, loss=episode_loss / t), curr_step


In [6]:
def update_metrics(metrics, episode):
    for k, v in episode.items():
        metrics[k].append(v)


def print_metrics(it, metrics, is_training, window=100):
    reward_mean = np.mean(metrics['reward'][-window:])
    loss_mean = np.mean(metrics['loss'][-window:])
    mode = "train" if is_training else "test"
    print(f"Episode {it:4d} | {mode:5s} | reward {reward_mean:5.5f} | loss {loss_mean:5.5f}")


def save_checkpoint(curr_step, eps, train_metrics):
    save_dict = {'curr_step': curr_step, 
                 'train_metrics': train_metrics, 
                 'eps': eps,
                 'online_dqn': online_dqn.state_dict(), 
                 'target_dqn': target_dqn.state_dict()}
    torch.save(save_dict, test_model_directory)


In [7]:
# TODO: Plot your train_metrics and test_metrics
# ...
def plot_metrics(metrics, window=100):
    reward = metrics['reward'][-window:]
    loss = metrics['loss'][-window:]

    reward = [r for idx, r in enumerate(metrics['reward']) if idx % 50 == 0]
    loss = [r for idx, r in enumerate(metrics['loss']) if idx % 50 == 0]
    epsiodes = np.arange(0, max_train_episodes, 50)

    fig, (ax1, ax2) = plt.subplots(1, 2)
    ax1.plot(epsiodes, reward)
    ax2.plot(epsiodes, loss)

    ax1.set_xlabel("episodes")
    ax2.set_xlabel("episodes")
    ax1.set_ylabel("reward")
    ax2.set_ylabel("loss")

    plt.tight_layout()
    plt.show()

def save_m(inp):
    # load json module
    import json
    import os

    # create json object from dictionary
    json = json.dumps(inp)

    # open file for writing, "w" 
    f = open(os.path.join(r"/content/sample_data", "test_metrics.json"),"w")

    # write json object to file
    f.write(json)

    # close file
    f.close()


In [31]:
metrics = None
if testing_mode:
    # TODO: Load your saved online_dqn model for evaluation
    # ...
    loaded_pth_file = torch.load(test_model_directory)
    online_dqn = DeepQNet(h,w,image_stack,num_actions)
    online_dqn.load_state_dict(loaded_pth_file['online_dqn'])
    online_dqn.cuda()
    test_metrics = dict(reward=[], loss=[])
    for it in range(max_test_episodes):
        episode_metrics, curr_step = run_episode(curr_step, buffer, is_training=False)
        update_metrics(test_metrics, episode_metrics)
        print_metrics(it + 1, test_metrics, is_training=False)
        save_m(test_metrics)

    metrics = test_metrics
else:
    print("Training")
    train_metrics = dict(reward=[], loss=[])
    for it in range(max_train_episodes):
        episode_metrics, curr_step = run_episode(curr_step, buffer, is_training=True)
        update_metrics(train_metrics, episode_metrics)
        if curr_step > burn_in_phase and eps > min_eps:
            eps *= eps_decay
        if it % 50 == 0:
            print_metrics(it, train_metrics, is_training=True)
            save_checkpoint(curr_step, eps, train_metrics)

        # print(f"episode: {it} done!")
        save_m(train_metrics)
    metrics = train_metrics
    # save_m(metrics)

Episode    1 | test  | reward 60.00000 | loss 636.51727
Episode    2 | test  | reward 272.50000 | loss 820.00025
Episode    3 | test  | reward 288.33333 | loss 868.35117
Episode    4 | test  | reward 287.50000 | loss 869.31444
Episode    5 | test  | reward 287.00000 | loss 862.90092
Episode    6 | test  | reward 328.33333 | loss 898.91984
Episode    7 | test  | reward 350.71429 | loss 909.96075
Episode    8 | test  | reward 343.75000 | loss 913.75689
Episode    9 | test  | reward 326.11111 | loss 881.60184
Episode   10 | test  | reward 327.00000 | loss 880.44550
Episode   11 | test  | reward 307.27273 | loss 893.50273
Episode   12 | test  | reward 301.66667 | loss 903.68561
Episode   13 | test  | reward 306.53846 | loss 899.05854
Episode   14 | test  | reward 311.07143 | loss 880.78079
Episode   15 | test  | reward 304.00000 | loss 887.28825
Episode   16 | test  | reward 313.12500 | loss 883.39500
Episode   17 | test  | reward 311.17647 | loss 899.07958
Episode   18 | test  | reward 32

KeyboardInterrupt: ignored

In [None]:
  plot_metrics(metrics)