# Q-Learning and Deep Q-Networks

In [None]:
# Download Atari ROMS for Breakout
! wget http://www.atarimania.com/roms/Roms.rar
! mkdir /content/ROM/
! unrar e /content/Roms.rar /content/ROM/
! python -m atari_py.import_roms /content/ROM/

## Playing Breakout wth DQN


In [1]:
import torch
from torch import nn, optim
import gym
import random
import numpy as np
import copy
from PIL import Image

# for reproductability
torch.manual_seed(0)
random.seed(0)

print(f'PyTorch: {torch.__version__}')
print(f'AI Gym: {gym.__version__}')
print(f'Numpy: {np.__version__}')


PyTorch: 1.10.0+cu111
AI Gym: 0.17.3
Numpy: 1.21.5


In [2]:
def epsilon_greedy_action(action_distribution, epsilon=1e-5):
    action_distribution = action_distribution.detach().numpy()
    if random.random() < epsilon:
        return np.argmax(np.random.random(
           action_distribution.shape))
    else:
        return np.argmax(action_distribution)

def epsilon_greedy_action_annealed(action_distribution,
                                   percentage, 
                                   epsilon_start=1.0, 
                                   epsilon_end=1e-8):
    action_distribution = action_distribution.detach().numpy()
    annealed_epsilon = epsilon_start*(1.0-percentage) + epsilon_end*percentage
    if random.random() < annealed_epsilon:
        return np.argmax(np.random.random(
          action_distribution.shape))
    else:
        return np.argmax(action_distribution)

In [3]:
class EpisodeHistory(object):

    def __init__(self):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.terminals = []

    def add_to_history(self, state, action, reward, 
      state_prime, terminal):
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.state_primes.append(state_prime)
        self.terminals.append(terminal)

# Build DQN Agent

In [4]:
class DQNAgent(object):

    def __init__(self, num_actions,
                 learning_rate=1e-3, history_length=4,
                 screen_height=84, screen_width=84, 
                 gamma=0.99):
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.history_length = history_length
        self.screen_height = screen_height
        self.screen_width = screen_width
        self.gamma = gamma

        self.build_prediction_network()
        self.build_target_network()
        #self.build_training()
    
    def build_prediction_network(self):
        self.model_predict = nn.Sequential(
          nn.Conv2d(4, 32, kernel_size=8 , stride=4),
          nn.Conv2d(32, 64, kernel_size=4, stride=2),
          nn.Conv2d(64, 64, kernel_size=3, stride=1),
          nn.Flatten(),
          nn.Linear(3136, 512),
          nn.Linear(512, self.num_actions)
          )

    def build_target_network(self):
        self.model_target = nn.Sequential(
          nn.Conv2d(4, 32, kernel_size=8 , stride=4),
          nn.Conv2d(32, 64, kernel_size=4, stride=2),
          nn.Conv2d(64, 64, kernel_size=3, stride=1),
          nn.Flatten(),
          nn.Linear(3136, 512),
          nn.Linear(512, self.num_actions)
          )

    def sample_and_train_pred(self, replay_table, batch_size):

        s_t, action, reward, s_t_plus_1, terminal = replay_table.sample_batch(
              batch_size)

        # given state_t, find q_t (predict_model) and q_t+1 (target_model)
        # do it in batches
        # Find q_t_plus_1
        input_t = torch.from_numpy(s_t_plus_1).float()
        model_t = self.model_target.float()
        q_t_plus_1 = model_t(input_t)
        
        terminal = torch.tensor(terminal).float()
        max_q_t_plus_1, _ = torch.max(q_t_plus_1, dim=1)
        reward = torch.from_numpy(reward).float()
        target_q_t = (1. - terminal) * self.gamma * max_q_t_plus_1 + reward

        # Find q_t, and q_of_action
        input_p = torch.from_numpy(s_t).float()
        model_p = self.model_predict.float()
        q_t = model_p(input_p)
        action = torch.from_numpy(action)
        action_one_hot = nn.functional.one_hot(action, 
                                               self.num_actions)
        q_of_action = torch.sum(q_t * action_one_hot)

        # Compute loss
        self.delta = (target_q_t - q_of_action)
        self.loss = torch.mean(self.delta)

        # Update predict_model gradients (only)
        self.optimizer = optim.Adam(self.model_predict.parameters(),
                                    lr = self.learning_rate)
        self.loss.backward()
        self.optimizer.step()

        return q_t

    def predict_action(self, state, epsilon_percentage):
        input_p = torch.from_numpy(state).float().unsqueeze(dim=0)
        model_p = self.model_predict.float()
        action_distribution = model_p(input_p)
        # sample from action distribution
        action = epsilon_greedy_action_annealed(action_distribution.detach(), 
                                                epsilon_percentage)
        return action
     
    def process_state_into_stacked_frames(self, 
                                          frame, 
                                          past_frames, 
                                          past_state=None):
        full_state = np.zeros((self.history_length, 
                              self.screen_width, 
                              self.screen_height))

        if past_state is not None:
            for i in range(len(past_state)-1):
                full_state[i, :, :] = past_state[i+1, :, :]
            full_state[-1, :, :] = self.preprocess_frame(frame, 
                                                        (self.screen_width, 
                                                          self.screen_height)
                                                        )
        else:
            all_frames = past_frames + [frame]
            for i, frame_f in enumerate(all_frames):
                full_state[i, :, :] = self.preprocess_frame(frame_f, 
                                                            (self.screen_width, 
                                                            self.screen_height)
                                                            )
        return full_state

    def to_grayscale(self, x):
        return np.dot(x[...,:3], [0.299, 0.587, 0.114])

    def preprocess_frame(self, im, shape):
        cropped = im[16:201,:] # (185, 160, 3)
        grayscaled = self.to_grayscale(cropped) # (185, 160)
        # resize to (84,84)
        resized = np.array(Image.fromarray(grayscaled).resize(shape))
        mean, std = 40.45, 64.15
        frame = (resized-mean)/std
        return frame

## Implementing Experience Replay

In [5]:
class ExperienceReplayTable(object):

    def __init__(self, table_size=50000):
        self.states = []
        self.actions = []
        self.rewards = []
        self.state_primes = []
        self.terminals = []

        self.table_size = table_size

    def add_episode(self, episode):
        self.states += episode.states
        self.actions += episode.actions
        self.rewards += episode.rewards
        self.state_primes += episode.state_primes
        self.terminals += episode.terminals

        self.purge_old_experiences()

    def purge_old_experiences(self):
        while len(self.states) > self.table_size:
            self.states.pop(0)
            self.actions.pop(0)
            self.rewards.pop(0)
            self.state_primes.pop(0)

    def sample_batch(self, batch_size):
        s_t, action, reward, s_t_plus_1, terminal = [], [], [], [], []
        rands = np.arange(len(self.states))
        np.random.shuffle(rands)
        rands = rands[:batch_size]

        for r_i in rands:
            s_t.append(self.states[r_i])
            action.append(self.actions[r_i])
            reward.append(self.rewards[r_i])
            s_t_plus_1.append(self.state_primes[r_i])
            terminal.append(self.terminals[r_i])
        return (np.array(s_t), np.array(action), np.array(reward), 
                np.array(s_t_plus_1), np.array(terminal))

# Set up Q Learning
- The configuration values below are for illustrative purposes so you can execute the code in Colab. Training takes a long time (days) at realistic values.
- A larger reward value for Breakout may require max_episode_length to be 100000.  That is, you need to be able to play long enough to get a decent reward (score). 

In [6]:
learn_start = 4
total_episodes = 32
epsilon_stop = 32
train_frequency = 2
target_frequency = 4
batch_size = 4
max_episode_length = 1000
env = gym.make('Breakout-v4')
num_actions = env.action_space.n
solved = False

In [7]:
agent = DQNAgent(num_actions=num_actions, 
                 learning_rate=1e-4, 
                 history_length=4,
                 gamma=0.98)

# Train DQN

In [8]:
episode_rewards = []
q_t_list = []
batch_losses = []
past_frames_last_time = None

replay_table = ExperienceReplayTable()
global_step_counter = 0

for i in range(total_episodes):
    # Get initial frame -> state
    frame = env.reset() # np.array of shape (210, 160, 3)
    # past_frames is a list of past 3 frames (np.arrays)
    past_frames = [copy.deepcopy(frame) for _ in range(agent.history_length-1)]
    state = agent.process_state_into_stacked_frames(
        frame, past_frames, past_state=None) # state is (4,84,84)
    
    # initialize episode history (s_t, a, r, s_t+1, terminal)
    episode_reward = 0.0
    episode_history = EpisodeHistory()
    epsilon_percentage = float(min(i/float(epsilon_stop), 1.0))

    for j in range(max_episode_length):
        # predict action or choose random action at first
        if global_step_counter < learn_start:
          action = np.argmax(np.random.random((agent.num_actions)))
        else:
          action = agent.predict_action(state, epsilon_percentage)

        # take action, get next frame (-> next state), reward, and terminal
        reward = 0
        frame_prime, reward, terminal, _ = env.step(action)
        if terminal == True:
          reward -= 1
          
        # get next state from next frame and past frames 
        state_prime = agent.process_state_into_stacked_frames(frame_prime, 
                                                              past_frames, 
                                                              past_state=state)
        # Update past_frames with frame_prime for next time
        past_frames.append(frame_prime)
        past_frames = past_frames[len(past_frames)-agent.history_length:]
        past_frames_last_time = past_frames

        # Add to episode history (state, action, reward, state_prime, terminal)
        episode_history.add_to_history(
                    state, action, reward, state_prime, terminal)
        state = state_prime
        episode_reward += reward
        global_step_counter += 1
        
        #  Do not train predict_model until we have enough 
        #   episodes in episode history
        if global_step_counter > learn_start:
          if global_step_counter % train_frequency == 0:
              if(len(replay_table.actions) != 0):
                q_t = agent.sample_and_train_pred(replay_table, batch_size)
                q_t_list.append(q_t)

                if global_step_counter % target_frequency == 0:
                    agent.model_target.load_state_dict(
                        agent.model_predict.state_dict())

        # If terminal or max episodes reached, 
        #   add episode_history to replay table
        if j == (max_episode_length - 1):
            terminal = True

        if terminal:
            replay_table.add_episode(episode_history)
            episode_rewards.append(episode_reward)
            break
    print(f'Episode[{i}]: {len(episode_history.actions)} \
              actions {episode_reward} reward')

Episode[0]: 306               actions 2.0 reward
Episode[1]: 228               actions 0.0 reward
Episode[2]: 172               actions -1.0 reward
Episode[3]: 231               actions 0.0 reward
Episode[4]: 266               actions 0.0 reward
Episode[5]: 223               actions 0.0 reward
Episode[6]: 280               actions 1.0 reward
Episode[7]: 205               actions -1.0 reward
Episode[8]: 188               actions -1.0 reward
Episode[9]: 388               actions 3.0 reward
Episode[10]: 239               actions 0.0 reward
Episode[11]: 216               actions 0.0 reward
Episode[12]: 230               actions 0.0 reward
Episode[13]: 246               actions 0.0 reward
Episode[14]: 288               actions 1.0 reward
Episode[15]: 183               actions -1.0 reward
Episode[16]: 214               actions -1.0 reward
Episode[17]: 253               actions 1.0 reward
Episode[18]: 259               actions 0.0 reward
Episode[19]: 169               actions -1.0 reward
Epis