<a href="https://colab.research.google.com/github/dude123studios/AdvancedDeepLearning/blob/main/DQN_for_Atari.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import gym
import math
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
import sys

In [2]:
# Number of training episodes DQN.train() iterates over (range(EPOCHS)).
EPOCHS = 50
# Mean score over the most recent (up to 5) episodes at which training stops
# early and reports the task as solved. The misspelled name is kept because
# DQN.train() reads it under this spelling.
THRESHHOLD = 10
# When True, DQN.__init__ wraps the environment in gym.wrappers.Monitor,
# writing to the 'recording' directory (force=True).
MONITOR = False

In [3]:
class DQN:
    """Deep Q-Network agent for an image-observation Gym environment.

    The agent converts each raw observation to a square grayscale frame,
    keeps the ``m`` most recent frames stacked along the channel axis as its
    state, and fits a small convolutional Q-network on minibatches drawn
    from a bounded replay memory.

    Attributes set by ``__init__``: ``memory``, ``env``, ``img_size``,
    ``action_size``, ``m``, ``batch_size``, ``gamma``, ``epsilon``,
    ``epsilon_min``, ``epsilon_decay`` and ``model``.
    """

    def __init__(self, env_string, batch_size=128, img_size=84, m=4):
        """Create the environment, replay memory and Q-network.

        Args:
            env_string: Environment id passed to ``gym.make``.
            batch_size: Maximum number of transitions sampled per replay.
            img_size: Side length of the square preprocessed frame.
            m: Number of frames stacked along the channel axis of a state.
        """
        # Bounded replay memory: once full, the oldest transition is dropped.
        self.memory = deque(maxlen=1000)
        self.env = gym.make(env_string)
        self.img_size = img_size
        self.action_size = self.env.action_space.n
        self.m = m

        self.batch_size = batch_size
        # NOTE(review): gamma = 1.0 means future rewards are not discounted;
        # confirm this is intended.
        self.gamma = 1.0
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995

        alpha = 0.01
        alpha_decay = 0.01

        if MONITOR:
            self.env = gym.wrappers.Monitor(self.env, 'recording', force=True)

        # Three convolutional layers followed by a dense head that outputs
        # one Q-value per action.
        self.model = Sequential([
            Conv2D(32, 8, (4, 4), activation='relu', padding='same',
                   input_shape=(img_size, img_size, m)),
            Conv2D(64, 4, (2, 2), activation='relu', padding='valid'),
            Conv2D(64, 3, (1, 1), activation='relu', padding='valid'),
            Flatten(),
            Dense(512, activation='elu'),
            Dense(self.action_size, activation='linear')
        ])
        self.model.compile(loss='mse', optimizer=Adam(alpha, decay=alpha_decay))

    def remember(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.memory.append((state, action, reward, next_state, done))

    def preprocces_state(self, img):
        """Turn a raw RGB observation into a 2-D float32 grayscale frame.

        Args:
            img: RGB image indexable as ``img[rows]`` with a trailing
                channel axis of size 3.

        Returns:
            A ``(img_size, img_size)`` float32 tensor.
        """
        # Keep rows 31..194 only, then grayscale and resize.
        cropped = img[31:195]
        gray = tf.image.rgb_to_grayscale(cropped)
        resized = tf.image.resize(gray, (self.img_size, self.img_size),
                                  method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
        resized = tf.cast(resized, tf.float32)
        # Drop the singleton channel axis left by rgb_to_grayscale.
        return resized[:, :, 0]

    def combine_frames(self, prev, new):
        """Build a stacked state of shape ``(1, H, W, m)``.

        Args:
            prev: Either an existing stacked state of shape ``(1, H, W, m)``
                or a single ``(H, W)`` frame.
            new: The newest ``(H, W)`` frame. Only used when ``prev`` is a
                stacked state.

        Returns:
            A float32 NumPy array of shape ``(1, H, W, m)``. When ``prev`` is
            a stacked state, its oldest channel is dropped and ``new`` is
            appended as the last channel. Otherwise ``prev`` is repeated
            ``m`` times along the channel axis.
        """
        prev = np.asarray(prev, dtype=np.float32)
        if prev.ndim == 4 and prev.shape[-1] == self.m:
            # Slide the window: discard the oldest frame, append the newest.
            frame = np.asarray(new, dtype=np.float32)
            frame = frame[np.newaxis, :, :, np.newaxis]
            return np.concatenate([prev[..., 1:], frame], axis=-1)
        # No history yet: fill every channel with the same frame.
        stacked = np.stack([prev] * self.m, axis=-1)
        return stacked[np.newaxis]

    def replay(self, batch_size):
        """Fit the Q-network on a random minibatch from the replay memory.

        For each sampled transition the target for the taken action is
        ``reward`` when the episode ended, otherwise
        ``reward + gamma * max_a Q(next_state, a)``. Targets for the other
        actions are the network's current predictions.

        Args:
            batch_size: Maximum number of transitions to sample.
        """
        if not self.memory:
            # Nothing to learn from yet.
            return
        minibatch = random.sample(self.memory, min(len(self.memory), batch_size))
        states, actions, rewards, next_states, dones = zip(*minibatch)

        # Each stored state already carries a leading batch axis of size 1.
        states = np.concatenate([np.asarray(s) for s in states], axis=0)
        next_states = np.concatenate([np.asarray(s) for s in next_states], axis=0)
        actions = np.asarray(actions, dtype=np.int64)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Two batched forward passes instead of two per transition.
        targets = np.array(self.model.predict(states, verbose=0))
        next_q = np.max(self.model.predict(next_states, verbose=0), axis=1)
        targets[np.arange(len(minibatch)), actions] = np.where(
            dones, rewards, rewards + self.gamma * next_q)

        self.model.fit(states, targets, batch_size=len(states), verbose=0)

    def choose_action(self, state, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            state: Stacked state of shape ``(1, H, W, m)``.
            epsilon: Probability of sampling a random action.

        Returns:
            A random action from the environment's action space with
            probability ``epsilon``, otherwise the index of the largest
            predicted Q-value.
        """
        if np.random.random() <= epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.model.predict(state, verbose=0))

    def train(self):
        """Run up to ``EPOCHS`` episodes, replaying once after each.

        Returns:
            A list with, for each finished episode, the mean total reward
            over the most recent (up to 5) episodes. Returns early once that
            mean reaches ``THRESHHOLD`` after at least 5 episodes.
        """
        scores = deque(maxlen=5)
        avg_scores = []
        for e in range(EPOCHS):
            frame = self.preprocces_state(self.env.reset())
            # The first state has no history, so the frame fills all channels.
            state = self.combine_frames(frame, frame)
            done = False
            episode_reward = 0
            while not done:
                action = self.choose_action(state, self.epsilon)
                next_frame, reward, done, _ = self.env.step(action)
                next_frame = self.preprocces_state(next_frame)
                # Previous stack first, newest frame second.
                next_state = self.combine_frames(state, next_frame)
                self.remember(state, action, reward, next_state, done)
                state = next_state
                # Epsilon decays after every environment step.
                self.epsilon = max(self.epsilon_min, self.epsilon_decay*self.epsilon)
                episode_reward += reward
            scores.append(episode_reward)
            mean_score = np.mean(scores)
            avg_scores.append(mean_score)
            if mean_score >= THRESHHOLD and e >= 5:
                print('Ran {} episodes. Solved after {} trials ✔'.format(e, e-5))
                return avg_scores
            if e % 5 == 0:
                print('[Episode {}] - Mean survival time over last 5 episodes was {}.'.format(e, mean_score))
            self.replay(self.batch_size)

        print('Did not solve after {} episodes'.format(e))
        return avg_scores
    

In [4]:
# Build an agent for the 'Breakout-v0' environment with the default
# batch size, frame size and frame-stack depth.
dqn = DQN('Breakout-v0')
# Train for up to EPOCHS episodes. As the last expression in the cell, the
# returned list of running mean scores is displayed as the cell output.
dqn.train()

[Episode 0] - Mean survival time over last 5 episodes was 1.0.
[Episode 5] - Mean survival time over last 5 episodes was 1.0.
[Episode 10] - Mean survival time over last 5 episodes was 0.6.
[Episode 15] - Mean survival time over last 5 episodes was 2.4.
[Episode 20] - Mean survival time over last 5 episodes was 2.6.
[Episode 25] - Mean survival time over last 5 episodes was 1.8.
[Episode 30] - Mean survival time over last 5 episodes was 1.6.
[Episode 35] - Mean survival time over last 5 episodes was 0.6.
[Episode 40] - Mean survival time over last 5 episodes was 0.8.
[Episode 45] - Mean survival time over last 5 episodes was 0.4.
Did not solve after 49 episodes


[1.0,
 1.5,
 1.3333333333333333,
 1.0,
 1.2,
 1.0,
 0.6,
 0.8,
 0.8,
 0.6,
 0.6,
 1.2,
 1.6,
 2.0,
 2.4,
 2.4,
 2.6,
 2.6,
 2.2,
 2.0,
 2.6,
 2.8,
 2.0,
 2.4,
 2.4,
 1.8,
 1.2,
 1.4,
 1.2,
 1.2,
 1.6,
 1.2,
 1.0,
 0.8,
 0.4,
 0.6,
 0.8,
 1.0,
 1.0,
 1.2,
 0.8,
 0.8,
 0.6,
 0.6,
 0.6,
 0.4,
 0.6,
 0.8,
 1.0,
 0.8]