<a href="https://colab.research.google.com/github/elk-cloner/RL/blob/master/CartPole_Simple_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup, following the guide at
# https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t
# for showing OpenAI Gym environments inside Jupyter.
!apt-get update
!pip install pyvirtualdisplay
!apt-get install -y xvfb python-opengl ffmpeg
!pip install pyglet==1.4.0 # gym requirements
!apt-get install cmake
!pip install --upgrade setuptools
!pip install ez_setup
# Python packages imported by the cells below (tensorflow, gym, retro, retrowrapper).
!pip install tensorflow gym gym-retro
!pip install git+https://github.com/MaxStrange/retrowrapper.git

In [None]:
# Upload your ROM files to the current directory first, then run the command
# below so that gym-retro imports them.
!python -m retro.import ./

In [None]:
import io
import gym
import time
import math
import glob
import base64
import random
import retro
import matplotlib
%matplotlib inline
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import multiprocessing as mp
import retrowrapper
from itertools import count
from collections import deque
from IPython.display import HTML
from gym.wrappers import Monitor
from gym import logger as gymlogger
gymlogger.set_level(40) #error only
from IPython import display as ipythondisplay


from pyvirtualdisplay import Display
# Start an invisible 1400x900 virtual display (pyvirtualdisplay) for the rest of the notebook.
display = Display(visible=0, size=(1400, 900))
display.start()

In [None]:
"""
Utility functions to enable video recording of a gym environment and to display the result.
To enable recording, just do "env = wrap_env(env)".
"""

def show_video():
  """Embed the first ``video/*.mp4`` recording in the notebook output.

  The file is read, base64-encoded and rendered through an inline HTML
  ``<video>`` element. When no recording exists, the message
  "Could not find video" is printed instead.
  """
  mp4list = glob.glob('video/*.mp4')
  if not mp4list:
    print("Could not find video")
    return
  # Read-only binary mode is all that is needed, and the context manager
  # guarantees the file handle is closed once the bytes have been read.
  with io.open(mp4list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    

def wrap_env(env):
  """Return `env` wrapped in a ``Monitor`` that writes to ``./video`` with ``force=True``."""
  return Monitor(env, './video', force=True)

In [None]:
# Mean-squared error; update_model() applies it to the targets and the predicted values.
loss_func = tf.keras.losses.MeanSquaredError()
# Adam optimizer with a learning rate of 1e-3, used to update dqn_model.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

In [None]:
from collections import deque
class Experience:
    """A single transition (state, action, reward, next_state, done) kept in the replay memory."""

    def __init__(self, state, action, reward, next_state, done):
        # The five fields are stored verbatim as instance attributes.
        self.state, self.action, self.reward = state, action, reward
        self.next_state, self.done = next_state, done


# Bounded replay memory: once 1000 items are held, each append discards the oldest one.
memory_buffer = deque(maxlen=1000)

In [None]:
def resize_frame(img):
    """Return `img` resized to 244x244 with ``tf.image.resize``."""
    target_size = (244, 244)
    return tf.image.resize(img, size=target_size)

In [None]:
# Online Q-network: three convolutions over a 244x244x3 frame followed by two
# dense layers. The head emits one value per action index; actions in this
# notebook are 12-bit tuples, so there are exactly 2**12 of them.
dqn_model = tf.keras.Sequential(
    [
        tf.keras.layers.InputLayer(input_shape=(244, 244, 3)),
        tf.keras.layers.Conv2D(filters=32, kernel_size=8, strides=4, padding="same", use_bias=True, activation="relu"),
        tf.keras.layers.Conv2D(filters=64, kernel_size=4, strides=2, padding="same", use_bias=True, activation="relu"),
        tf.keras.layers.Conv2D(filters=64, kernel_size=3, strides=1, padding="same", use_bias=True, activation="relu"),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(units=4096, activation="relu", use_bias=True),
        tf.keras.layers.Dense(units=2**12)
    ]
)
# clone_model() copies only the architecture and re-initialises the weights,
# so the target network is explicitly synchronised with the online network.
dqn_target_model = tf.keras.models.clone_model(dqn_model)
dqn_target_model.set_weights(dqn_model.get_weights())
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
dqn_model.summary()

In [None]:
# Smoke test: push one all-ones 244x244x3 frame through the online network.
xx = np.ones((1, 244, 244, 3))
dqn_model(xx)

In [None]:
game = "SonicTheHedgehog-Genesis"
# Two independent wrappers around the same game: env1 is stepped by the
# training loop and action sampling, env2 is the one render_model() resets.
env1, env2 = (retrowrapper.RetroWrapper(game) for _ in range(2))

In [None]:
# action_map = {k: v}
import numpy as np
# Bidirectional lookup between 12-bit button tuples and integer action indices.
# The range must stop at 2**12: formatting 4096 with "012b" yields 13 digits,
# which would register an invalid 13-element action.
action2index = {tuple(map(int, "{0:012b}".format(i))): i for i in range(2**12)}
index2action = {v: k for k, v in action2index.items()}
# Sanity checks: every key has 12 bits and a known pattern maps to its binary value.
assert all(len(bits) == 12 for bits in action2index)
assert action2index[(0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1)] == 0b001111001011
len(action2index)

In [None]:
decay_step = 0
def select_action(state):
    """Choose an action for `state` with an epsilon-greedy rule.

    Epsilon starts at 1.0, falls linearly with the global ``decay_step``
    counter and is floored at 0.1 (reached after one million calls). With
    probability epsilon a random action is sampled from ``env1``; otherwise
    the action with the highest ``dqn_model`` score is returned as an array.
    """
    global decay_step
    slope = -(1 - 0.1) / 1000000.0
    epsilon = max(0.1, slope * float(decay_step) + 1.0)
    decay_step += 1
    # Explore: hand back a random action straight from the environment.
    if random.random() < epsilon:
        return env1.action_space.sample()
    # Exploit: score the frame and translate the best index back into an action.
    frame_batch = tf.reshape(state, shape=(-1, 244, 244, 3))
    scores = dqn_model.predict(frame_batch)
    best_index = int(tf.math.argmax(scores, axis=1).numpy()[0])
    return np.array(index2action[best_index])

In [None]:
def update_model(batch_size=32, gamma=0.999):
    """Run one optimisation step of ``dqn_model`` on a random replay batch.

    Args:
        batch_size: number of transitions sampled from ``memory_buffer``.
        gamma: discount applied to the target network's best next-state value.

    Returns:
        0 when the replay memory holds fewer than ``batch_size`` transitions
        (nothing is trained), otherwise 1.
    """
    if len(memory_buffer) < batch_size:
        print("\n not enough data to optimize data")
        return 0
    print("\n")
    print("get new batch")
    batch = random.sample(memory_buffer, batch_size)

    # Build batched inputs without touching the stored experiences.
    frame_shape = (244, 244, 3)
    states = tf.stack(
        [tf.reshape(tf.cast(s.state, tf.float32), frame_shape) for s in batch]
    )
    next_states = tf.stack(
        [tf.reshape(tf.cast(s.next_state, tf.float32), frame_shape) for s in batch]
    )
    rewards = np.array([s.reward for s in batch], dtype=np.float32)
    done = np.array([bool(s.done) for s in batch])
    actions = np.array([s.action for s in batch], dtype=np.int32)

    # Targets: the reward alone for terminal transitions, otherwise the reward
    # plus the discounted best value from the target network. A single batched
    # forward pass replaces one predict() call per sample.
    next_q = tf.math.reduce_max(
        dqn_target_model(next_states, training=False), axis=1
    ).numpy()
    y = np.where(done, rewards, rewards + gamma * next_q).astype(np.float32)

    # gather_nd needs integer (row, action) index pairs.
    selected_action = np.stack(
        [np.arange(batch_size, dtype=np.int32), actions], axis=1
    )

    with tf.GradientTape() as tape:
        q_values = dqn_model(states, training=True)
        chosen_q = tf.gather_nd(q_values, selected_action)
        loss = loss_func(y, chosen_q)
    print(f"loss: {loss}")
    # Differentiate and apply outside the tape so the update itself is not recorded.
    gradients = tape.gradient(loss, dqn_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, dqn_model.trainable_variables))
    return 1
    

In [None]:
def render_model(max_steps=100):
    """Play one evaluation episode in ``env2`` with the checkpointed target network.

    The weights stored at ``./pre_train_model/model.ckpt`` are loaded into
    ``dqn_target_model``, which then acts greedily for at most ``max_steps``
    steps or until the environment reports ``done``. The mean per-step reward
    is printed.

    Args:
        max_steps: upper bound on the number of environment steps.

    Returns:
        True, always.
    """
    current_state = env2.reset()
    dqn_target_model.load_weights("./pre_train_model/model.ckpt")
    episode_rewards = []
    print("start rendering middle model loop...")
    for _ in range(max_steps):
        # The network consumes resized 244x244x3 frames with a leading batch axis.
        frame = tf.reshape(resize_frame(current_state), shape=(-1, 244, 244, 3))
        action_score = dqn_target_model.predict(frame)
        action_index = int(tf.math.argmax(action_score, axis=1).numpy()[0])
        action = np.array(index2action[action_index])
        current_state, reward, done, info = env2.step(action)
        episode_rewards.append(reward)
        if done:
            break
    # Guard the mean against an empty episode (max_steps <= 0).
    mean_reward = sum(episode_rewards) / len(episode_rewards) if episode_rewards else 0.0
    print(f"new oracle reward: {mean_reward}")
    return True
    

In [None]:
# Training driver: plays env1, stores each transition in the replay memory and
# calls update_model() after every environment step.
update_steps = 0  # running sum of update_model() return values
rew = []  # raw reward of every step, recorded before the shaping below
for train_step in range(1000000):
    state = env1.reset()
    state = resize_frame(state)
    cnt = 0  # steps taken in the current episode
    while True:
        cnt += 1
        print(f"cnt: {cnt}", end="-")
        action = select_action(state)
        next_state, reward, done, info = env1.step(action)
        next_state = resize_frame(next_state)
        rew.append(reward)
        # Reward shaping: any non-positive reward is stored as -1.0.
        if reward <= 0.0:
            reward = -1.0
        # The replay memory keeps the action as its integer index rather than the button tuple.
        memory_buffer.append(Experience(state, action2index[tuple(action.tolist())], reward, next_state, done))
        update_steps += update_model()
        # Whenever the update count is a non-zero multiple of 10: copy the online
        # weights into the target network, checkpoint it and run an evaluation.
        if update_steps and update_steps % 10 == 0:
            dqn_target_model.set_weights(dqn_model.get_weights())
            print("oracle got updated")
            dqn_target_model.save_weights("./pre_train_model/model.ckpt")
            render_model()
        # An episode ends when the environment is done or after 100 steps.
        if done or cnt == 100:
            break
        state = next_state
    # NOTE(review): this unconditional break stops training after the first
    # episode, so range(1000000) never advances - confirm whether it is
    # leftover debugging.
    break
print(rew)