In [1]:
import matplotlib.pyplot as plt
from nes_py.wrappers import JoypadSpace
from gym import wrappers
import gym_super_mario_bros
from IPython import display
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
import io
import base64
import json
from IPython.display import HTML
from keras.models import Sequential,model_from_json
from keras.layers.core import Dense, Flatten
from keras.optimizers import sgd
from keras.layers import Conv2D, MaxPooling2D, Activation, AveragePooling2D,Reshape,BatchNormalization
import numpy as np

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = wrappers.Monitor(env, "./gym-results", force=True)

ModuleNotFoundError: No module named 'keras'

# Train Function

In [None]:
import cv2
from tqdm import tqdm_notebook

def train(agent,env,epoch,prefix='', verbose=True, max_step=1000, decay_rate=0.99999):
    # Number of won games
    score = 0
    loss = 0
    best_x = 0
    best_run_index = -1
    epsilon_start = agent.epsilon

    hist = {}
    hist['loss'] = []
    hist['win-lose'] = []
    hist['score'] = []
    hist['info'] = []
    hist['x_pos'] = []


    for e in tqdm_notebook(range(1, epoch + 1)):
        if e == epoch:
            print('Monitoring ...')
            env = wrappers.Monitor(env, "./gym-results/", force=True)

        agent.set_epsilon(epsilon_start*(epoch - e)/epoch)
        # At each epoch, we restart to a fresh game and get the initial state

        state = env.reset()
        cropped_state = state[30:230,50:250,:]
        cropped_state = cv2.cvtColor(cropped_state, cv2.COLOR_BGR2GRAY) #shape (240, 256)
        cropped_state = cv2.resize(cropped_state, (128, 128), interpolation = cv2.INTER_AREA)
        cropped_state = np.moveaxis(np.array([cropped_state]), 0, 2) # shape (128, 128, 1)
        # This assumes that the games will terminate
        game_over = False

        win = 0
        lose = 0

        
        i = 0
        dist = []
        while (not game_over and i < max_step):
            # The agent performs an action
            agent.set_epsilon(agent.epsilon * decay_rate)
            action = agent.act(cropped_state)

            # Apply an action to the environment, get the next state, the reward
            # and if the games end
            prev_state = cropped_state
            state, reward, game_over, info = env.step(action)
            cropped_state = state[30:230,50:250,:]
            cropped_state = cv2.cvtColor(cropped_state, cv2.COLOR_BGR2GRAY)
            cropped_state = cv2.resize(cropped_state, (128, 128), interpolation = cv2.INTER_AREA)
            cropped_state = np.moveaxis(np.array([cropped_state]), 0, 2)

            # Update the counters
            if reward > 0:
                win = win + reward
            if reward < 0:
                lose = lose -reward

            # Apply the reinforcement strategy
            if i % agent.reinforce_rate == 0:
                loss = agent.reinforce(prev_state, cropped_state, action, reward, game_over)

            dist.append(info['x_pos'])
            

            if game_over: break

            i += 1

        env.done = True


        # Save if mario is far
        if info['x_pos'] > best_x:
            agent.save(name_weights=prefix+'model.h5',name_model=prefix+'model.json')
            best_x = info['x_pos']
            best_run_index = e
            

        # Update stats
        score += win-lose

        if verbose:
            print("Epoch {:03d}/{:03d} | Loss {:.4f} | Win/lose count {}/{} ({}) | Epsilon: {}"
                  .format(e, epoch, loss, round(win), round(lose), round(win-lose), round(agent.epsilon*1000)/1000))

        hist['x_pos'].append(dist)
        hist['loss'].append(loss)
        hist['info'].append(info)
        hist['win-lose'].append(win - lose)
        hist['score'].append(score)
    
    print('Best run index: ', best_run_index)
    return hist

In [None]:
from gym import Wrapper

class CustomReward(Wrapper):
    def __init__(self, env):
        super(CustomReward, self).__init__(env)
        self._current_score = 0
        self._x_pos = 0
        self._y_pos = 0

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        
        difference_score = info['score'] - self._current_score
        difference_x = info['x_pos'] - self._x_pos
        difference_y = info['y_pos'] - self._y_pos
        
        if difference_x < -20:
            difference_x = 0
        elif difference_x < 1:
            difference_x = -7

        if difference_y < 0:
            difference_y = 0 
        
#         print(f"Score : {difference_score} | X : {difference_x} | Y : {difference_y}")
        reward = difference_score / 5 + difference_x / 5 + difference_y
        
        self._current_score = info['score']
        self._x_pos = info['x_pos']
        self._y_pos = info['y_pos']
        
        if done:
            if info['flag_get']:
                reward += 500.0
            else:
                reward -= 500.0
        
        return state, reward, done, info

In [None]:
from collections import deque

class MaxAndSkipEnv(Wrapper):
    def __init__(self, env=None, skip=4):
        super(MaxAndSkipEnv, self).__init__(env)
        self._obs_buffer = deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
        return max_frame, total_reward, done, info

    def reset(self):
        self._obs_buffer.clear()
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs

In [None]:
env = CustomReward(env)
env = MaxAndSkipEnv(env)

# DQN Agent

## Memory

In [None]:
class Memory(object):
    def __init__(self, max_memory=100):
        self.max_memory = max_memory
        self.memory = list()

    def remember(self, m):
      if len(self.memory) < self.max_memory:
        self.memory.append(m)
      else:
        self.memory.pop(0) # if we have reached the max memory we pop the oldest element of our list
        self.memory.append(m)

    def random_access(self):
        index = np.random.randint(0, len(self.memory))
        return self.memory[index]

In [None]:

from keras import backend as K
from keras.optimizers import Adam


class DQN():
    def __init__(self,  epsilon = 0.1, memory_size=1000, batch_size = 16,n_actions=7, reinforce_rate=10):

        # Discount for Q learning
        self.discount = 0.99

        self.reinforce_rate = reinforce_rate
        
        self.n_actions = n_actions
        self.state_size=240

        # epsilon
        self.epsilon = epsilon

        # Memory
        self.memory = Memory(memory_size)
        
        # Batch size when learning
        self.batch_size = batch_size


    def set_epsilon(self, e):
        self.epsilon = e

    def act(self,s,train=True):
        """ This function should return the next action to do:
        an integer between 0 and 4 (not included) with a random exploration of epsilon"""
        if train:
            if np.random.rand() <= self.epsilon:
                a = np.random.randint(0, self.n_actions, size=1)[0]

            else:
                a = self.learned_act(s)

        else: # in some cases, this can improve the performance.. remove it if poor performances
            a = self.learned_act(s)

        return a

    def learned_act(self, s):
       # We take the action that gives the best reward Q as a learned action 
        q_pred = self.model.predict(np.array([s]))[0]
        a = np.argmax(q_pred)
        return a

    def reinforce(self, s_, n_s_, a_, r_, game_over_):
        # Two steps: first memorize the states, second learn from the pool

        self.memory.remember([s_, n_s_, a_, r_, game_over_])
        if len(self.memory.memory) < self.batch_size:
            return 0
        
        
        input_states = np.zeros((self.batch_size, 128, 128, 1)) # state with shape (128, 128, 3)
        target_q = np.zeros((self.batch_size, self.n_actions))

        minibatch = np.array([self.memory.random_access() for i in range(self.batch_size)])
        # print(minibatch.shape)
        input_states = np.array([item[0] for item in minibatch])
        next_states = np.array([item[1] for item in minibatch])
        pred_q = self.model.predict(input_states)
        next_state_pred_q = self.model.predict(next_states)
        
        for i in range(self.batch_size):
            s_, n_s_, a_, r_, game_over_ = minibatch[i]
            if game_over_:
                target_q[i, a_] = r_ # don't have to update Q the while in train() will end
            else:
                target_q[i] = pred_q[i]
                # On ajoute de la connaissance pour l'action qu'on a effectué et pour laquelle on a eu du feeedback

                # We predict the next Q (with next state input) with the current 
                # policy and update it using Bellman Equation
                target_q[i, a_] = r_ + self.discount * np.max(next_state_pred_q[i])

        # HINT: Clip the target to avoid exploiding gradients.. -- clipping is a bit tighter
        target_q = np.clip(target_q, -3, 3)
        l = self.model.train_on_batch(input_states, target_q)


        return l

    def _huber_loss(self, target, prediction):
        # sqrt(1+error^2)-1
        error = prediction - target
        return K.mean(K.sqrt(1+K.square(error))-1, axis=-1)

    def save(self,name_weights='model.h5',name_model='model.json'):
        self.model.save_weights(name_weights, overwrite=True)
        with open(name_model, "w") as outfile:
            json.dump(self.model.to_json(), outfile)
            
    def load(self,name_weights='model.h5',name_model='model.json'):
        with open(name_model, "r") as jfile:
            model = model_from_json(json.load(jfile))
        model.load_weights(name_weights)
        model.compile("sgd", "mse")
        self.model = model


class DQN_CNN(DQN):
    def __init__(self, *args,lr=0.1,**kwargs):
        super(DQN_CNN, self).__init__(*args,**kwargs)
        
        ###### FILL IN
        model = Sequential()
        model.add(Conv2D(64, 3, input_shape=(128, 128, 1), padding='same'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D())
        model.add(Conv2D(128, 3, padding='same'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(MaxPooling2D())

        model.add(Flatten())
        # model.add(Dense(128, activation='relu'))
        model.add(Dense(self.n_actions, activation='linear'))

        K.set_image_data_format('channels_last')

        # model = Sequential()
        # model.add(Flatten())
        # model.add(Dense(24, input_dim=240*256, activation='relu'))
        # model.add(Dense(24, activation='relu'))
        # model.add(Dense(self.n_actions, activation='linear'))
        # model.compile(loss='mse',
        #               optimizer=sgd(lr=lr))
        
        model.compile(sgd(lr=lr, decay=1e-4, momentum=0.0), self._huber_loss)
        # model.compile(Adam(lr=0.001), self._huber_loss)

        self.model = model
        

In [None]:
n_train = 1000
epsilon = 0.5
env = JoypadSpace(env, SIMPLE_MOVEMENT[1:])
n_actions = env.action_space.n

agent = DQN_CNN(lr=.1, epsilon=epsilon, memory_size=500, batch_size = 8, n_actions=n_actions)
















In [None]:
hist = train(agent, env, n_train, prefix='cnn_train_explore', max_step=8000)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

  return (self.ram[0x86] - self.ram[0x071c]) % 256


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 001/1000 | Loss 0.3086 | Win/lose count 1855.0/1064.0 (790.0) | Epsilon: 0.497


  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 002/1000 | Loss 0.1985 | Win/lose count 2967.0/837.0 (2129.0) | Epsilon: 0.497
Epoch 003/1000 | Loss 0.2103 | Win/lose count 3956.0/2072.0 (1884.0) | Epsilon: 0.494
Epoch 004/1000 | Loss 0.0732 | Win/lose count 3045.0/1092.0 (1953.0) | Epsilon: 0.495
Epoch 005/1000 | Loss 0.0759 | Win/lose count 4395.0/2547.0 (1847.0) | Epsilon: 0.492
Epoch 006/1000 | Loss 0.2240 | Win/lose count 2399.0/1790.0 (609.0) | Epsilon: 0.494
Epoch 007/1000 | Loss 0.1823 | Win/lose count 3727.0/2859.0 (868.0) | Epsilon: 0.492
Epoch 008/1000 | Loss 0.3413 | Win/lose count 1917.0/1233.0 (685.0) | Epsilon: 0.494


  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 009/1000 | Loss 0.2284 | Win/lose count 1926.0/887.0 (1040.0) | Epsilon: 0.494
Epoch 010/1000 | Loss 0.2035 | Win/lose count 1880.0/954.0 (926.0) | Epsilon: 0.493
Epoch 011/1000 | Loss 0.1739 | Win/lose count 2688.0/1376.0 (1312.0) | Epsilon: 0.492
Epoch 012/1000 | Loss 0.3081 | Win/lose count 2680.0/1527.0 (1153.0) | Epsilon: 0.491
Epoch 013/1000 | Loss 0.2684 | Win/lose count 2681.0/1637.0 (1045.0) | Epsilon: 0.491
Epoch 014/1000 | Loss 0.1953 | Win/lose count 1817.0/480.0 (1337.0) | Epsilon: 0.492
Epoch 015/1000 | Loss 0.2013 | Win/lose count 1897.0/870.0 (1027.0) | Epsilon: 0.491
Epoch 016/1000 | Loss 0.1381 | Win/lose count 943.0/568.0 (375.0) | Epsilon: 0.491
Epoch 017/1000 | Loss 0.2301 | Win/lose count 2344.0/884.0 (1460.0) | Epsilon: 0.49
Epoch 018/1000 | Loss 0.1598 | Win/lose count 1721.0/779.0 (942.0) | Epsilon: 0.49
Epoch 019/1000 | Loss 0.1185 | Win/lose count 3831.0/1330.0 (2501.0) | Epsilon: 0.487
Epoch 020/1000 | Loss 0.0827 | Win/lose count 1443.0/863.0 (580.0) 

  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 028/1000 | Loss 0.4122 | Win/lose count 1574.0/569.0 (1005.0) | Epsilon: 0.485
Epoch 029/1000 | Loss 0.5319 | Win/lose count 2444.0/902.0 (1543.0) | Epsilon: 0.484
Epoch 030/1000 | Loss 0.4891 | Win/lose count 2871.0/873.0 (1998.0) | Epsilon: 0.483
Epoch 031/1000 | Loss 0.3001 | Win/lose count 1807.0/531.0 (1276.0) | Epsilon: 0.483
Epoch 032/1000 | Loss 0.4078 | Win/lose count 2173.0/565.0 (1608.0) | Epsilon: 0.482
Epoch 033/1000 | Loss 0.2042 | Win/lose count 2772.0/1679.0 (1092.0) | Epsilon: 0.48
Epoch 034/1000 | Loss 0.3697 | Win/lose count 951.0/840.0 (111.0) | Epsilon: 0.482
Epoch 035/1000 | Loss 0.2182 | Win/lose count 3376.0/3063.0 (313.0) | Epsilon: 0.478
Epoch 036/1000 | Loss 0.2581 | Win/lose count 3211.0/1022.0 (2188.0) | Epsilon: 0.479
Epoch 037/1000 | Loss 0.2370 | Win/lose count 1779.0/1032.0 (747.0) | Epsilon: 0.48
Epoch 038/1000 | Loss 0.1357 | Win/lose count 2015.0/918.0 (1097.0) | Epsilon: 0.479
Epoch 039/1000 | Loss 0.2073 | Win/lose count 1104.0/551.0 (553.0) 

  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 065/1000 | Loss 0.2742 | Win/lose count 4445.0/3395.0 (1050.0) | Epsilon: 0.462
Epoch 066/1000 | Loss 0.2530 | Win/lose count 3610.0/1892.0 (1718.0) | Epsilon: 0.463
Epoch 067/1000 | Loss 0.6453 | Win/lose count 1975.0/1339.0 (637.0) | Epsilon: 0.464
Epoch 068/1000 | Loss 0.6196 | Win/lose count 1269.0/825.0 (444.0) | Epsilon: 0.465
Epoch 069/1000 | Loss 0.4938 | Win/lose count 1641.0/1366.0 (276.0) | Epsilon: 0.463
Epoch 070/1000 | Loss 0.4567 | Win/lose count 3198.0/1843.0 (1355.0) | Epsilon: 0.462
Epoch 071/1000 | Loss 0.4267 | Win/lose count 3480.0/2329.0 (1152.0) | Epsilon: 0.46
Epoch 072/1000 | Loss 0.1005 | Win/lose count 4247.0/2381.0 (1866.0) | Epsilon: 0.459
Epoch 073/1000 | Loss 0.3568 | Win/lose count 1838.0/509.0 (1329.0) | Epsilon: 0.462
Epoch 074/1000 | Loss 0.4052 | Win/lose count 1958.0/700.0 (1258.0) | Epsilon: 0.462
Epoch 075/1000 | Loss 0.2558 | Win/lose count 2150.0/615.0 (1536.0) | Epsilon: 0.461
Epoch 076/1000 | Loss 0.2326 | Win/lose count 4102.0/3771.0 (3

In [None]:
def get_x_pos_from_runs(hist):
  result = []
  for run in hist['x_pos']:
    previous_x = -100
    max_x = []
    for x in run:
      if x < previous_x:
        max_x.append(previous_x)
    previous_x = x
    max_x.append(run[-1])
    result.append(max(max_x))
  print('Successive maximum x_pos: ', result)
  return result

plt.figure(figsize=(12, 12))
plt.subplot(121)
plt.title('Loss evolution')
plt.plot(hist['loss'])

plt.subplot(122)
plt.title('Max distance evolution')
plt.plot(get_x_pos_from_runs(hist))

In [None]:
video = io.open('./gym-results/openaigym.video.1.7434.video000000.mp4' , 'r+b').read()
encoded = base64.b64encode(video)
HTML(data='''
    <video width="360" height="auto" alt="test" controls><source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''
.format(encoded.decode('ascii')))

# Test

In [None]:
def test(agent,env,epochs,prefix='', max_step=4000):
    # Number of won games
    score = 0        
    for e in range(epochs):
        agent.set_epsilon(0)
        if e == epochs - 1:
            print('Monitoring...')
            env = wrappers.Monitor(env, "./gym-results/", force=True)
        
        state = env.reset()
        state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY) #shape (240, 256)
        state = cv2.resize(state, (128, 128), interpolation = cv2.INTER_AREA)
        state = np.moveaxis(np.array([state]), 0, 2) # shape (240, 256, 1)
        # This assumes that the games will end
        game_over = False
    
        win = 0
        lose = 0
        i = 0
    
        while (not game_over and i < max_step):
            # The agent performs an action
            action = agent.act(state)
    
            # Apply an action to the environment, get the next state, the reward
            # and if the games end
            prev_state = state

            state, reward, game_over, info = env.step(action)
            state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY) #shape (240, 256)
            state = cv2.resize(state, (128, 128), interpolation = cv2.INTER_AREA)
            state = np.moveaxis(np.array([state]), 0, 2) # shape (240, 256, 1)

            if reward > 0:
                win = win + reward 
            if reward < 0:
                lose = lose -reward
        
            i += 1

        # Update stats
        score = score + win-lose

        print("Win/lose count {}/{}. Average score ({})"
              .format(win, lose, score/(1+e)))
    print('Final score: '+str(score/epochs))

In [None]:
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, SIMPLE_MOVEMENT[:1])
n_actions = env.action_space.n

# Load best model
agent.load('cnn_train_exploremodel.h5', 'cnn_train_exploremodel.json')
test(agent, env, 1)

Monitoring...


KeyError: ignored

# Grid search for optimum epsilon

For the epsilon grid search we train on 10 epoch, with a batchsize of 8

In [None]:
hist = []
tested_epsilon = [.1, .5, 1]
for epsilon in tested_epsilon:
  agent = DQN_CNN(lr=.1, epsilon=epsilon, memory_size=500, batch_size = 8, n_actions=n_actions)
  hist.append(train(agent, env, n_train, prefix='cnn_train_explore', max_step=4000))

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 001/010 | Loss 0.0270 | Win/lose count 2103/79 (2024) | Epsilon: 0.08904301938981736
Epoch 002/010 | Loss 0.0144 | Win/lose count 1597/76 (1521) | Epsilon: 0.07920873514500848
Epoch 003/010 | Loss 0.0127 | Win/lose count 1186/62 (1124) | Epsilon: 0.06961675106468246
Epoch 004/010 | Loss 0.0192 | Win/lose count 1985/70 (1915) | Epsilon: 0.059410709983694916
Epoch 005/010 | Loss 0.0024 | Win/lose count 1662/86 (1576) | Epsilon: 0.049429773648633074
Epoch 006/010 | Loss 0.0046 | Win/lose count 1988/79 (1909) | Epsilon: 0.03953551554730103
Epoch 007/010 | Loss 0.0062 | Win/lose count 1566/78 (1488) | Epsilon: 0.029713673697403162
Epoch 008/010 | Loss 0.0047 | Win/lose count 1558/94 (1464) | Epsilon: 0.019748592249564316
Epoch 009/010 | Loss 0.0012 | Win/lose count 1759/99 (1660) | Epsilon: 0.009854961361961003
Monitoring ...


  return (self.ram[0x86] - self.ram[0x071c]) % 256


Epoch 010/010 | Loss 0.0031 | Win/lose count 797/60 (737) | Epsilon: 0.0

Best run index:  1


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))