In [1]:
import tensorflow as tf
import random
import gym
import numpy as np

from collections import deque

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam

import gym_super_mario_bros
from gym_super_mario_bros.actions import RIGHT_ONLY
from nes_py.wrappers import JoypadSpace

from IPython.display import clear_output

from keras.models import save_model
from keras.models import load_model

import time

In [2]:
env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = JoypadSpace(env, RIGHT_ONLY)

In [3]:
total_reward = 0
done = True

#for step in range(100000):
#    env.render()

#    if done:
#        state = env.reset()
#    state, reward, done, info = env.step(env.action_space.sample())
#    preprocess_state(state)
#    print(info)

#    total_reward += reward
#    clear_output(wait=True)
    

#env.close()

In [4]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_space  = state_size
        self.action_space = action_size
        self.memory       = deque(maxlen=5000)
        self.gamma        = 0.8

        self.epsilon       = 1
        self.max_epsilon   = 1
        self.min_epsilon   = 0.01
        self.decay_epsilon = 0.0001

        self.main_network   = self.build_network()
        self.target_network = self.build_network()
        self.update_target_network()

    def build_network(self):
        model = Sequential()
        model.add(Conv2D(64, (4,4), strides=4, padding='same', input_shape=self.state_space))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (4,4), strides=2, padding='same'))
        model.add(Activation('relu'))

        model.add(Conv2D(64, (3,3), strides=1, padding='same'))
        model.add(Activation('relu'))
        model.add(Flatten())

        model.add(Dense(512, activation='relu'))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(self.action_space, activation='linear'))

        model.compile(loss='mse', optimizer=Adam())

        return model

    def update_target_network(self):
        self.target_network.set_weights(self.main_network.get_weights())

    def act(self, state):
        if random.uniform(0,1) < self.epsilon:
            return np.random.randint(self.action_space)
        Q_value = self.main_network.predict(state)
        return np.argmax(Q_value[0])

    def update_epsilon(self, episode):
        self.epsilon = self.min_epsilon + (self.max_epsilon - self.epsilon_min) * np.exp(-self.epsilon_decay * episode)

    def train(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = self.main_network.predict(state)

            if done:
                target[0][action] = reward
            else:
                target[0][action] = (reward + self.gamma * np.amax(self.target_network.predict(next_state)))
            
            self.main_network.fit(state, target, epochs=1, verbose=0)
    
    def store_transition(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

In [5]:
action_size = env.action_space.n
state_size  = (80,80,1)

from PIL import Image

def preprocess_state(state):
    image = Image.fromarray(state)
    image = image.resize((88,80))
    image = image.convert('L')
    image = np.array(image)

    return image

#

In [6]:
num_episode   = 1000000
num_timesteps = 400000
batch_size    = 64

In [7]:
dqn = DQNAgent(state_size,action_size)

In [8]:
for i in range(num_episode):
    Return    = 0
    done      = False
    time_step = 0

    state = preprocess_state(env.reset())
    state = state.reshape(-1, 80,88,1)

    for t in range(num_timesteps):
        env.render()
        time_step += 1

        action = dqn.act(state)

        next_state, reward, done, info = env.step(action)

        next_state = preprocess_state(next_state)
        next_state = next_state.reshape(-1, 80,88,1)

        dqn.store_transition(state, action, reward, next_state, done)
        state = next_state

        Return += reward
        print("Episode is: {}\nTotal Time Step: {}\nCurrent Reward: {}\nEpsilon is: {}".format(str(i), str(time_step), str(Return), str(dqn.epsilon)))

        clear_output(wait=True)

        #video 10 - t:8:30 

ValueError: cannot step in a done environment! call `reset`