In [None]:
# Environment setup: run once per fresh runtime before the import cell.
# NES emulator bindings (pinned) and the Super Mario Bros gym environment.
# NOTE(review): gym-super-mario-bros is unpinned and may request a different
# nes-py version than the one pinned here - confirm the pair is compatible.
!pip install nes-py==0.2.6
!pip install gym-super-mario-bros
# System packages installed through apt; presumably the shared libraries
# needed by OpenCV / video handling on a headless machine - TODO confirm.
!apt-get update
!apt-get install ffmpeg libsm6 libxext6  -y
!apt install -y libgl1-mesa-glx
# OpenCV is imported as cv2 and used for frame resizing.
!pip install opencv-python

In [None]:
import torch
import torch.nn as nn
import random
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from tqdm import tqdm
import pickle 
from gym_super_mario_bros.actions import RIGHT_ONLY, SIMPLE_MOVEMENT, COMPLEX_MOVEMENT
import gym
import numpy as np
import collections 
import cv2
import matplotlib.pyplot as plt

%matplotlib inline
import time
import pylab as pl
from IPython import display
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# RIGHT_ONLY

In [None]:
RIGHT_ONLY

# SIMPLE_MOVEMENT

In [None]:
SIMPLE_MOVEMENT

# COMPLEX_MOVEMENT

In [None]:
COMPLEX_MOVEMENT

In [None]:
class MaxAndSkipEnv(gym.Wrapper):
    """
    Frame-skipping wrapper: one agent action is applied to up to `skip`
    consecutive frames and a single observation is returned for the group.

    The returned observation is the element-wise maximum over the frames held
    in a two-slot buffer; the returned reward is the sum of the rewards of the
    frames that were actually stepped.
    """
    def __init__(self, env=None, skip=4):
        super().__init__(env)
        self._skip = skip
        # Two-slot rolling window of raw observations used for the max-pooling
        self._obs_buffer = collections.deque(maxlen=2)

    def step(self, action):
        """Repeat `action`, stopping early as soon as the episode reports done."""
        done = None
        reward_sum = 0.0
        for _frame_index in range(self._skip):
            frame, frame_reward, done, info = self.env.step(action)
            self._obs_buffer.append(frame)
            reward_sum += frame_reward
            if done:
                break
        pooled = np.stack(self._obs_buffer).max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self):
        """Empty the frame buffer, reset the wrapped env and buffer its first frame."""
        self._obs_buffer.clear()
        first_frame = self.env.reset()
        self._obs_buffer.append(first_frame)
        return first_frame


class MarioRescale84x84(gym.ObservationWrapper):
    """
    Converts each 240x256 RGB frame into an 84x84 single-channel uint8 frame.
    """
    def __init__(self, env=None):
        super().__init__(env)
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(84, 84, 1), dtype=np.uint8
        )

    def observation(self, obs):
        return MarioRescale84x84.process(obs)

    @staticmethod
    def process(frame):
        """Greyscale, downscale and crop one raw frame to shape (84, 84, 1)."""
        # Only the 240x256x3 frame size is supported
        assert frame.size == 240 * 256 * 3, "Unknown resolution."
        rgb = np.reshape(frame, [240, 256, 3]).astype(np.float32)
        # Weighted sum of the three colour planes -> one greyscale plane
        grey = rgb[:, :, 0] * 0.299 + rgb[:, :, 1] * 0.587 + rgb[:, :, 2] * 0.114
        # cv2 takes (width, height): the result has 110 rows and 84 columns
        shrunk = cv2.resize(grey, (84, 110), interpolation=cv2.INTER_AREA)
        # Keep rows 18..101 so the frame becomes square
        cropped = shrunk[18:102, :]
        return np.reshape(cropped, [84, 84, 1]).astype(np.uint8)


class ImageToPyTorch(gym.ObservationWrapper):
    """
    Moves the channel axis of each frame to the front (the layout torch
    convolutions expect); the observation itself remains a numpy array.
    """
    def __init__(self, env):
        super().__init__(env)
        hwc = self.observation_space.shape
        chw = (hwc[-1], hwc[0], hwc[1])
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=chw, dtype=np.float32
        )

    def observation(self, observation):
        # Axis 2 (channels) becomes axis 0
        return np.moveaxis(observation, source=2, destination=0)

    
class BufferWrapper(gym.ObservationWrapper):
    """
    Stacks the most recent observations along the first axis.

    The observation space of the wrapped env is repeated `n_steps` times along
    axis 0. Every new observation is written into the last slot after the
    older ones have been shifted one slot towards the front, so the oldest
    entry is dropped.

    Args:
        env: environment to wrap.
        n_steps: how many times the wrapped space is repeated along axis 0.
        dtype: dtype of the stacked buffer and of the new observation space.
    """
    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)
        # Allocate eagerly so observation() does not fail with AttributeError
        # if it is reached before the first reset()
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)

    def reset(self):
        """Start from an all-zero stack, then push the first observation."""
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        """Shift the stack by one slot, append `observation`, return a snapshot."""
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        # Return a copy: the internal buffer is mutated in place on the next
        # call, which would otherwise silently rewrite observations that the
        # caller is still holding (e.g. the previous state of a transition).
        return self.buffer.copy()


class PixelNormalization(gym.ObservationWrapper):
    """
    Rescales pixel intensities: values are cast to float32 and divided by 255.
    """
    def observation(self, obs):
        pixels = np.array(obs).astype(np.float32)
        return pixels / 255.0


def create_mario_env(env, movement):
    """
    Apply the full preprocessing stack to `env` and restrict its actions.

    Wrappers are applied innermost-first: frame skipping, 84x84 greyscale
    rescaling, channel-first layout, 4-deep observation stacking and pixel
    normalisation. The result is wrapped in a JoypadSpace built from `movement`.
    """
    preprocessing = (
        MaxAndSkipEnv,
        MarioRescale84x84,
        ImageToPyTorch,
        lambda inner: BufferWrapper(inner, 4),
        PixelNormalization,
    )
    for wrap in preprocessing:
        env = wrap(env)
    return JoypadSpace(env, movement)

In [None]:
# Build one wrapped environment so its spaces can be inspected.
# NOTE(review): this uses the 'SuperMarioBros-1-1-v1' id while run() creates
# 'SuperMarioBros-1-1-v0' - confirm which variant is intended.
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
env = create_mario_env(env,RIGHT_ONLY) 
# Observation Space
# Shape of the wrapped observations, kept as a notebook-level global.
observation_space = env.observation_space.shape
observation_space

In [None]:
# Action Space
# `env` was already wrapped by create_mario_env in the previous cell; wrapping
# it a second time would stack another copy of every wrapper on top of it, so
# the action count is read from the existing environment instead.
action_space = env.action_space.n
action_space

In [None]:
# Stand-alone example network: three conv-conv-pool stages followed by a
# three-layer classifier head. The first Linear expects 256*4*4 features.
model = nn.Sequential(
    # Stage 1: 3 -> 32 -> 64 channels, then halve the spatial size
    nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),

    # Stage 2: 64 -> 128 -> 128 channels, then halve the spatial size
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),

    # Stage 3: 128 -> 256 -> 256 channels, then halve the spatial size
    nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=1, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=2, stride=2),

    # Head: flatten, then 4096 -> 1024 -> 512 -> 10
    nn.Flatten(),
    nn.Linear(in_features=256 * 4 * 4, out_features=1024),
    nn.ReLU(),
    nn.Linear(in_features=1024, out_features=512),
    nn.ReLU(),
    nn.Linear(in_features=512, out_features=10),
)


In [None]:
print(model)

In [None]:
class DQNSolver(nn.Module):
    """
    Convolutional Q-network.

    `conv` is the convolutional feature extractor and `fc` the fully connected
    head that maps the flattened features to one output per action.
    """
    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        feature_layers = [
            # Two strided convolutions followed by pooling
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # 64 -> 128 -> 128 channels, size-preserving convolutions
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            # 128 -> 256 -> 256 channels, size-preserving convolutions
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        ]
        self.conv = nn.Sequential(*feature_layers)

        # Size of the flattened conv output, measured with a dummy forward pass
        flat_features = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(flat_features, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions),
        )

    def _get_conv_out(self, shape):
        """Feed an all-zero batch of one through `conv` and count the output elements."""
        probe = torch.zeros(1, *shape)
        return int(np.prod(self.conv(probe).size()))

    def forward(self, x):
        """Return the head's output for a batch `x`, one row per batch element."""
        batch = x.size()[0]
        features = self.conv(x)
        return self.fc(features.view(batch, -1))

In [None]:
class DQN:
    """
    Deep Q-learning agent: one Q-network, epsilon-greedy action selection and
    a fixed-size circular replay memory held in pre-allocated tensors.

    Args:
        state_space: shape of one observation; passed to DQNSolver and used to
            size the state memories.
        action_space: number of discrete actions.
        batch_size: number of transitions sampled per learning step.
        gamma: discount factor applied to the bootstrapped next-state value.
        lr: learning rate of the Adam optimizer.
        dropout: accepted for interface compatibility; not used by this class.
        exploration_max: initial epsilon.
        exploration_min: lower bound that epsilon is clamped to.
        pretrained: when true, weights, replay memory and its cursors are
            loaded from files in the working directory instead of being created.
        exploration_decay: factor epsilon is multiplied by after each learning step.
        double_dqn: stored as `self.double_dqn`; it does not change the
            algorithm of this class.
        max_memory_size: capacity of the replay memory.
    """
    # Initializers
    def __init__(self, state_space, action_space, batch_size, gamma, lr, dropout, exploration_max,
                       exploration_min, pretrained, exploration_decay, double_dqn = False, max_memory_size = 30000):
        self.state_space = state_space
        self.action_space = action_space
        self.pretrained = pretrained
        self.memory_sample_size = batch_size
        # Kept so callers can branch on `agent.double_dqn` for either agent class
        self.double_dqn = double_dqn

        # check cuda or cpu
        if torch.cuda.is_available():
            self.device = 'cuda'
            print("Current device is Cuda")    
        else:
            self.device = 'cpu'
            print("Current device is cpu") 
        self.dqn = DQNSolver(state_space, action_space).to(self.device)
        self.optimizer = torch.optim.Adam(self.dqn.parameters(), lr=lr)
        self.max_memory_size = max_memory_size
        if self.pretrained:
            # Resume from the files written by a previous training run
            self.dqn.load_state_dict(torch.load("DQN.pt", map_location=torch.device(self.device)))
            self.STATE_MEM = torch.load("STATE_MEM.pt")
            self.ACTION_MEM = torch.load("ACTION_MEM.pt")
            self.REWARD_MEM = torch.load("REWARD_MEM.pt")
            self.STATE2_MEM = torch.load("STATE2_MEM.pt")
            self.DONE_MEM = torch.load("DONE_MEM.pt")
            with open("ending_position.pkl", 'rb') as f:
                self.ending_position = pickle.load(f)
            with open("num_in_queue.pkl", 'rb') as f:
                self.num_in_queue = pickle.load(f)
        else:
            # Empty replay memory: one row per stored transition
            self.STATE_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.ACTION_MEM = torch.zeros(max_memory_size, 1)
            self.REWARD_MEM = torch.zeros(max_memory_size, 1)
            self.STATE2_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.DONE_MEM = torch.zeros(max_memory_size, 1)
            self.ending_position = 0  # next row to be written
            self.num_in_queue = 0     # number of valid rows

        # Learning parameters
        self.gamma = gamma
        # Q-learning is a regression onto the TD target, so a Huber loss is
        # used. Cross-entropy over the single gathered Q-value column is
        # identically zero (or rejected outright by older torch versions), so
        # the network would never receive a gradient.
        self.l1 = nn.SmoothL1Loss().to(self.device)
        self.exploration_max = exploration_max
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay

    def remember(self, state, action, reward, state2, done):
        """Write one transition at the cursor, overwriting the oldest when full."""
        self.STATE_MEM[self.ending_position] = state.float()
        self.ACTION_MEM[self.ending_position] = action.float()
        self.REWARD_MEM[self.ending_position] = reward.float()
        self.STATE2_MEM[self.ending_position] = state2.float()
        self.DONE_MEM[self.ending_position] = done.float()
        self.ending_position = (self.ending_position + 1) % self.max_memory_size  # FIFO tensor
        self.num_in_queue = min(self.num_in_queue + 1, self.max_memory_size)
    
    def batch_experiences(self):
        """Sample `memory_sample_size` stored transitions uniformly, with replacement."""
        idx = random.choices(range(self.num_in_queue), k=self.memory_sample_size)
        STATE = self.STATE_MEM[idx]
        ACTION = self.ACTION_MEM[idx]
        REWARD = self.REWARD_MEM[idx]
        STATE2 = self.STATE2_MEM[idx]
        DONE = self.DONE_MEM[idx]      
        return STATE, ACTION, REWARD, STATE2, DONE
    
    def act(self, state):
        """Return a (1, 1) action tensor: random with probability epsilon, else greedy."""
        if random.random() < self.exploration_rate:  
            return torch.tensor([[random.randrange(self.action_space)]])
        # Action selection needs no autograd graph
        with torch.no_grad():
            return torch.argmax(self.dqn(state.to(self.device))).unsqueeze(0).unsqueeze(0).cpu()
    
    def experience_replay(self):
        """Run one optimisation step on a sampled batch, then decay epsilon.

        Does nothing until the memory holds at least one batch worth of
        transitions.
        """
        if self.memory_sample_size > self.num_in_queue:
            return
        # Sample a batch of experiences and move it to the training device
        STATE, ACTION, REWARD, STATE2, DONE = (
            batch.to(self.device) for batch in self.batch_experiences()
        )

        # Gradients accumulate across backward() calls unless they are cleared
        self.optimizer.zero_grad()

        # TD target: r + gamma * max_a Q(S', a), with the bootstrap term zeroed
        # on terminal transitions. Computed without a graph so the target is
        # treated as a constant.
        with torch.no_grad():
            next_q = self.dqn(STATE2).max(1).values.unsqueeze(1)
            target = REWARD + self.gamma * next_q * (1 - DONE)

        # Q-value of the action that was actually taken
        current = self.dqn(STATE).gather(1, ACTION.long())
        loss = self.l1(current, target)
        loss.backward()        # compute gradients
        self.optimizer.step()  # apply the parameter update

        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)

In [None]:
class DDQN:
    """
    Q-learning agent with an online network (`local_net`) and a periodically
    synchronised target network (`target_net`).

    With `double_dqn=True` the learning target is the Double-DQN target: the
    online network picks the next action and the target network scores it.
    With `double_dqn=False` the target is the plain maximum of the target
    network. Replay memory and epsilon-greedy exploration work as in `DQN`.

    Args:
        state_space: shape of one observation; passed to DQNSolver and used to
            size the state memories.
        action_space: number of discrete actions.
        batch_size: number of transitions sampled per learning step.
        gamma: discount factor applied to the bootstrapped next-state value.
        lr: learning rate of the Adam optimizer.
        dropout: accepted for interface compatibility; not used by this class.
        exploration_max: initial epsilon.
        exploration_min: lower bound that epsilon is clamped to.
        exploration_decay: factor epsilon is multiplied by after each learning step.
        pretrained: when true, both networks, the replay memory and its cursors
            are loaded from files in the working directory.
        double_dqn: selects the Double-DQN target (see above).
        max_memory_size: capacity of the replay memory.
    """

    def __init__(self, state_space, action_space, batch_size, gamma, lr, dropout, exploration_max, exploration_min,
                       exploration_decay, pretrained, double_dqn = True, max_memory_size = 30000):
        self.state_space = state_space
        self.action_space = action_space
        self.double_dqn = double_dqn
        self.pretrained = pretrained
        self.memory_sample_size = batch_size
        self.max_memory_size = max_memory_size
        if torch.cuda.is_available():
            self.device = 'cuda'
            print("Current Device is Cuda")    
        else:
            self.device = 'cpu'
            print("Current Device is cpu") 
        # Online network (trained) and target network (only ever copied into)
        self.local_net = DQNSolver(state_space, action_space).to(self.device)
        self.target_net = DQNSolver(state_space, action_space).to(self.device)

        if self.pretrained:
            # Resume from the files written by a previous training run
            self.local_net.load_state_dict(torch.load("DQN1.pt", map_location=torch.device(self.device)))
            self.target_net.load_state_dict(torch.load("DQN2.pt", map_location=torch.device(self.device)))
            self.STATE_MEM = torch.load("STATE_MEM.pt")
            self.ACTION_MEM = torch.load("ACTION_MEM.pt")
            self.REWARD_MEM = torch.load("REWARD_MEM.pt")
            self.STATE2_MEM = torch.load("STATE2_MEM.pt")
            self.DONE_MEM = torch.load("DONE_MEM.pt")
            with open("ending_position.pkl", 'rb') as f:
                self.ending_position = pickle.load(f)
            with open("num_in_queue.pkl", 'rb') as f:
                self.num_in_queue = pickle.load(f)
        else:
            # Start both networks from identical weights; otherwise the first
            # targets come from an unrelated random network
            self.target_net.load_state_dict(self.local_net.state_dict())
            # Empty replay memory: one row per stored transition
            self.STATE_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.ACTION_MEM = torch.zeros(max_memory_size, 1)
            self.REWARD_MEM = torch.zeros(max_memory_size, 1)
            self.STATE2_MEM = torch.zeros(max_memory_size, *self.state_space)
            self.DONE_MEM = torch.zeros(max_memory_size, 1)
            self.ending_position = 0  # next row to be written
            self.num_in_queue = 0     # number of valid rows

        # Only the online network is optimised
        self.optimizer = torch.optim.Adam(self.local_net.parameters(), lr=lr)
        self.copy = 100  # sync the target network every `copy` calls to act()
        self.step = 0    # number of act() calls so far

        # Learning parameters
        self.gamma = gamma
        # Q-learning is a regression onto the TD target, so a Huber loss is
        # used. Cross-entropy over the single gathered Q-value column is
        # identically zero (or rejected outright by older torch versions).
        self.l1 = nn.SmoothL1Loss().to(self.device)
        self.exploration_max = exploration_max
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_decay = exploration_decay

    def remember(self, state, action, reward, state2, done):
        """Write one transition at the cursor, overwriting the oldest when full."""
        self.STATE_MEM[self.ending_position] = state.float()
        self.ACTION_MEM[self.ending_position] = action.float()
        self.REWARD_MEM[self.ending_position] = reward.float()
        self.STATE2_MEM[self.ending_position] = state2.float()
        self.DONE_MEM[self.ending_position] = done.float()
        self.ending_position = (self.ending_position + 1) % self.max_memory_size  # FIFO tensor
        self.num_in_queue = min(self.num_in_queue + 1, self.max_memory_size)
    
    def batch_experiences(self):
        """Sample `memory_sample_size` stored transitions uniformly, with replacement."""
        idx    = random.choices(range(self.num_in_queue), k=self.memory_sample_size)
        STATE  = self.STATE_MEM[idx]
        ACTION = self.ACTION_MEM[idx]
        REWARD = self.REWARD_MEM[idx]
        STATE2 = self.STATE2_MEM[idx]
        DONE   = self.DONE_MEM[idx]      
        return STATE, ACTION, REWARD, STATE2, DONE
    
    def act(self, state):
        """Return a (1, 1) action tensor: random with probability epsilon, else greedy.

        Always returns an action; the greedy branch no longer depends on
        `double_dqn`, which previously made this method return None.
        """
        self.step += 1
        if random.random() < self.exploration_rate:  
            return torch.tensor([[random.randrange(self.action_space)]])
        # Action selection needs no autograd graph
        with torch.no_grad():
            return torch.argmax(self.local_net(state.to(self.device))).unsqueeze(0).unsqueeze(0).cpu()

    def experience_replay(self):
        """Sync the target network when due, then run one optimisation step.

        The optimisation step is skipped until the memory holds at least one
        batch worth of transitions.
        """
        if self.step % self.copy == 0:
            self.target_net.load_state_dict(self.local_net.state_dict())
        if self.memory_sample_size > self.num_in_queue:
            return
        # Sample a batch of experiences and move it to the training device
        STATE, ACTION, REWARD, STATE2, DONE = (
            batch.to(self.device) for batch in self.batch_experiences()
        )

        # Gradients accumulate across backward() calls unless they are cleared
        self.optimizer.zero_grad()

        # The target is a constant for the update, so no graph is built for it
        with torch.no_grad():
            if self.double_dqn:
                # Double DQN: a* = argmax_a Q_local(S', a); value = Q_target(S', a*)
                best_next = self.local_net(STATE2).argmax(1, keepdim=True)
                next_q = self.target_net(STATE2).gather(1, best_next)
            else:
                # Fixed-target DQN: value = max_a Q_target(S', a)
                next_q = self.target_net(STATE2).max(1).values.unsqueeze(1)
            # The bootstrap term is zeroed on terminal transitions
            target = REWARD + self.gamma * next_q * (1 - DONE)

        # Q-value of the action that was actually taken
        current = self.local_net(STATE).gather(1, ACTION.long())

        loss = self.l1(current, target)
        loss.backward()        # compute gradients
        self.optimizer.step()  # apply the parameter update
        self.exploration_rate *= self.exploration_decay
        self.exploration_rate = max(self.exploration_rate, self.exploration_min)
In [None]:
def show_state(env, ep=0, info=""):
    """
    Draw the environment's current RGB frame into figure 3 and show it in the
    cell output, replacing whatever was displayed there before.
    """
    caption = "Episode: %d %s" % (ep, info)

    plt.figure(3)
    plt.clf()
    plt.imshow(env.render(mode='rgb_array'))
    plt.title(caption)
    plt.axis('off')

    # wait=True delays the clearing until the new output is ready
    display.clear_output(wait=True)
    display.display(plt.gcf())

In [None]:
def run(training_mode, pretrained, double_dqn, exploration_max=1, movement = RIGHT_ONLY, num_episodes = 500):
    """
    Play `num_episodes` episodes of SuperMarioBros-1-1 and optionally train.

    Args:
        training_mode: when true, transitions are stored, the agent learns
            after every step, and the agent state is written to disk at the
            end. When false, every frame is rendered with show_state instead.
        pretrained: forwarded to the agent; when combined with training_mode
            the reward history in "total_rewards.pkl" is loaded and extended.
        double_dqn: build a DDQN agent when true, a DQN agent otherwise.
        exploration_max: initial epsilon handed to the agent.
        movement: action list given to JoypadSpace (RIGHT_ONLY by default).
        num_episodes: number of episodes to play.

    Returns:
        (x, y): x is the array 1..num_episodes and y the mean of all recorded
        episode rewards after each episode of this call.
    """
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = create_mario_env(env,movement)

    def to_batch(frames):
        # float32 tensor with a leading batch axis of size 1
        return torch.tensor(np.asarray(frames), dtype=torch.float32).unsqueeze(0)

    # Size the agent from THIS environment: the number of actions depends on
    # `movement`, so values computed once for another action set would leave
    # part of the action space unreachable.
    agent_cls = DDQN if double_dqn else DQN
    # NOTE(review): lr=25e-3 is unusually large for Adam - confirm that a
    # smaller value such as 25e-5 was not intended.
    agent = agent_cls(state_space = env.observation_space.shape,
                      action_space = env.action_space.n,
                      batch_size = 32,
                      gamma = 0.85,
                      lr = 25e-3,
                      dropout = 0.2,
                      exploration_max = exploration_max,
                      exploration_min = 0.02,
                      exploration_decay = 0.99,
                      double_dqn = double_dqn,
                      pretrained = pretrained)

    total_rewards = []
    average_rewards = []

    if training_mode and pretrained:
        # Continue the reward history of the previous training run
        with open("total_rewards.pkl", 'rb') as f:
            total_rewards = pickle.load(f)

    try:
        for ep_num in tqdm(range(num_episodes)):
            state = to_batch(env.reset())
            total_reward = 0
            while True:
                if not training_mode:
                    show_state(env, ep_num + 1)
                action = agent.act(state)

                state_next, reward, done, _ = env.step(int(action[0]))
                total_reward += reward
                state_next = to_batch(state_next)

                if training_mode:
                    agent.remember(state,
                                   action,
                                   torch.tensor([reward]).unsqueeze(0),
                                   state_next,
                                   torch.tensor([int(done)]).unsqueeze(0))
                    agent.experience_replay()

                state = state_next
                if done:
                    break

            total_rewards.append(total_reward)
            average_rewards.append(np.mean(total_rewards))

            print("=> Episode {} Score = {:.2f}, Average Score = {:.2f}".format(ep_num + 1, total_rewards[-1], np.mean(total_rewards)))
    finally:
        # Release the emulator even if an episode fails
        env.close()

    if training_mode:
        with open("ending_position.pkl", "wb") as f:
            pickle.dump(agent.ending_position, f)
        with open("num_in_queue.pkl", "wb") as f:
            pickle.dump(agent.num_in_queue, f)
        with open("total_rewards.pkl", "wb") as f:
            pickle.dump(total_rewards, f)
        # Branch on the argument rather than on an agent attribute, so the
        # save step cannot fail on an agent that lacks `double_dqn`
        if double_dqn:
            torch.save(agent.local_net.state_dict(), "DQN1.pt")
            torch.save(agent.target_net.state_dict(), "DQN2.pt")
        else:
            torch.save(agent.dqn.state_dict(), "DQN.pt")  
        torch.save(agent.STATE_MEM,  "STATE_MEM.pt")
        torch.save(agent.ACTION_MEM, "ACTION_MEM.pt")
        torch.save(agent.REWARD_MEM, "REWARD_MEM.pt")
        torch.save(agent.STATE2_MEM, "STATE2_MEM.pt")
        torch.save(agent.DONE_MEM,   "DONE_MEM.pt")

    # Episodes vs running average reward, ready for plotting
    x = np.arange(1,len(average_rewards)+1)
    y = average_rewards   
    return(x,y)

## Deep Q-Learning Movement => Right Only 

In [None]:
# Episode budget used by the training cells
episodes_nb = 2
# Train the Deep Q agent on the RIGHT_ONLY action set
deepq_right_x, deepq_right_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.95,
    movement=RIGHT_ONLY,
    num_episodes=episodes_nb,
)

In [None]:
# Episode budget used by the test cells
episodes_nb_test = 3
# Play the Deep Q agent on RIGHT_ONLY without training (frames are rendered).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
deepq_right_x_test, deepq_right_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.05,
    movement=RIGHT_ONLY,
    num_episodes=episodes_nb_test,
)

In [None]:
# Train Plot: running average reward of the Deep Q agent on RIGHT_ONLY
fig, ax = plt.subplots()
ax.plot(deepq_right_x, deepq_right_y)
ax.set_title("Episodes vs Average Rewards (Deep Q - RIGHT_ONLY)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

In [None]:
# Test Plot: running average reward of the Deep Q agent on RIGHT_ONLY.
# The title names the test run so this figure can be told apart from the
# training figure, which previously carried the identical title.
fig, ax = plt.subplots()
ax.plot(deepq_right_x_test, deepq_right_y_test)
ax.set_title("Episodes vs Average Rewards (Deep Q - RIGHT_ONLY, Test)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Deep Q-Learning Movement => SIMPLE_MOVEMENT

In [None]:
# Train the Deep Q agent on the SIMPLE_MOVEMENT action set
# (episodes_nb is the value set in the first training cell)
deepq_simple_x, deepq_simple_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.95,
    movement=SIMPLE_MOVEMENT,
    num_episodes=episodes_nb,
)

In [None]:
# Play the Deep Q agent on SIMPLE_MOVEMENT without training
# (episodes_nb_test is the value set in the first test cell).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
deepq_simple_x_test, deepq_simple_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.05,
    movement=SIMPLE_MOVEMENT,
    num_episodes=episodes_nb_test,
)

In [None]:
# Training curve of the Deep Q agent on SIMPLE_MOVEMENT
fig, ax = plt.subplots()
ax.plot(deepq_simple_x, deepq_simple_y)
ax.set_title("Episodes vs Average Rewards (Deep Q - SIMPLE_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Deep Q-Learning Movement => COMPLEX_MOVEMENT

In [None]:
# Train the Deep Q agent on the COMPLEX_MOVEMENT action set
# (episodes_nb is the value set in the first training cell)
deepq_complex_x, deepq_complex_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.95,
    movement=COMPLEX_MOVEMENT,
    num_episodes=episodes_nb,
)

In [None]:
# Play the Deep Q agent on COMPLEX_MOVEMENT without training
# (episodes_nb_test is the value set in the first test cell).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
deepq_complex_x_test, deepq_complex_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=False,
    exploration_max=0.05,
    movement=COMPLEX_MOVEMENT,
    num_episodes=episodes_nb_test,
)

In [None]:
# Training curve of the Deep Q agent on COMPLEX_MOVEMENT.
# Both series come from the training run: pairing the training x-axis with
# the test y-values fails whenever the two runs have different episode counts.
fig, ax = plt.subplots()
ax.plot(deepq_complex_x, deepq_complex_y)
ax.set_title("Episodes vs Average Rewards (Deep Q - COMPLEX_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Double Deep Q-Learning Movement => Right Only 

In [None]:
# Train the Double Deep Q agent on the RIGHT_ONLY action set
# (episodes_nb is the value set in the first training cell)
double_deepq_right_x, double_deepq_right_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.95,
    movement=RIGHT_ONLY,
    num_episodes=episodes_nb,
)

In [None]:
# Play the Double Deep Q agent on RIGHT_ONLY without training
# (episodes_nb_test is the value set in the first test cell).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
double_deepq_right_x_test, double_deepq_right_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.05,
    movement=RIGHT_ONLY,
    num_episodes=episodes_nb_test,
)

In [None]:
# Training curve of the Double Deep Q agent on RIGHT_ONLY.
# The title names the Double Deep Q agent; it previously read "Deep Q",
# which made this figure indistinguishable from the plain Deep Q one.
fig, ax = plt.subplots()
ax.plot(double_deepq_right_x, double_deepq_right_y)
ax.set_title("Episodes vs Average Rewards (Double Deep Q - RIGHT_ONLY)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Double Deep Q-Learning Movement => SIMPLE_MOVEMENT

In [None]:
# Train the Double Deep Q agent on the SIMPLE_MOVEMENT action set
# (episodes_nb is the value set in the first training cell)
double_deepq_simple_x, double_deepq_simple_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.95,
    movement=SIMPLE_MOVEMENT,
    num_episodes=episodes_nb,
)

In [None]:
# Play the Double Deep Q agent on SIMPLE_MOVEMENT without training
# (episodes_nb_test is the value set in the first test cell).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
double_deepq_simple_x_test, double_deepq_simple_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.05,
    movement=SIMPLE_MOVEMENT,
    num_episodes=episodes_nb_test,
)

In [None]:
# Training curve of the Double Deep Q agent on SIMPLE_MOVEMENT.
# The title names the Double Deep Q agent; it previously read "Deep Q",
# which made this figure indistinguishable from the plain Deep Q one.
fig, ax = plt.subplots()
ax.plot(double_deepq_simple_x, double_deepq_simple_y)
ax.set_title("Episodes vs Average Rewards (Double Deep Q - SIMPLE_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Double Deep Q-Learning Movement => COMPLEX_MOVEMENT

In [None]:
# Train the Double Deep Q agent on the COMPLEX_MOVEMENT action set
# (episodes_nb is the value set in the first training cell)
double_deepq_complex_x, double_deepq_complex_y = run(
    training_mode=True,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.95,
    movement=COMPLEX_MOVEMENT,
    num_episodes=episodes_nb,
)

In [None]:
# Play the Double Deep Q agent on COMPLEX_MOVEMENT without training
# (episodes_nb_test is the value set in the first test cell).
# NOTE(review): pretrained=False builds a freshly initialised agent, so the
# weights trained above are not the ones evaluated - confirm whether
# pretrained=True was intended.
double_deepq_complex_x_test, double_deepq_complex_y_test = run(
    training_mode=False,
    pretrained=False,
    double_dqn=True,
    exploration_max=0.05,
    movement=COMPLEX_MOVEMENT,
    num_episodes=episodes_nb_test,
)

In [None]:
# Training curve of the Double Deep Q agent on COMPLEX_MOVEMENT.
# The title names the Double Deep Q agent; it previously read "Deep Q",
# which made this figure indistinguishable from the plain Deep Q one.
fig, ax = plt.subplots()
ax.plot(double_deepq_complex_x, double_deepq_complex_y)
ax.set_title("Episodes vs Average Rewards (Double Deep Q - COMPLEX_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
plt.show()

## Comparison: Deep Q vs Double Deep Q for Right Only

In [None]:
# Deep Q vs Double Deep Q training curves on RIGHT_ONLY
# (title typo "Doble" corrected to "Double")
fig, ax = plt.subplots()
ax.plot(deepq_right_x, deepq_right_y, label="DQN")
ax.plot(double_deepq_right_x, double_deepq_right_y, label="DDQN")
ax.set_title("Episodes vs Average Rewards (Deep Q vs Double Deep Q - RIGHT_ONLY)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
ax.legend()
plt.show()

## Comparison: Deep Q vs Double Deep Q for Simple Movement

In [None]:
# Deep Q vs Double Deep Q training curves on SIMPLE_MOVEMENT
# (title typo "Doble" corrected to "Double")
fig, ax = plt.subplots()
ax.plot(deepq_simple_x, deepq_simple_y, label="DQN")
ax.plot(double_deepq_simple_x, double_deepq_simple_y, label="DDQN")
ax.set_title("Episodes vs Average Rewards (Deep Q vs Double Deep Q - SIMPLE_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
ax.legend()
plt.show()

## Comparison: Deep Q vs Double Deep Q for Complex Movement

In [None]:
# Deep Q vs Double Deep Q training curves on COMPLEX_MOVEMENT.
# The DQN series is the COMPLEX_MOVEMENT run; the RIGHT_ONLY run was plotted
# here before, so the comparison mixed two different action sets.
# (title typo "Doble" corrected to "Double")
fig, ax = plt.subplots()
ax.plot(deepq_complex_x, deepq_complex_y, label="DQN")
ax.plot(double_deepq_complex_x, double_deepq_complex_y, label="DDQN")
ax.set_title("Episodes vs Average Rewards (Deep Q vs Double Deep Q - COMPLEX_MOVEMENT)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
ax.legend()
plt.show()

## Comparison: Deep Q for Each Action Set


In [None]:
# Deep Q training curves for the three action sets on one figure
fig, ax = plt.subplots()
deepq_curves = (
    (deepq_right_x, deepq_right_y),
    (deepq_simple_x, deepq_simple_y),
    (deepq_complex_x, deepq_complex_y),
)
for episodes, averages in deepq_curves:
    ax.plot(episodes, averages)
ax.set_title("Episodes vs Average Rewards (Deep Q)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.grid(True)
ax.legend(["RIGHT_ONLY","SIMPLE_MOVEMENT","COMPLEX_MOVEMENT"])
plt.show()

## Comparison: Double Deep Q for Each Action Set

In [None]:
# Double Deep Q training curves for the three action sets on one figure
fig, ax = plt.subplots()
double_deepq_curves = (
    (double_deepq_right_x, double_deepq_right_y),
    (double_deepq_simple_x, double_deepq_simple_y),
    (double_deepq_complex_x, double_deepq_complex_y),
)
for episodes, averages in double_deepq_curves:
    ax.plot(episodes, averages)
ax.set_title("Episodes vs Average Rewards (Double Deep Q)")
ax.set_xlabel("Episodes")
ax.set_ylabel("Average Rewards")
ax.legend(["RIGHT_ONLY","SIMPLE_MOVEMENT","COMPLEX_MOVEMENT"])
ax.grid(True)
plt.show()