#### Implementation of NoisyNet paper for 2-dimensional Atari games, such as Breakout, Q-bert.
#### https://arxiv.org/abs/1706.10295
<br>

    Please note: The 2 dimensional image state requires a lot of memory capacity due to the buffer size of 1,000,000 as in DQN paper.

#### Please NOTE,
    The code lines different from Vanila DQN are annotated with '*/*/*/'.
    So, by searching '*/*/*/', you can find these lines.
    
    [What is the difference?]
    In NoisyNet, parametized noises are added to network weights, especially to fully-connected layers. 
    And an exploration of an agent is controlled with these noisy layers, so e-greedy exploration is not used here.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim 
import torch.nn.functional as F 

import gym
import numpy as np
import time
import os
import cv2
import matplotlib.pyplot as plt
from IPython.display import clear_output

#### NoisyNet Configurations
    NOTE: This notebook implements Factorized Gaussian noise approach with Vanila DQN. 
<br>

![image](https://drive.google.com/uc?id=1nkhxKS4xNqWEyo0763_petQ00xVkziWu)
![image](https://drive.google.com/uc?id=1TLBJL2sq9he7pwPgMBBsM39KiU323LBO)
![image](https://drive.google.com/uc?id=1FMDgE8Vj4HFRAV8IFNwYuzDPxjOT8CJa)

In [2]:
# */*/*/
class Noisy_LinearLayer(nn.Module):
    ''' Noisy linear layer '''
    def __init__(self, 
                input_feature: "int: the number of input features", 
                output_feature: "int: the number of output features", 
                initial_std: "float: the standard daviation used for parameter initialization"):

        super(Noisy_LinearLayer, self).__init__()

        self.input_feature = input_feature
        self.output_feature = output_feature
        self.init_noise_std = initial_std

        # nn.Parameter : this is learnable parameters. Set by default as "requires_grad=True" 
        self.weight_mu_params = nn.Parameter(torch.Tensor(output_feature, input_feature))
        self.weight_sigma_params = nn.Parameter(torch.Tensor(output_feature, input_feature))
        self.bias_mu_params = nn.Parameter(torch.Tensor(output_feature))
        self.bias_sigma_papams = nn.Parameter(torch.Tensor(output_feature))

        # register_buffer : this is not learnable parameter. 
        self.register_buffer("weight_epsilon", torch.Tensor(output_feature, input_feature))
        self.register_buffer("bias_epsilon", torch.Tensor(output_feature))

        self.initialize_parameters()
        self.initialize_factorized_noise()

    def initialize_parameters(self):
        """Initialize weights and biases using Factorized Gaussian noise"""
        params_range = 1 / np.sqrt(self.input_feature)
        self.weight_mu_params.data.uniform_(-params_range, params_range)
        self.bias_mu_params.data.uniform_(-params_range, params_range)

        self.weight_sigma_params.data.fill_(self.init_noise_std/np.sqrt(self.input_feature))
        self.bias_sigma_papams.data.fill_(self.init_noise_std/np.sqrt(self.input_feature))

    def initialize_factorized_noise(self):
        """Initialize noise parameters in Factorized Gaussian noise"""
        eps_in = torch.randn(self.input_feature)
        eps_in = eps_in.sign() * eps_in.abs().sqrt()
        eps_out = torch.randn(self.output_feature)
        eps_out = eps_out.sign() * eps_out.abs().sqrt()
        self.weight_epsilon.copy_(eps_out.ger(eps_in))
        self.bias_epsilon.copy_(eps_out)

    def forward(self, x):
        """ F.linear passes input x with linear computation """
        return F.linear(x, self.weight_mu_params + self.weight_sigma_params*self.weight_epsilon,
                        self.bias_mu_params + self.bias_sigma_papams*self.bias_epsilon)
# */*/*/

In [3]:
class QNetwork(nn.Module):
    
    def __init__(self, input_dim, action_dim, initial_std, rand_seed=False,
                conv_channel_1=32, conv_channel_2=64, conv_channel_3=64,
                kernel_1=8, kernel_2=4, kernel_3=3, 
                stride_1=4, stride_2=2, stride_3=1):

        super(QNetwork, self).__init__()
        # self.seed = torch.manual_seed(rand_seed)
        self.Conv1 = nn.Conv2d(input_dim[0], conv_channel_1, (kernel_1,kernel_1), stride=stride_1)
        self.Conv2 = nn.Conv2d(conv_channel_1, conv_channel_2, (kernel_2,kernel_2), stride=stride_2)
        self.Conv3 = nn.Conv2d(conv_channel_2, conv_channel_3, (kernel_3,kernel_3), stride=stride_3)

        def calculate_conv2d_size(size, kernel_size, stride):
            return (size - (kernel_size - 1) - 1) // stride  + 1

        w, h = input_dim[1], input_dim[2]
        convw = calculate_conv2d_size(calculate_conv2d_size(calculate_conv2d_size(w,kernel_1,stride_1),
                                                            kernel_2,stride_2),
                                      kernel_3,stride_3)
        convh = calculate_conv2d_size(calculate_conv2d_size(calculate_conv2d_size(h,kernel_1,stride_1),
                                                            kernel_2,stride_2),
                                      kernel_3,stride_3)
        linear_input_size = convw * convh * conv_channel_3

        # */*/*/
        self.V_noisy_linear1 = Noisy_LinearLayer(linear_input_size, 512, initial_std) 
        self.V_noisy_linear2 = Noisy_LinearLayer(512, 1, initial_std) 
        self.A_noisy_linear1 = Noisy_LinearLayer(linear_input_size, 512, initial_std) 
        self.A_noisy_linear2 = Noisy_LinearLayer(512, action_dim, initial_std) 
        # */*/*/
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.Conv1(x)) 
        x = self.relu(self.Conv2(x)) 
        x = self.relu(self.Conv3(x)) 
        x = x.reshape(x.shape[0], -1) 
        # */*/*/
        V = self.V_noisy_linear2(self.relu(self.V_noisy_linear1(x))) 
        A = self.A_noisy_linear2(self.relu(self.A_noisy_linear1(x))) 
        # */*/*/
        Q = V + A - A.mean(dim=-1, keepdim=True) 
        return Q
    
        # */*/*/
    def init_noise(self):

        self.V_noisy_linear1.initialize_factorized_noise()
        self.V_noisy_linear2.initialize_factorized_noise()
        self.A_noisy_linear1.initialize_factorized_noise()
        self.A_noisy_linear2.initialize_factorized_noise()
        # */*/*/

In [4]:
class ReplayBuffer:
    """ Experience Replay Buffer in DQN paper """
    
    def __init__(self, 
                 buffer_size: ('int: total size of the Replay Buffer'), 
                 input_dim: ('tuple: a dimension of input data. Ex) (3, 84, 84)'), 
                 batch_size: ('int: a batch size when updating')):
                 
        # To check if input image has 3 channels
        assert len(input_dim)==3, "The state dimension should be 3-dim! (CHxWxH). Please check if input_dim is right"

        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.save_count, self.current_size = 0, 0

        # One can choose either np.zeros or np.ones. 
        # The reason using np.ones here is for checking the total memory occupancy of the buffer. 
        self.state_buffer = np.ones((buffer_size, input_dim[0], input_dim[1], input_dim[2]), 
                                    dtype=np.uint8) # data type is np.int8 for saving the memory
        self.action_buffer = np.ones(buffer_size, dtype=np.uint8) 
        self.reward_buffer = np.ones(buffer_size, dtype=np.float32) 
        self.next_state_buffer = np.ones((buffer_size, input_dim[0], input_dim[1], input_dim[2]),  
                                         dtype=np.uint8) 
        self.done_buffer = np.ones(buffer_size, dtype=np.uint8) 

    def __len__(self):
        return self.current_size

    def store(self, 
              state: np.ndarray, 
              action: int, 
              reward: float, 
              next_state: np.ndarray, 
              done: int):

        self.state_buffer[self.save_count] = state
        self.action_buffer[self.save_count] = action
        self.reward_buffer[self.save_count] = reward
        self.next_state_buffer[self.save_count] = next_state
        self.done_buffer[self.save_count] = done

        # self.save_count is an index when storing transitions into the replay buffer
        self.save_count = (self.save_count + 1) % self.buffer_size
        # self.current_size is an indication for how many transitions is stored
        self.current_size = min(self.current_size+1, self.buffer_size)

    def batch_load(self):
        # Selecting samples randomly with a size of self.batch_size 
        indices = np.random.randint(self.current_size, size=self.batch_size)
        return dict(
                states=self.state_buffer[indices], 
                actions=self.action_buffer[indices],
                rewards=self.reward_buffer[indices],
                next_states=self.next_state_buffer[indices], 
                dones=self.done_buffer[indices]) 

#### Initialization of noises
    NOTE: This notebook implements NoisyNet-Dueling.
<Br>
    
![image](https://drive.google.com/uc?id=1TG1RywTdqJEQu5yO6ujc76G1U6Lz_i6_)
![image](https://drive.google.com/uc?id=1P-Bdtz4L2fKFulk8ZEPr2kEMv9KlmfRs)
![image](https://drive.google.com/uc?id=1_esXqm0n1040DvsO1Aso63Z8HQidqppU)

In [5]:
class Agent:
    def __init__(self, 
                 env: 'Environment',
                 initial_std: ('float: noise standard deviation'), 
                 input_frame: ('int: The number of channels of input image'),
                 input_dim: ('int: The width and height of pre-processed input image'),
                 training_frames: ('int: The total number of training frames'),
                 skipped_frame: ('int: The number of skipped frames in the environment'),
                 gamma: ('float: Discount Factor'),
                 update_freq: ('int: Behavior Network Update Frequency'),
                 target_update_freq: ('int: Target Network Update Frequency'),
                 update_type: ('str: Update type for target network. Hard or Soft')='hard',
                 soft_update_tau: ('float: Soft update ratio')=None,
                 batch_size: ('int: Update batch size')=32,
                 buffer_size: ('int: Replay buffer size')=1000000,
                 update_start_buffer_size: ('int: Update starting buffer size')=50000,
                 learning_rate: ('float: Learning rate')=0.0004,
                 device_num: ('int: GPU device number')=0,
                 rand_seed: ('int: Random seed')=None,
                 plot_option: ('str: Plotting option')=False,
                 model_path: ('str: Model saving path')='./',
                 trained_model_path: ('str: Trained model path')=''):

        self.action_dim = env.action_space.n
        self.device = torch.device(f'cuda:{device_num}' if torch.cuda.is_available() else 'cpu')
        self.model_path = model_path
        
        self.env = env
        self.initial_std = initial_std        
        self.input_frames = input_frame
        self.input_dim = input_dim
        self.training_frames = training_frames
        self.skipped_frame = skipped_frame
        self.gamma = gamma
        self.update_freq = update_freq
        self.target_update_freq = target_update_freq
        self.update_cnt = 0
        self.update_type = update_type
        self.tau = soft_update_tau
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.update_start = update_start_buffer_size
        self.seed = rand_seed
        self.plot_option = plot_option
        
        self.q_behave = QNetwork((self.input_frames, self.input_dim, self.input_dim), self.action_dim, self.initial_std).to(self.device)
        self.q_target = QNetwork((self.input_frames, self.input_dim, self.input_dim), self.action_dim, self.initial_std).to(self.device)
        if trained_model_path: # load a trained model if existing
            self.q_behave.load_state_dict(torch.load(trained_model_path))
            print("Trained model is loaded successfully.")
        
        # Initialize target network parameters with behavior network parameters
        self.q_target.load_state_dict(self.q_behave.state_dict())
        self.q_target.eval()
        self.optimizer = optim.Adam(self.q_behave.parameters(), lr=learning_rate) 

        self.memory = ReplayBuffer(self.buffer_size, (self.input_frames, self.input_dim, self.input_dim), self.batch_size)

    def select_action(self, state: 'Must be pre-processed in the same way as updating current Q network. See def _compute_loss'):
        
        with torch.no_grad(): 
            # devide an image input with 255 for nomalization
            state = torch.FloatTensor(state).to(self.device).unsqueeze(0)/255
            Qs = self.q_behave(state)
            # take an action of a maximum Q-value
            action = Qs.argmax()

        # return action and Q-values (Q-values are not required for implementing algorithms. This is just for checking Q-values for each state. Not must-needed)  
        return Qs.detach().cpu().numpy(), action.detach().item()  

    def processing_resize_and_gray(self, frame):
        ''' Convert images to gray scale and resize ''' 
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 
        frame = cv2.resize(frame, dsize=(self.input_dim, self.input_dim)).reshape(self.input_dim, self.input_dim).astype(np.uint8) 
        return frame 

    def get_init_state(self):
        ''' return an initial state with a dimension of (self.input_frames, self.input_dim, self.input_dim) '''
        init_state = np.zeros((self.input_frames, self.input_dim, self.input_dim))
        init_frame = self.env.reset()
        init_state[0] = self.processing_resize_and_gray(init_frame)
        
        for i in range(1, self.input_frames): 
            action = self.env.action_space.sample()
            for j in range(self.skipped_frame-1):  
                state, _, _, _ = self.env.step(action) 
            state, _, _, _ = self.env.step(action) 
            init_state[i] = self.processing_resize_and_gray(state) 
        return init_state

    def get_state(self, state, action, skipped_frame=0):
        ''' return reward, next_state, done ''' 
        next_state = np.zeros((self.input_frames, self.input_dim, self.input_dim))
        for i in range(len(state)-1):
            next_state[i] = state[i+1]

        rewards = 0
        dones = 0
        
        for _ in range(skipped_frame-1):
            state, reward, done, _ = self.env.step(action) 
            rewards += reward # reward accumulates for the case that rewards occur while skipping
            dones += int(done) 
        state, reward, done, _ = self.env.step(action) 
        next_state[-1] = self.processing_resize_and_gray(state) 
        rewards += reward 
        dones += int(done) 
        return rewards, next_state, dones

    def store(self, state, action, reward, next_state, done):
        self.memory.store(state, action, reward, next_state, done)

    def update_behavior_q_net(self):
        # */*/*/ 
        # Initialize noises for updating networks.
        self.q_behave.init_noise()
        self.q_target.init_noise()
        # */*/*/
        
        # update behavior q network with a batch
        batch = self.memory.batch_load()
        loss = self._compute_loss(batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def target_soft_update(self):
        ''' target network is updated with Soft Update. tau is a hyperparameter for the updating ratio betweeen target and behavior network  '''
        for target_param, current_param in zip(self.q_target.parameters(), self.q_behave.parameters()):
            target_param.data.copy_(self.tau*current_param.data + (1.0-self.tau)*target_param.data)

    def target_hard_update(self):
        ''' target network is updated with Hard Update '''
        self.update_cnt = (self.update_cnt+1) % self.target_update_freq
        if self.update_cnt==0:
            self.q_target.load_state_dict(self.q_behave.state_dict())

    def train(self):
        tic = time.time()
        losses = []
        scores = []
        avg_scores = [[-10000]] # As an initial score, set an arbitrary score of an episode.

        score = 0

        print("Storing initial buffer..") 
        state = self.get_init_state()
        for frame_idx in range(1, self.update_start+1):
            # Store transitions into the buffer until the number of 'self.update_start' transitions is stored 
            _, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action, skipped_frame=self.skipped_frame)
            self.store(state, action, reward, next_state, done)
            state = next_state
            if done: state = self.get_init_state()

        print("Done. Start learning..")
        history_store = []
        for frame_idx in range(1, self.training_frames+1):
            Qs, action = self.select_action(state)
            reward, next_state, done = self.get_state(state, action, skipped_frame=self.skipped_frame)
            self.store(state, action, reward, next_state, done)
            history_store.append([state, Qs, action, reward, next_state, done]) # history_store is for checking an episode later. Not must-needed.
            
            if (frame_idx % self.update_freq) == 0:
                loss = self.update_behavior_q_net()
                score += reward
                losses.append(loss)
                
                if self.update_type=='hard':   self.target_hard_update()
                elif self.update_type=='soft': self.target_soft_update()
                
            if done:
                # For saving and plotting when an episode is done.
                scores.append(score)
                if np.mean(scores[-10:]) > max(avg_scores):
                    torch.save(self.q_behave.state_dict(), self.model_path+'{}_Score:{}.pt'.format(frame_idx, np.mean(scores[-10:])))
                    training_time = round((time.time()-tic)/3600, 1)
                    np.save(self.model_path+'{}_history_Score_{}_{}hrs.npy'.format(frame_idx, score, training_time), np.array(history_store))
                    print("          | Model saved. Recent scores: {}, Training time: {}hrs".format(scores[-10:], training_time), ' /'.join(os.getcwd().split('/')[-3:]))
                avg_scores.append(np.mean(scores[-10:]))

                if self.plot_option=='inline': 
                    scores.append(score)
                    self._plot(frame_idx, scores, losses)
                else: 
                    print(score, end='\r')

                score=0
                state = self.get_init_state()
                history_store = []
            else: state = next_state

        print("Total training time: {}(hrs)".format((time.time()-tic)/3600))

    def _compute_loss(self, batch: "Dictionary (S, A, R', S', Dones)"):
        ''' Compute loss. If normalization is used, it must be applied to both 'state' and 'next_state'. ex) state/255 '''
        states = torch.FloatTensor(batch['states']).to(self.device) / 255
        next_states = torch.FloatTensor(batch['next_states']).to(self.device) / 255
        actions = torch.LongTensor(batch['actions'].reshape(-1, 1)).to(self.device)
        rewards = torch.FloatTensor(batch['rewards'].reshape(-1, 1)).to(self.device)
        dones = torch.FloatTensor(batch['dones'].reshape(-1, 1)).to(self.device)

        current_q = self.q_behave(states).gather(1, actions)
        
        # estimate next Q value (Double DQN)
        next_q = self.q_target(next_states).gather(1, self.q_behave(next_states).argmax(axis=1, keepdim=True)).detach()
        mask = 1 - dones
        target = (rewards + (mask * self.gamma * next_q)).to(self.device)

        # Use smooth l1 loss for clipping loss between -1 to 1 as in DQN paper.
        loss = F.smooth_l1_loss(current_q, target)
        return loss

    def _plot(self, frame_idx, scores, losses):
        clear_output(True) 
        plt.figure(figsize=(20, 5), facecolor='w') 
        plt.subplot(121)  
        plt.title('frame %s. score: %s' % (frame_idx, np.mean(scores[-10:])))
        plt.plot(scores) 
        plt.subplot(122) 
        plt.title('loss') 
        plt.plot(losses) 
        plt.show() 

In [6]:
env_list = {
    0: "CartPole-v0",
    1: "CartPole-v2",
    2: "LunarLander-v2",
    3: "Breakout-v4",
    4: "BreakoutDeterministic-v4",
    5: "BreakoutNoFrameskip-v4",
    6: "BoxingDeterministic-v4",
    7: "PongDeterministic-v4",
}
env_name = env_list[6]
env = gym.make(env_name)

# Same input size as in DQN paper. 
input_dim = 84
input_frame = 4

print("env_name", env_name) 
print(env.unwrapped.get_action_meanings(), env.action_space.n) 

# starting to update Q-network until ReplayBuffer is filled with the number of samples = update_start_buffer_size
update_start_buffer_size = 10000

# total training frames
training_frames = 10000000

## epsilon for exploration --> These are not needed anymore. 
# eps_max = 1.0
# eps_min = 0.1
# eps_decay = 1/1000000

# gamma (used decaying future rewards)
gamma = 0.99

# size of ReplayBuffer
buffer_size = int(1e6) # this is the same size of the paper
# buffer_size = int(1.5e5) # if don't have an enough memory capacity, lower the value like this. But this may cause a bad training performance.   

# update batch size
batch_size = 32           
learning_rate = 0.0001 # In the paper, they use RMSProp and learning rate 0.00025. In this notebook, the Adam is used with lr=0.0001. 

# updating Q-network with 'soft' or 'hard' updating method
update_freq = 4
update_type = 'hard'
soft_update_tau = 0.002

# target network update frequency (applied when it takes 'hard' update). 
# 10000 means the target network is updated once while the behavior network is updated 10000 times. 
target_update_freq = 10000

# assign skipped_frame to be 0
# because the word 'Deterministic' in the name 'BoxingDeterministic' means it automatically skips 4 frames in the game.
# assign skipped_frame to be 0 when selecting games such as "BreakoutNoFrameskip".
skipped_frame = 0

# cuda device
device_num = 0

# Noise standard deviation
initial_std = 0.5 # 0.5 is a value for Factorized Gaussian Noise

# choose plotting option.
# 'inline' - plots status in jupyter notebook
# 'False' - it prints only reward of the episode
plot_options = {1: 'inline', 2: False} 
plot_option = plot_options[2]

# The path for saving a trained model.  
rand_seed = None
rand_name = ('').join(map(str, np.random.randint(10, size=(3,))))
folder_name = os.getcwd().split('/')[-1] 
model_name = 'Test'
model_save_path = f'./model_save/{model_name}/'
if not os.path.exists('./model_save/'):
    os.mkdir('./model_save/')
if not os.path.exists(model_save_path):
    os.mkdir(model_save_path)
print("model_save_path:", model_save_path)
trained_model_path = ''

env_name BoxingDeterministic-v4
['NOOP', 'FIRE', 'UP', 'RIGHT', 'LEFT', 'DOWN', 'UPRIGHT', 'UPLEFT', 'DOWNRIGHT', 'DOWNLEFT', 'UPFIRE', 'RIGHTFIRE', 'LEFTFIRE', 'DOWNFIRE', 'UPRIGHTFIRE', 'UPLEFTFIRE', 'DOWNRIGHTFIRE', 'DOWNLEFTFIRE'] 18
model_save_path: ./model_save/Test/


In [None]:
# There is no more epsilon parameter for exploration.
agent = Agent( 
    env,
    initial_std,
    input_frame,
    input_dim,
    training_frames,
    skipped_frame,
    gamma,
    update_freq,
    target_update_freq,
    update_type,
    soft_update_tau,
    batch_size,
    buffer_size,
    update_start_buffer_size,
    learning_rate,
    device_num,
    rand_seed,
    plot_option,
    model_save_path,
    trained_model_path
) 

agent.train()

#### An example of results

    Storing initial buffer..
    Done. Start learning..
              | Model saved. Recent scores: [-3.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-3.0, 4.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-3.0, 4.0, -2.0, 5.0], Training time: 0.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [7.0, -2.0, 3.0, 8.0, 6.0, -7.0, 4.0, 1.0, -4.0, 7.0], Training time: 0.2hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [3.0, -3.0, -7.0, -7.0, 5.0, 1.0, 6.0, 10.0, 9.0, 10.0], Training time: 0.3hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-7.0, -7.0, 5.0, 1.0, 6.0, 10.0, 9.0, 10.0, -3.0, 8.0], Training time: 0.3hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-7.0, 5.0, 1.0, 6.0, 10.0, 9.0, 10.0, -3.0, 8.0, -1.0], Training time: 0.3hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [5.0, 1.0, 6.0, 10.0, 9.0, 10.0, -3.0, 8.0, -1.0, 1.0], Training time: 0.3hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [6.0, 12.0, 6.0, -4.0, -4.0, 6.0, 9.0, -3.0, 14.0, 5.0], Training time: 0.5hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [12.0, 6.0, -4.0, -4.0, 6.0, 9.0, -3.0, 14.0, 5.0, 11.0], Training time: 0.5hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-4.0, 6.0, 9.0, -3.0, 14.0, 5.0, 11.0, -1.0, 9.0, 14.0], Training time: 0.5hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [6.0, 9.0, -3.0, 14.0, 5.0, 11.0, -1.0, 9.0, 14.0, 1.0], Training time: 0.5hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [7.0, -5.0, 11.0, 1.0, 4.0, -2.0, 11.0, 7.0, 13.0, 23.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [11.0, 1.0, 4.0, -2.0, 11.0, 7.0, 13.0, 23.0, -2.0, 6.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [1.0, 4.0, -2.0, 11.0, 7.0, 13.0, 23.0, -2.0, 6.0, 16.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [4.0, -2.0, 11.0, 7.0, 13.0, 23.0, -2.0, 6.0, 16.0, 11.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-2.0, 11.0, 7.0, 13.0, 23.0, -2.0, 6.0, 16.0, 11.0, 8.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [11.0, 7.0, 13.0, 23.0, -2.0, 6.0, 16.0, 11.0, 8.0, 6.0], Training time: 0.9hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [-2.0, 26.0, 14.0, 7.0, 3.0, 2.0, 0.0, 33.0, 16.0, 5.0], Training time: 1.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [26.0, 14.0, 7.0, 3.0, 2.0, 0.0, 33.0, 16.0, 5.0, 6.0], Training time: 1.0hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [15.0, 18.0, 5.0, 14.0, 9.0, 4.0, 13.0, 19.0, 13.0, 11.0], Training time: 1.6hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [18.0, 5.0, 14.0, 9.0, 4.0, 13.0, 19.0, 13.0, 11.0, 20.0], Training time: 1.6hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [13.0, 19.0, 13.0, 11.0, 20.0, 13.0, 0.0, 14.0, 13.0, 22.0], Training time: 1.6hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [19.0, 13.0, 11.0, 20.0, 13.0, 0.0, 14.0, 13.0, 22.0, 21.0], Training time: 1.6hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [5.0, 18.0, 5.0, 15.0, 10.0, 4.0, 8.0, 43.0, 20.0, 25.0], Training time: 2.4hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [18.0, 5.0, 15.0, 10.0, 4.0, 8.0, 43.0, 20.0, 25.0, 7.0], Training time: 2.4hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [15.0, 10.0, 4.0, 8.0, 43.0, 20.0, 25.0, 7.0, 11.0, 17.0], Training time: 2.4hrs MacaronRL /Value_Based /Vanila_DQN
              | Model saved. Recent scores: [8.0, 43.0, 20.0, 25.0, 7.0, 11.0, 17.0, 7.0, 13.0, 11.0], Training time: 2.4hrs MacaronRL /Value_Based /Vanila_DQN