################################################################################
> # **Part - I**

*   define actor critic networks
*   define PPO algorithm

################################################################################

In [1]:


############################### Import libraries ###############################


import os
import glob
import time
from datetime import datetime

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

import numpy as np

import gym
import pongGym

################################## set device ##################################

print("============================================================================================")


# Pick the compute device once, at import time: the first CUDA device when
# torch reports one, the CPU otherwise. Everything below reads `device`.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

print("============================================================================================")




################################## PPO Policy ##################################


class RolloutBuffer:
    """Plain container for the transitions gathered between two policy updates.

    Five parallel lists are kept. Nothing here enforces that they stay the
    same length; that is the responsibility of whoever appends to them.
    """

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        """Empty every list in place, so outside references to the lists stay valid."""
        for storage in (self.actions, self.states, self.logprobs,
                        self.rewards, self.is_terminals):
            storage.clear()


class ActorCritic(nn.Module):
    """Separate actor and critic MLPs for a discrete action space.

    Each head has three hidden layers of 128 ReLU units. The actor ends in a
    softmax over ``action_dim`` outputs; the critic ends in one linear output.
    """

    def __init__(self, state_dim, action_dim, action_std_init):
        # action_std_init is accepted for signature compatibility but is not
        # read anywhere in this class.
        super().__init__()

        width = 128

        # Policy head: state -> probability of each action.
        actor_layers = [
            nn.Linear(state_dim, width), nn.ReLU(),
            nn.Linear(width, width), nn.ReLU(),
            nn.Linear(width, width), nn.ReLU(),
            nn.Linear(width, action_dim), nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*actor_layers)

        # Value head: state -> single scalar estimate.
        critic_layers = [
            nn.Linear(state_dim, width), nn.ReLU(),
            nn.Linear(width, width), nn.ReLU(),
            nn.Linear(width, width), nn.ReLU(),
            nn.Linear(width, 1),
        ]
        self.critic = nn.Sequential(*critic_layers)

    def forward(self):
        """Not supported; callers use ``act`` or ``evaluate`` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for ``state``.

        Returns:
            ``(action, log_prob)``, both detached from the autograd graph.
        """
        policy = Categorical(self.actor(state))
        chosen = policy.sample()
        return chosen.detach(), policy.log_prob(chosen).detach()

    def evaluate(self, state, action):
        """Score given actions under the current actor and value the states.

        Returns:
            ``(log_probs, state_values, entropy)``; ``state_values`` is the raw
            critic output (trailing dimension of size 1 is kept).
        """
        policy = Categorical(self.actor(state))
        return policy.log_prob(action), self.critic(state), policy.entropy()


class PPO:
    """Clipped-surrogate PPO agent for a discrete action space.

    Two copies of the actor-critic are kept: ``policy`` is the one being
    optimised, ``policy_old`` is the copy used to collect rollouts and to
    supply the denominator of the probability ratio. Tensor placement relies
    on the module-level ``device``.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.6):
        """Create both networks, the optimizer and an empty rollout buffer.

        Args:
            state_dim: Input size of the actor and critic networks.
            action_dim: Number of discrete actions (actor output size).
            lr_actor: Adam learning rate for the actor parameters.
            lr_critic: Adam learning rate for the critic parameters.
            gamma: Discount factor used when computing returns.
            K_epochs: Number of optimisation passes per ``update()`` call.
            eps_clip: Clipping range for the probability ratio.
            action_std_init: Forwarded unchanged to ``ActorCritic``.
        """
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        # Rollout policy starts as an exact copy of the trainable one.
        self.policy_old = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record the step in the buffer.

        The state tensor, the action and its log-probability are appended to
        the buffer; the caller is expected to append the matching reward and
        terminal flag afterwards.

        Returns:
            The sampled action as a Python scalar (``Tensor.item()``).
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action.item()

    def update(self):
        """Run ``K_epochs`` optimisation passes over the buffered rollout, then clear it.

        Does nothing when no rewards have been buffered.
        """
        if not self.buffer.rewards:
            # Nothing collected yet; torch.stack would raise on empty lists.
            return

        # Monte Carlo estimate of returns, walking the rollout backwards and
        # resetting at episode boundaries. Appending and reversing once keeps
        # this linear (list.insert(0, ...) is O(n) per call).
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.buffer.rewards), reversed(self.buffer.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the returns. The std of a single element is NaN, which
        # would poison every weight, so a one-step rollout is left unscaled.
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor. Explicit reshapes (instead of a
        # dimension-less squeeze) keep the batch axis even when the rollout
        # has one step or the state has one feature.
        n_steps = len(self.buffer.states)
        old_states = torch.stack(self.buffer.states, dim=0).detach().to(device).reshape(n_steps, -1)
        old_actions = torch.stack(self.buffer.actions, dim=0).detach().to(device).reshape(n_steps)
        old_logprobs = torch.stack(self.buffer.logprobs, dim=0).detach().to(device).reshape(n_steps)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # critic output is (n_steps, 1); drop only that trailing axis so it
            # lines up with the returns tensor
            state_values = state_values.squeeze(-1)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs)

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO: policy term, 0.5 * value
            # MSE (a scalar, broadcast over steps) and a 0.01 entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write the ``policy_old`` weights to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load weights from ``checkpoint_path`` into both policy copies."""
        # Deserialise once; load_state_dict copies the values into each
        # network's own parameters, so sharing the dict is safe.
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)
        
        
       



pygame 2.1.0 (SDL 2.0.16, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html
Device set to : NVIDIA GeForce RTX 3070


################################################################################
> # **Part - II**

*   train PPO algorithm on environments
*   save preTrained networks weights and log files

################################################################################

In [None]:

print("============================================================================================")


################################### Training ###################################


####### initialize environment hyperparameters ######

env_name = "pong_game"

# Budgets, in environment steps.
max_ep_len = 3600                   # 60 * 60: hard cap on steps in one episode
max_training_timesteps = 100_000    # shown in the printed summary; the training loop
                                    # visible in this notebook is bounded by an episode
                                    # count and never reads this value

# Reporting cadence, in environment steps.
print_freq = 4 * max_ep_len         # console: average episode reward
log_freq = 2 * max_ep_len           # CSV log: average episode reward
save_model_freq = 20_000            # shown in the printed summary; checkpoints in the
                                    # visible loop are written on a new best episode reward

action_std = None                   # handed to PPO as action_std_init


#####################################################


## Note : print/log frequencies should be greater than max_ep_len


################ PPO hyperparameters ################


update_timestep = 4 * max_ep_len    # run a policy update every n timesteps
K_epochs = 180                      # 60 * 3 optimisation passes per update (4 was an earlier setting)
eps_clip = 0.2                      # clip parameter for PPO
gamma = 0.99                        # discount factor

lr_actor = 3e-4                     # learning rate for actor network
lr_critic = 1e-3                    # learning rate for critic network

random_seed = 0                     # any non-zero value turns seeding on (0 = no random seed)

#####################################################



print("training environment name : " + env_name)

env = pongGym.DoublePong()

# state space dimension
state_dim = env.observation_space.shape[0]

# action space dimension
action_dim = env.action_space.n



###################### logging ######################

#### log files for multiple runs are NOT overwritten

# exist_ok=True creates the nested directory in one call and avoids the
# check-then-create race of a separate os.path.exists() test.
log_dir = "PPO_logs" + '/' + env_name + '/'
os.makedirs(log_dir, exist_ok=True)


#### get number of log files in log directory
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)


#### create new log file for each run
# The file count is only a starting guess: if an older log was deleted the
# count can name a file that still exists, and opening it for writing would
# truncate it. Bump the run number until the name is free.
# log_dir already ends with '/', so no extra separator is added here.
while True:
    log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"
    if not os.path.exists(log_f_name):
        break
    run_num += 1

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

#####################################################


################### checkpointing ###################

run_num_pretrained = 0      #### change this to prevent overwriting weights in same env_name folder

directory = "PPO_preTrained" + '/' + env_name + '/'
os.makedirs(directory, exist_ok=True)

#####################################################


############# print all hyperparameters #############

print("--------------------------------------------------------------------------------------------")

# Run budget and reporting cadence.
print(f"max training timesteps :  {max_training_timesteps}")
print(f"max timesteps per episode :  {max_ep_len}")

print(f"model saving frequency : {save_model_freq} timesteps")
print(f"log frequency : {log_freq} timesteps")
print(f"printing average reward over episodes in last : {print_freq} timesteps")

print("--------------------------------------------------------------------------------------------")

# Environment interface sizes.
print(f"state space dimension :  {state_dim}")
print(f"action space dimension :  {action_dim}")

print("--------------------------------------------------------------------------------------------")

print("Initializing a discrete action space policy")

print("--------------------------------------------------------------------------------------------")

# PPO settings.
print(f"PPO update frequency : {update_timestep} timesteps")
print(f"PPO K epochs :  {K_epochs}")
print(f"PPO epsilon clip :  {eps_clip}")
print(f"discount factor (gamma) :  {gamma}")

print("--------------------------------------------------------------------------------------------")

print(f"optimizer learning rate actor :  {lr_actor}")
print(f"optimizer learning rate critic :  {lr_critic}")

# Seeding is opt-in: a falsy seed (0) leaves torch, the env and numpy unseeded.
if random_seed:
    print("--------------------------------------------------------------------------------------------")
    print(f"setting random seed to  {random_seed}")
    torch.manual_seed(random_seed)
    env.seed(random_seed)
    np.random.seed(random_seed)

#####################################################

print("============================================================================================")

################# training procedure ################

# initialize a PPO agent
ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std)


# track total training time
# NOTE: datetime.now() returns naive local time, although the label says GMT.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)

print("============================================================================================")


# printing and logging variables: reward sums and episode counts since the
# last console print / CSV row
print_running_reward = 0
print_running_episodes = 0

log_running_reward = 0
log_running_episodes = 0

time_step = 0
i_episode = 0

# best single-episode reward so far; starting at 0 means no checkpoint is
# written until an episode scores above zero
Max_reward = 0

# logging file
log_f = open(log_f_name,"w+")

# training loop
# try/finally guarantees the CSV log and the environment are closed even when
# training is interrupted (e.g. a notebook kernel interrupt) or raises.
try:
    log_f.write('episode,timestep,reward\n')

    # NOTE: the stop condition is an episode count; max_training_timesteps is
    # not consulted here.
    while i_episode < 100000:

        state = env.reset()
        current_ep_reward = 0
        for t in range(1, max_ep_len+1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, done, _ = env.step(action)
            # (rendering was left disabled here: env.render())

            # saving reward and is_terminals
            # An episode cut off at max_ep_len is followed by env.reset(), so
            # it must be recorded as a boundary too; otherwise the return
            # computation in PPO.update() carries the next episode's rewards
            # back into this one.
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done or t == max_ep_len)

            time_step +=1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # log in logging file
            if time_step % log_freq == 0:

                # log average reward till last episode (max(..., 1) avoids a
                # ZeroDivisionError if no episode finished in this window)
                log_avg_reward = log_running_reward / max(log_running_episodes, 1)
                log_avg_reward = round(log_avg_reward, 4)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:

                # print average reward till last episode (same zero guard)
                print_avg_reward = print_running_reward / max(print_running_episodes, 1)
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
                print_running_reward = 0
                print_running_episodes = 0

            # break; if the episode is over
            if done:
                break

        # save model weights whenever an episode beats the best reward so far
        if current_ep_reward > Max_reward:
            Max_reward = current_ep_reward
            checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, int(Max_reward), run_num_pretrained)
            print("--------------------------------------------------------------------------------------------")
            print("saving model at : " + checkpoint_path)
            ppo_agent.save(checkpoint_path)
            print("model saved")
            print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
            print("--------------------------------------------------------------------------------------------")

        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1
finally:
    log_f.close()
    env.close()




# Wall-clock summary of the run; start_time was captured before training began.
print("============================================================================================")
end_time = datetime.now().replace(microsecond=0)
print(f"Started training at (GMT) :  {start_time}")
print(f"Finished training at (GMT) :  {end_time}")
print(f"Total training time  :  {end_time - start_time}")
print("============================================================================================")







training environment name : pong_game
current logging run number for pong_game :  0
logging at : PPO_logs/pong_game//PPO_pong_game_log_0.csv
--------------------------------------------------------------------------------------------
max training timesteps :  100000
max timesteps per episode :  3600
model saving frequency : 20000 timesteps
log frequency : 7200 timesteps
printing average reward over episodes in last : 14400 timesteps
--------------------------------------------------------------------------------------------
state space dimension :  22
action space dimension :  4
--------------------------------------------------------------------------------------------
Initializing a discrete action space policy
--------------------------------------------------------------------------------------------
PPO update frequency : 14400 timesteps
PPO K epochs :  180
PPO epsilon clip :  0.2
discount factor (gamma) :  0.99
---------------------------------------------------------------------

Episode : 379 		 Timestep : 1368000 		 Average Reward : -169.58
Episode : 383 		 Timestep : 1382400 		 Average Reward : -194.15
Episode : 387 		 Timestep : 1396800 		 Average Reward : -205.69
Episode : 391 		 Timestep : 1411200 		 Average Reward : -184.84
Episode : 395 		 Timestep : 1425600 		 Average Reward : -185.26
Episode : 399 		 Timestep : 1440000 		 Average Reward : -179.88
Episode : 403 		 Timestep : 1454400 		 Average Reward : -237.32
Episode : 407 		 Timestep : 1468800 		 Average Reward : -136.27
Episode : 411 		 Timestep : 1483200 		 Average Reward : -208.75
Episode : 415 		 Timestep : 1497600 		 Average Reward : -115.48
Episode : 419 		 Timestep : 1512000 		 Average Reward : -104.81
Episode : 423 		 Timestep : 1526400 		 Average Reward : -119.02
Episode : 427 		 Timestep : 1540800 		 Average Reward : -186.17
Episode : 431 		 Timestep : 1555200 		 Average Reward : -184.22
Episode : 435 		 Timestep : 1569600 		 Average Reward : -166.89
Episode : 439 		 Timestep : 1584000 		 A

Episode : 803 		 Timestep : 2894400 		 Average Reward : -91.21
Episode : 807 		 Timestep : 2908800 		 Average Reward : -190.46
Episode : 811 		 Timestep : 2923200 		 Average Reward : -171.97
Episode : 815 		 Timestep : 2937600 		 Average Reward : -127.55
Episode : 819 		 Timestep : 2952000 		 Average Reward : -234.22
Episode : 823 		 Timestep : 2966400 		 Average Reward : -92.07
Episode : 827 		 Timestep : 2980800 		 Average Reward : -74.27
Episode : 831 		 Timestep : 2995200 		 Average Reward : -257.17
Episode : 835 		 Timestep : 3009600 		 Average Reward : -159.95
Episode : 839 		 Timestep : 3024000 		 Average Reward : -47.47
Episode : 843 		 Timestep : 3038400 		 Average Reward : -107.66
Episode : 847 		 Timestep : 3052800 		 Average Reward : -91.12
Episode : 851 		 Timestep : 3067200 		 Average Reward : -138.56
Episode : 855 		 Timestep : 3081600 		 Average Reward : -122.53
Episode : 859 		 Timestep : 3096000 		 Average Reward : -56.66
Episode : 863 		 Timestep : 3110400 		 Average

Episode : 1299 		 Timestep : 4680000 		 Average Reward : -33.12
Episode : 1303 		 Timestep : 4694400 		 Average Reward : -142.77
Episode : 1307 		 Timestep : 4708800 		 Average Reward : -65.09
Episode : 1311 		 Timestep : 4723200 		 Average Reward : -141.3
Episode : 1315 		 Timestep : 4737600 		 Average Reward : -71.11
Episode : 1319 		 Timestep : 4752000 		 Average Reward : -176.12
Episode : 1323 		 Timestep : 4766400 		 Average Reward : -70.2
Episode : 1327 		 Timestep : 4780800 		 Average Reward : -134.83
Episode : 1331 		 Timestep : 4795200 		 Average Reward : -53.59
Episode : 1335 		 Timestep : 4809600 		 Average Reward : -86.98
Episode : 1339 		 Timestep : 4824000 		 Average Reward : -119.32
Episode : 1343 		 Timestep : 4838400 		 Average Reward : 38.36
Episode : 1347 		 Timestep : 4852800 		 Average Reward : -27.12
Episode : 1351 		 Timestep : 4867200 		 Average Reward : -110.27
Episode : 1355 		 Timestep : 4881600 		 Average Reward : -96.41
Episode : 1359 		 Timestep : 4896000 

Episode : 1795 		 Timestep : 6465600 		 Average Reward : -39.91
Episode : 1799 		 Timestep : 6480000 		 Average Reward : -154.24
Episode : 1803 		 Timestep : 6494400 		 Average Reward : -67.54
Episode : 1807 		 Timestep : 6508800 		 Average Reward : -115.21
Episode : 1811 		 Timestep : 6523200 		 Average Reward : -108.9
Episode : 1815 		 Timestep : 6537600 		 Average Reward : -76.14
Episode : 1819 		 Timestep : 6552000 		 Average Reward : -75.8
Episode : 1823 		 Timestep : 6566400 		 Average Reward : -58.76
Episode : 1827 		 Timestep : 6580800 		 Average Reward : -43.19
Episode : 1831 		 Timestep : 6595200 		 Average Reward : -94.02
Episode : 1835 		 Timestep : 6609600 		 Average Reward : -149.19
Episode : 1839 		 Timestep : 6624000 		 Average Reward : -128.47
Episode : 1843 		 Timestep : 6638400 		 Average Reward : -150.41
Episode : 1847 		 Timestep : 6652800 		 Average Reward : -160.85
Episode : 1851 		 Timestep : 6667200 		 Average Reward : -131.67
Episode : 1855 		 Timestep : 66816

Episode : 2307 		 Timestep : 8308800 		 Average Reward : -157.44
Episode : 2311 		 Timestep : 8323200 		 Average Reward : -94.32
Episode : 2315 		 Timestep : 8337600 		 Average Reward : -130.04
Episode : 2319 		 Timestep : 8352000 		 Average Reward : -63.8
Episode : 2323 		 Timestep : 8366400 		 Average Reward : -126.65
Episode : 2327 		 Timestep : 8380800 		 Average Reward : -173.43
Episode : 2331 		 Timestep : 8395200 		 Average Reward : -138.26
Episode : 2335 		 Timestep : 8409600 		 Average Reward : -96.9
Episode : 2339 		 Timestep : 8424000 		 Average Reward : -128.17
Episode : 2343 		 Timestep : 8438400 		 Average Reward : -62.87
Episode : 2347 		 Timestep : 8452800 		 Average Reward : -105.61
Episode : 2351 		 Timestep : 8467200 		 Average Reward : -111.44
Episode : 2355 		 Timestep : 8481600 		 Average Reward : -179.0
Episode : 2359 		 Timestep : 8496000 		 Average Reward : -17.61
Episode : 2363 		 Timestep : 8510400 		 Average Reward : -41.99
Episode : 2367 		 Timestep : 85248

Episode : 2799 		 Timestep : 10080000 		 Average Reward : -193.93
Episode : 2803 		 Timestep : 10094400 		 Average Reward : -138.07
Episode : 2807 		 Timestep : 10108800 		 Average Reward : -142.21
Episode : 2811 		 Timestep : 10123200 		 Average Reward : -117.62
Episode : 2815 		 Timestep : 10137600 		 Average Reward : -94.5
Episode : 2819 		 Timestep : 10152000 		 Average Reward : -148.58
Episode : 2823 		 Timestep : 10166400 		 Average Reward : -145.23
Episode : 2827 		 Timestep : 10180800 		 Average Reward : -94.07
Episode : 2831 		 Timestep : 10195200 		 Average Reward : -70.52
Episode : 2835 		 Timestep : 10209600 		 Average Reward : -31.22
Episode : 2839 		 Timestep : 10224000 		 Average Reward : -113.35
Episode : 2843 		 Timestep : 10238400 		 Average Reward : -128.63
Episode : 2847 		 Timestep : 10252800 		 Average Reward : -50.0
Episode : 2851 		 Timestep : 10267200 		 Average Reward : -87.47
Episode : 2855 		 Timestep : 10281600 		 Average Reward : -121.65
Episode : 2859 		 

Episode : 3299 		 Timestep : 11880000 		 Average Reward : -195.86
Episode : 3303 		 Timestep : 11894400 		 Average Reward : -129.23
Episode : 3307 		 Timestep : 11908800 		 Average Reward : -119.52
Episode : 3311 		 Timestep : 11923200 		 Average Reward : -109.95
Episode : 3315 		 Timestep : 11937600 		 Average Reward : -69.09
Episode : 3319 		 Timestep : 11952000 		 Average Reward : -159.49
Episode : 3323 		 Timestep : 11966400 		 Average Reward : -110.06
Episode : 3327 		 Timestep : 11980800 		 Average Reward : -34.16
Episode : 3331 		 Timestep : 11995200 		 Average Reward : -183.13
Episode : 3335 		 Timestep : 12009600 		 Average Reward : -219.22
Episode : 3339 		 Timestep : 12024000 		 Average Reward : -187.74
Episode : 3343 		 Timestep : 12038400 		 Average Reward : -205.86
Episode : 3347 		 Timestep : 12052800 		 Average Reward : -91.85
Episode : 3351 		 Timestep : 12067200 		 Average Reward : -145.63
Episode : 3355 		 Timestep : 12081600 		 Average Reward : -87.31
Episode : 3359

Episode : 3799 		 Timestep : 13680000 		 Average Reward : -34.41
Episode : 3803 		 Timestep : 13694400 		 Average Reward : -136.1
Episode : 3807 		 Timestep : 13708800 		 Average Reward : -116.37
Episode : 3811 		 Timestep : 13723200 		 Average Reward : -134.85
Episode : 3815 		 Timestep : 13737600 		 Average Reward : -90.8
Episode : 3819 		 Timestep : 13752000 		 Average Reward : -122.22
Episode : 3823 		 Timestep : 13766400 		 Average Reward : -126.77
Episode : 3827 		 Timestep : 13780800 		 Average Reward : -126.79
Episode : 3831 		 Timestep : 13795200 		 Average Reward : -57.85
Episode : 3835 		 Timestep : 13809600 		 Average Reward : -171.71
Episode : 3839 		 Timestep : 13824000 		 Average Reward : -136.9
Episode : 3843 		 Timestep : 13838400 		 Average Reward : -155.9
Episode : 3847 		 Timestep : 13852800 		 Average Reward : -39.12
Episode : 3851 		 Timestep : 13867200 		 Average Reward : -55.22
Episode : 3855 		 Timestep : 13881600 		 Average Reward : -186.09
Episode : 3859 		 T

Episode : 4303 		 Timestep : 15494400 		 Average Reward : -82.02
Episode : 4307 		 Timestep : 15508800 		 Average Reward : -74.31
Episode : 4311 		 Timestep : 15523200 		 Average Reward : -11.85
Episode : 4315 		 Timestep : 15537600 		 Average Reward : -72.15
Episode : 4319 		 Timestep : 15552000 		 Average Reward : -148.87
Episode : 4323 		 Timestep : 15566400 		 Average Reward : -124.35
Episode : 4327 		 Timestep : 15580800 		 Average Reward : -66.64
Episode : 4331 		 Timestep : 15595200 		 Average Reward : -65.44
Episode : 4335 		 Timestep : 15609600 		 Average Reward : -125.84
Episode : 4339 		 Timestep : 15624000 		 Average Reward : -69.27
Episode : 4343 		 Timestep : 15638400 		 Average Reward : -94.85
Episode : 4347 		 Timestep : 15652800 		 Average Reward : -104.21
Episode : 4351 		 Timestep : 15667200 		 Average Reward : -173.05
Episode : 4355 		 Timestep : 15681600 		 Average Reward : -56.19
Episode : 4359 		 Timestep : 15696000 		 Average Reward : -95.78
Episode : 4363 		 Ti

Episode : 4811 		 Timestep : 17323200 		 Average Reward : 39.27
Episode : 4815 		 Timestep : 17337600 		 Average Reward : 42.54
Episode : 4819 		 Timestep : 17352000 		 Average Reward : -5.89
Episode : 4823 		 Timestep : 17366400 		 Average Reward : -103.19
Episode : 4827 		 Timestep : 17380800 		 Average Reward : -47.47
Episode : 4831 		 Timestep : 17395200 		 Average Reward : -5.45
Episode : 4835 		 Timestep : 17409600 		 Average Reward : 6.4
Episode : 4839 		 Timestep : 17424000 		 Average Reward : -54.25
Episode : 4843 		 Timestep : 17438400 		 Average Reward : -16.12
Episode : 4847 		 Timestep : 17452800 		 Average Reward : -43.91
Episode : 4851 		 Timestep : 17467200 		 Average Reward : -92.56
Episode : 4855 		 Timestep : 17481600 		 Average Reward : -27.42
Episode : 4859 		 Timestep : 17496000 		 Average Reward : -104.77
Episode : 4863 		 Timestep : 17510400 		 Average Reward : -29.08
Episode : 4867 		 Timestep : 17524800 		 Average Reward : -0.79
Episode : 4871 		 Timestep : 17

Episode : 5319 		 Timestep : 19152000 		 Average Reward : -96.36
Episode : 5323 		 Timestep : 19166400 		 Average Reward : -42.36
Episode : 5327 		 Timestep : 19180800 		 Average Reward : 63.33
Episode : 5331 		 Timestep : 19195200 		 Average Reward : -61.16
Episode : 5335 		 Timestep : 19209600 		 Average Reward : -12.16
Episode : 5339 		 Timestep : 19224000 		 Average Reward : -110.72
Episode : 5343 		 Timestep : 19238400 		 Average Reward : -33.71
Episode : 5347 		 Timestep : 19252800 		 Average Reward : 4.81
Episode : 5351 		 Timestep : 19267200 		 Average Reward : -35.46
Episode : 5355 		 Timestep : 19281600 		 Average Reward : 31.36
Episode : 5359 		 Timestep : 19296000 		 Average Reward : -1.01
Episode : 5363 		 Timestep : 19310400 		 Average Reward : -58.71
Episode : 5367 		 Timestep : 19324800 		 Average Reward : -61.18
Episode : 5371 		 Timestep : 19339200 		 Average Reward : -22.02
Episode : 5375 		 Timestep : 19353600 		 Average Reward : 4.11
Episode : 5379 		 Timestep : 19

Episode : 5827 		 Timestep : 20980800 		 Average Reward : -63.24
Episode : 5831 		 Timestep : 20995200 		 Average Reward : -16.44
Episode : 5835 		 Timestep : 21009600 		 Average Reward : -113.32
Episode : 5839 		 Timestep : 21024000 		 Average Reward : -25.51
Episode : 5843 		 Timestep : 21038400 		 Average Reward : 29.8
Episode : 5847 		 Timestep : 21052800 		 Average Reward : 59.01
Episode : 5851 		 Timestep : 21067200 		 Average Reward : 7.03
Episode : 5855 		 Timestep : 21081600 		 Average Reward : -74.19
Episode : 5859 		 Timestep : 21096000 		 Average Reward : -36.22
Episode : 5863 		 Timestep : 21110400 		 Average Reward : -39.19
Episode : 5867 		 Timestep : 21124800 		 Average Reward : -58.08
Episode : 5871 		 Timestep : 21139200 		 Average Reward : 12.26
Episode : 5875 		 Timestep : 21153600 		 Average Reward : -28.21
Episode : 5879 		 Timestep : 21168000 		 Average Reward : -2.17
Episode : 5883 		 Timestep : 21182400 		 Average Reward : 40.45
Episode : 5887 		 Timestep : 211

Episode : 6319 		 Timestep : 22752000 		 Average Reward : -32.38
Episode : 6323 		 Timestep : 22766400 		 Average Reward : -80.61
Episode : 6327 		 Timestep : 22780800 		 Average Reward : -35.63
Episode : 6331 		 Timestep : 22795200 		 Average Reward : -15.73
Episode : 6335 		 Timestep : 22809600 		 Average Reward : -27.37
Episode : 6339 		 Timestep : 22824000 		 Average Reward : 44.27
Episode : 6343 		 Timestep : 22838400 		 Average Reward : 15.72
Episode : 6347 		 Timestep : 22852800 		 Average Reward : 16.53
Episode : 6351 		 Timestep : 22867200 		 Average Reward : -20.74
Episode : 6355 		 Timestep : 22881600 		 Average Reward : 9.16
Episode : 6359 		 Timestep : 22896000 		 Average Reward : -15.26
Episode : 6363 		 Timestep : 22910400 		 Average Reward : -49.9
Episode : 6367 		 Timestep : 22924800 		 Average Reward : -38.25
Episode : 6371 		 Timestep : 22939200 		 Average Reward : -84.75
Episode : 6375 		 Timestep : 22953600 		 Average Reward : -30.26
Episode : 6379 		 Timestep : 22

Episode : 6831 		 Timestep : 24595200 		 Average Reward : -29.98
Episode : 6835 		 Timestep : 24609600 		 Average Reward : -85.37
Episode : 6839 		 Timestep : 24624000 		 Average Reward : -2.57
Episode : 6843 		 Timestep : 24638400 		 Average Reward : -51.85
Episode : 6847 		 Timestep : 24652800 		 Average Reward : -8.38
Episode : 6851 		 Timestep : 24667200 		 Average Reward : -90.22
Episode : 6855 		 Timestep : 24681600 		 Average Reward : -18.4
Episode : 6859 		 Timestep : 24696000 		 Average Reward : -49.32
Episode : 6863 		 Timestep : 24710400 		 Average Reward : -38.2
Episode : 6867 		 Timestep : 24724800 		 Average Reward : -18.38
Episode : 6871 		 Timestep : 24739200 		 Average Reward : -52.66
Episode : 6875 		 Timestep : 24753600 		 Average Reward : -105.44
Episode : 6879 		 Timestep : 24768000 		 Average Reward : -2.97
Episode : 6883 		 Timestep : 24782400 		 Average Reward : -33.03
Episode : 6887 		 Timestep : 24796800 		 Average Reward : -136.24
Episode : 6891 		 Timestep :

Episode : 7323 		 Timestep : 26366400 		 Average Reward : -41.47
Episode : 7327 		 Timestep : 26380800 		 Average Reward : -184.65
Episode : 7331 		 Timestep : 26395200 		 Average Reward : 7.78
Episode : 7335 		 Timestep : 26409600 		 Average Reward : -38.7
Episode : 7339 		 Timestep : 26424000 		 Average Reward : -117.91
Episode : 7343 		 Timestep : 26438400 		 Average Reward : 11.9
Episode : 7347 		 Timestep : 26452800 		 Average Reward : -32.12
Episode : 7351 		 Timestep : 26467200 		 Average Reward : -55.78
Episode : 7355 		 Timestep : 26481600 		 Average Reward : -74.87
Episode : 7359 		 Timestep : 26496000 		 Average Reward : -35.04
Episode : 7363 		 Timestep : 26510400 		 Average Reward : -64.04
Episode : 7367 		 Timestep : 26524800 		 Average Reward : -25.6
Episode : 7371 		 Timestep : 26539200 		 Average Reward : -48.51
Episode : 7375 		 Timestep : 26553600 		 Average Reward : -49.01
Episode : 7379 		 Timestep : 26568000 		 Average Reward : -68.3
Episode : 7383 		 Timestep : 2

Episode : 7831 		 Timestep : 28195200 		 Average Reward : -111.27
Episode : 7835 		 Timestep : 28209600 		 Average Reward : -44.9
Episode : 7839 		 Timestep : 28224000 		 Average Reward : -13.73
Episode : 7843 		 Timestep : 28238400 		 Average Reward : -10.99
Episode : 7847 		 Timestep : 28252800 		 Average Reward : -85.29
Episode : 7851 		 Timestep : 28267200 		 Average Reward : -37.45
Episode : 7855 		 Timestep : 28281600 		 Average Reward : -48.71
Episode : 7859 		 Timestep : 28296000 		 Average Reward : -72.45
Episode : 7863 		 Timestep : 28310400 		 Average Reward : -39.55
Episode : 7867 		 Timestep : 28324800 		 Average Reward : -96.55
Episode : 7871 		 Timestep : 28339200 		 Average Reward : -120.03
Episode : 7875 		 Timestep : 28353600 		 Average Reward : -53.85
Episode : 7879 		 Timestep : 28368000 		 Average Reward : -106.33
Episode : 7883 		 Timestep : 28382400 		 Average Reward : -106.19
Episode : 7887 		 Timestep : 28396800 		 Average Reward : -30.21
Episode : 7891 		 Time

Episode : 8339 		 Timestep : 30024000 		 Average Reward : -32.83
Episode : 8343 		 Timestep : 30038400 		 Average Reward : -28.0
Episode : 8347 		 Timestep : 30052800 		 Average Reward : 23.09
Episode : 8351 		 Timestep : 30067200 		 Average Reward : -2.64
Episode : 8355 		 Timestep : 30081600 		 Average Reward : 11.98
Episode : 8359 		 Timestep : 30096000 		 Average Reward : 8.49
Episode : 8363 		 Timestep : 30110400 		 Average Reward : 12.08
Episode : 8367 		 Timestep : 30124800 		 Average Reward : -91.05
Episode : 8371 		 Timestep : 30139200 		 Average Reward : -28.36
Episode : 8375 		 Timestep : 30153600 		 Average Reward : -6.57
Episode : 8379 		 Timestep : 30168000 		 Average Reward : -90.39
Episode : 8383 		 Timestep : 30182400 		 Average Reward : 11.65
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/pong_game/PPO_pong_game_323_0.pth
model saved
Elapsed Time  :  11:39:23
-------------------------------

Episode : 8831 		 Timestep : 31795200 		 Average Reward : -99.71
Episode : 8835 		 Timestep : 31809600 		 Average Reward : 25.66
Episode : 8839 		 Timestep : 31824000 		 Average Reward : -38.11
Episode : 8843 		 Timestep : 31838400 		 Average Reward : 46.12
Episode : 8847 		 Timestep : 31852800 		 Average Reward : -95.51
Episode : 8851 		 Timestep : 31867200 		 Average Reward : -48.92
Episode : 8855 		 Timestep : 31881600 		 Average Reward : -102.9
Episode : 8859 		 Timestep : 31896000 		 Average Reward : -10.45
Episode : 8863 		 Timestep : 31910400 		 Average Reward : -52.76
Episode : 8867 		 Timestep : 31924800 		 Average Reward : 73.16
Episode : 8871 		 Timestep : 31939200 		 Average Reward : 56.17
Episode : 8875 		 Timestep : 31953600 		 Average Reward : -41.51
Episode : 8879 		 Timestep : 31968000 		 Average Reward : -56.31
Episode : 8883 		 Timestep : 31982400 		 Average Reward : 46.52
Episode : 8887 		 Timestep : 31996800 		 Average Reward : -100.15
Episode : 8891 		 Timestep : 

Episode : 9343 		 Timestep : 33638400 		 Average Reward : 14.86
Episode : 9347 		 Timestep : 33652800 		 Average Reward : 35.25
Episode : 9351 		 Timestep : 33667200 		 Average Reward : -64.91
Episode : 9355 		 Timestep : 33681600 		 Average Reward : -0.87
Episode : 9359 		 Timestep : 33696000 		 Average Reward : -18.1
Episode : 9363 		 Timestep : 33710400 		 Average Reward : 14.6
Episode : 9367 		 Timestep : 33724800 		 Average Reward : -45.05
Episode : 9371 		 Timestep : 33739200 		 Average Reward : 38.27
Episode : 9375 		 Timestep : 33753600 		 Average Reward : 32.51
Episode : 9379 		 Timestep : 33768000 		 Average Reward : 12.0
Episode : 9383 		 Timestep : 33782400 		 Average Reward : -40.22
Episode : 9387 		 Timestep : 33796800 		 Average Reward : 16.98
Episode : 9391 		 Timestep : 33811200 		 Average Reward : -34.24
Episode : 9395 		 Timestep : 33825600 		 Average Reward : -27.24
Episode : 9399 		 Timestep : 33840000 		 Average Reward : -93.84
Episode : 9403 		 Timestep : 3385440

Episode : 9855 		 Timestep : 35481600 		 Average Reward : 12.86
Episode : 9859 		 Timestep : 35496000 		 Average Reward : 9.86
Episode : 9863 		 Timestep : 35510400 		 Average Reward : -76.58
Episode : 9867 		 Timestep : 35524800 		 Average Reward : -17.51
Episode : 9871 		 Timestep : 35539200 		 Average Reward : -16.62
Episode : 9875 		 Timestep : 35553600 		 Average Reward : -16.05
Episode : 9879 		 Timestep : 35568000 		 Average Reward : 45.91
Episode : 9883 		 Timestep : 35582400 		 Average Reward : -62.11
Episode : 9887 		 Timestep : 35596800 		 Average Reward : 31.3
Episode : 9891 		 Timestep : 35611200 		 Average Reward : -26.4
Episode : 9895 		 Timestep : 35625600 		 Average Reward : -10.15
Episode : 9899 		 Timestep : 35640000 		 Average Reward : -20.81
Episode : 9903 		 Timestep : 35654400 		 Average Reward : 22.05
Episode : 9907 		 Timestep : 35668800 		 Average Reward : -31.26
Episode : 9911 		 Timestep : 35683200 		 Average Reward : 39.27
Episode : 9915 		 Timestep : 35697

Episode : 10359 		 Timestep : 37296000 		 Average Reward : 8.26
Episode : 10363 		 Timestep : 37310400 		 Average Reward : 47.9
Episode : 10367 		 Timestep : 37324800 		 Average Reward : -6.79
Episode : 10371 		 Timestep : 37339200 		 Average Reward : -8.59
Episode : 10375 		 Timestep : 37353600 		 Average Reward : -14.92
Episode : 10379 		 Timestep : 37368000 		 Average Reward : -57.34
Episode : 10383 		 Timestep : 37382400 		 Average Reward : 0.83
Episode : 10387 		 Timestep : 37396800 		 Average Reward : -22.81
Episode : 10391 		 Timestep : 37411200 		 Average Reward : 39.17
Episode : 10395 		 Timestep : 37425600 		 Average Reward : -5.2
Episode : 10399 		 Timestep : 37440000 		 Average Reward : -55.88
Episode : 10403 		 Timestep : 37454400 		 Average Reward : 30.88
Episode : 10407 		 Timestep : 37468800 		 Average Reward : -92.5
Episode : 10411 		 Timestep : 37483200 		 Average Reward : 27.01
Episode : 10415 		 Timestep : 37497600 		 Average Reward : -24.74
Episode : 10419 		 Times

Episode : 10863 		 Timestep : 39110400 		 Average Reward : -23.68
Episode : 10867 		 Timestep : 39124800 		 Average Reward : -24.72
Episode : 10871 		 Timestep : 39139200 		 Average Reward : 36.83
Episode : 10875 		 Timestep : 39153600 		 Average Reward : -9.21
Episode : 10879 		 Timestep : 39168000 		 Average Reward : -39.48
Episode : 10883 		 Timestep : 39182400 		 Average Reward : -11.26
Episode : 10887 		 Timestep : 39196800 		 Average Reward : -42.42
Episode : 10891 		 Timestep : 39211200 		 Average Reward : 6.55
Episode : 10895 		 Timestep : 39225600 		 Average Reward : -34.89
Episode : 10899 		 Timestep : 39240000 		 Average Reward : -4.46
Episode : 10903 		 Timestep : 39254400 		 Average Reward : 27.29
Episode : 10907 		 Timestep : 39268800 		 Average Reward : -2.66
Episode : 10911 		 Timestep : 39283200 		 Average Reward : 18.24
Episode : 10915 		 Timestep : 39297600 		 Average Reward : -32.44
Episode : 10919 		 Timestep : 39312000 		 Average Reward : -3.37
Episode : 10923 		 

Episode : 11367 		 Timestep : 40924800 		 Average Reward : 13.32
Episode : 11371 		 Timestep : 40939200 		 Average Reward : 39.81
Episode : 11375 		 Timestep : 40953600 		 Average Reward : -2.31
Episode : 11379 		 Timestep : 40968000 		 Average Reward : -44.5
Episode : 11383 		 Timestep : 40982400 		 Average Reward : 86.4
Episode : 11387 		 Timestep : 40996800 		 Average Reward : -24.0
Episode : 11391 		 Timestep : 41011200 		 Average Reward : 0.58
Episode : 11395 		 Timestep : 41025600 		 Average Reward : -32.92
Episode : 11399 		 Timestep : 41040000 		 Average Reward : -29.54
Episode : 11403 		 Timestep : 41054400 		 Average Reward : 47.63
Episode : 11407 		 Timestep : 41068800 		 Average Reward : -1.24
Episode : 11411 		 Timestep : 41083200 		 Average Reward : -41.23
Episode : 11415 		 Timestep : 41097600 		 Average Reward : 4.81
Episode : 11419 		 Timestep : 41112000 		 Average Reward : -2.07
Episode : 11423 		 Timestep : 41126400 		 Average Reward : 8.48
Episode : 11427 		 Timeste

Episode : 11871 		 Timestep : 42739200 		 Average Reward : 54.11
Episode : 11875 		 Timestep : 42753600 		 Average Reward : 0.22
Episode : 11879 		 Timestep : 42768000 		 Average Reward : 93.82
Episode : 11883 		 Timestep : 42782400 		 Average Reward : -35.79
Episode : 11887 		 Timestep : 42796800 		 Average Reward : -16.54
Episode : 11891 		 Timestep : 42811200 		 Average Reward : -2.3
Episode : 11895 		 Timestep : 42825600 		 Average Reward : 21.56
Episode : 11899 		 Timestep : 42840000 		 Average Reward : 55.44
Episode : 11903 		 Timestep : 42854400 		 Average Reward : 30.86
Episode : 11907 		 Timestep : 42868800 		 Average Reward : 30.14
Episode : 11911 		 Timestep : 42883200 		 Average Reward : 0.52
Episode : 11915 		 Timestep : 42897600 		 Average Reward : 64.3
Episode : 11919 		 Timestep : 42912000 		 Average Reward : 82.48
Episode : 11923 		 Timestep : 42926400 		 Average Reward : 92.86
Episode : 11927 		 Timestep : 42940800 		 Average Reward : 57.78
Episode : 11931 		 Timestep

Episode : 12375 		 Timestep : 44553600 		 Average Reward : -26.05
Episode : 12379 		 Timestep : 44568000 		 Average Reward : 73.31
Episode : 12383 		 Timestep : 44582400 		 Average Reward : 41.7
Episode : 12387 		 Timestep : 44596800 		 Average Reward : 4.18
Episode : 12391 		 Timestep : 44611200 		 Average Reward : -29.09
Episode : 12395 		 Timestep : 44625600 		 Average Reward : -13.41
Episode : 12399 		 Timestep : 44640000 		 Average Reward : 36.68
Episode : 12403 		 Timestep : 44654400 		 Average Reward : 78.4
Episode : 12407 		 Timestep : 44668800 		 Average Reward : 29.86
Episode : 12411 		 Timestep : 44683200 		 Average Reward : -44.14
Episode : 12415 		 Timestep : 44697600 		 Average Reward : -11.07
Episode : 12419 		 Timestep : 44712000 		 Average Reward : 0.78
Episode : 12423 		 Timestep : 44726400 		 Average Reward : 14.84
Episode : 12427 		 Timestep : 44740800 		 Average Reward : -55.48
Episode : 12431 		 Timestep : 44755200 		 Average Reward : 69.38
Episode : 12435 		 Time

Episode : 12879 		 Timestep : 46368000 		 Average Reward : -50.38
Episode : 12883 		 Timestep : 46382400 		 Average Reward : 80.11
Episode : 12887 		 Timestep : 46396800 		 Average Reward : -9.06
Episode : 12891 		 Timestep : 46411200 		 Average Reward : -44.0
Episode : 12895 		 Timestep : 46425600 		 Average Reward : 14.23
Episode : 12899 		 Timestep : 46440000 		 Average Reward : -43.33
Episode : 12903 		 Timestep : 46454400 		 Average Reward : 29.02
Episode : 12907 		 Timestep : 46468800 		 Average Reward : 19.97
Episode : 12911 		 Timestep : 46483200 		 Average Reward : 10.14
Episode : 12915 		 Timestep : 46497600 		 Average Reward : -11.88
Episode : 12919 		 Timestep : 46512000 		 Average Reward : 56.27
Episode : 12923 		 Timestep : 46526400 		 Average Reward : 29.44
Episode : 12927 		 Timestep : 46540800 		 Average Reward : 41.46
Episode : 12931 		 Timestep : 46555200 		 Average Reward : 39.83
Episode : 12935 		 Timestep : 46569600 		 Average Reward : -8.16
Episode : 12939 		 Tim

Episode : 13383 		 Timestep : 48182400 		 Average Reward : 76.1
Episode : 13387 		 Timestep : 48196800 		 Average Reward : 20.3
Episode : 13391 		 Timestep : 48211200 		 Average Reward : 49.84
Episode : 13395 		 Timestep : 48225600 		 Average Reward : 8.47
Episode : 13399 		 Timestep : 48240000 		 Average Reward : 6.36
Episode : 13403 		 Timestep : 48254400 		 Average Reward : 62.75
Episode : 13407 		 Timestep : 48268800 		 Average Reward : -33.14
Episode : 13411 		 Timestep : 48283200 		 Average Reward : 46.32
Episode : 13415 		 Timestep : 48297600 		 Average Reward : 43.61
Episode : 13419 		 Timestep : 48312000 		 Average Reward : -13.07
Episode : 13423 		 Timestep : 48326400 		 Average Reward : 13.65
Episode : 13427 		 Timestep : 48340800 		 Average Reward : 2.06
Episode : 13431 		 Timestep : 48355200 		 Average Reward : 8.04
Episode : 13435 		 Timestep : 48369600 		 Average Reward : 25.24
Episode : 13439 		 Timestep : 48384000 		 Average Reward : 24.15
Episode : 13443 		 Timestep :

Episode : 13891 		 Timestep : 50011200 		 Average Reward : 50.41
Episode : 13895 		 Timestep : 50025600 		 Average Reward : 98.47
Episode : 13899 		 Timestep : 50040000 		 Average Reward : 48.22
Episode : 13903 		 Timestep : 50054400 		 Average Reward : 45.23
Episode : 13907 		 Timestep : 50068800 		 Average Reward : 22.09
Episode : 13911 		 Timestep : 50083200 		 Average Reward : 16.18
Episode : 13915 		 Timestep : 50097600 		 Average Reward : 9.5
Episode : 13919 		 Timestep : 50112000 		 Average Reward : 58.7
Episode : 13923 		 Timestep : 50126400 		 Average Reward : 130.32
Episode : 13927 		 Timestep : 50140800 		 Average Reward : 18.37
Episode : 13931 		 Timestep : 50155200 		 Average Reward : 18.85
Episode : 13935 		 Timestep : 50169600 		 Average Reward : 30.47
Episode : 13939 		 Timestep : 50184000 		 Average Reward : 14.86
Episode : 13943 		 Timestep : 50198400 		 Average Reward : -24.2
Episode : 13947 		 Timestep : 50212800 		 Average Reward : 52.26
Episode : 13951 		 Timestep

Episode : 14399 		 Timestep : 51840000 		 Average Reward : 7.62
Episode : 14403 		 Timestep : 51854400 		 Average Reward : 33.84
Episode : 14407 		 Timestep : 51868800 		 Average Reward : -12.74
Episode : 14411 		 Timestep : 51883200 		 Average Reward : -8.61
Episode : 14415 		 Timestep : 51897600 		 Average Reward : 0.58
Episode : 14419 		 Timestep : 51912000 		 Average Reward : -27.04
Episode : 14423 		 Timestep : 51926400 		 Average Reward : 87.02
Episode : 14427 		 Timestep : 51940800 		 Average Reward : 26.14
Episode : 14431 		 Timestep : 51955200 		 Average Reward : 12.13
Episode : 14435 		 Timestep : 51969600 		 Average Reward : 54.48
Episode : 14439 		 Timestep : 51984000 		 Average Reward : 7.46
Episode : 14443 		 Timestep : 51998400 		 Average Reward : -6.35
Episode : 14447 		 Timestep : 52012800 		 Average Reward : -29.17
Episode : 14451 		 Timestep : 52027200 		 Average Reward : -16.97
Episode : 14455 		 Timestep : 52041600 		 Average Reward : 9.05
Episode : 14459 		 Timest

Episode : 14907 		 Timestep : 53668800 		 Average Reward : 38.2
Episode : 14911 		 Timestep : 53683200 		 Average Reward : 84.97
Episode : 14915 		 Timestep : 53697600 		 Average Reward : -76.63
Episode : 14919 		 Timestep : 53712000 		 Average Reward : 19.12
Episode : 14923 		 Timestep : 53726400 		 Average Reward : 46.25
Episode : 14927 		 Timestep : 53740800 		 Average Reward : 8.52
Episode : 14931 		 Timestep : 53755200 		 Average Reward : 118.78
Episode : 14935 		 Timestep : 53769600 		 Average Reward : 7.56
Episode : 14939 		 Timestep : 53784000 		 Average Reward : 14.34
Episode : 14943 		 Timestep : 53798400 		 Average Reward : -59.63
Episode : 14947 		 Timestep : 53812800 		 Average Reward : 40.71
Episode : 14951 		 Timestep : 53827200 		 Average Reward : -32.84
Episode : 14955 		 Timestep : 53841600 		 Average Reward : -8.0
Episode : 14959 		 Timestep : 53856000 		 Average Reward : 25.57
Episode : 14963 		 Timestep : 53870400 		 Average Reward : 65.07
Episode : 14967 		 Timest

In [None]:
# Write one more checkpoint whose file name carries the literal tag "final"
# between env_name and run_num_pretrained.
# NOTE(review): `directory` is joined by plain string concatenation, so it is
# assumed to already end with a path separator -- confirm where it is defined.
final_ckpt_name = "PPO_{}_{}_{}.pth".format(env_name, "final", run_num_pretrained)
checkpoint_path = directory + final_ckpt_name

print("--------------------------------------------------------------------------------------------")
print("saving model at : " + checkpoint_path)
ppo_agent.save(checkpoint_path)
print("model saved")

# Time elapsed since start_time; the current time is truncated to whole
# seconds before subtracting.
elapsed_since_start = datetime.now().replace(microsecond=0) - start_time
print("Elapsed Time  : ", elapsed_since_start)
print("--------------------------------------------------------------------------------------------")