################################################################################
> # **Part - I**

*   define actor critic networks
*   define PPO algorithm

################################################################################

In [5]:


############################### Import libraries ###############################


import os
import glob
import time
from datetime import datetime

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

import numpy as np

import gym
import pongGym

################################## set device ##################################

print("============================================================================================")


# Pick the compute device: the first CUDA GPU when one is available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()  # release unoccupied cached GPU memory
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

print("============================================================================================")




################################## PPO Policy ##################################


class RolloutBuffer:
    """Plain container for one rollout's worth of transitions.

    Holds five parallel lists (actions, states, log-probabilities, rewards
    and terminal flags) that callers append to directly.
    """

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for storage in (
            self.actions,
            self.states,
            self.logprobs,
            self.rewards,
            self.is_terminals,
        ):
            storage.clear()


class ActorCritic(nn.Module):
    """Separate actor and critic MLPs for a discrete action space.

    The actor maps a state to a softmax probability vector over
    ``action_dim`` actions and is bracketed by quantization stubs; the critic
    maps a state to a single scalar.  Each network has four 64-unit ReLU
    hidden layers.

    Args:
        state_dim: size of the state vector fed to both networks.
        action_dim: number of discrete actions (actor output size).
        action_std_init: accepted for signature compatibility; not used by
            this class.
    """

    def __init__(self, state_dim, action_dim, action_std_init):
        super().__init__()

        width = 64
        n_hidden = 4

        # Actor: QuantStub -> 4 x (Linear, ReLU) -> Linear -> DeQuantStub -> Softmax.
        # Module order (and therefore state-dict indices) must stay fixed so
        # saved checkpoints keep loading.
        actor_layers = [torch.quantization.QuantStub()]
        fan_in = state_dim
        for _ in range(n_hidden):
            actor_layers.extend((nn.Linear(fan_in, width), nn.ReLU()))
            fan_in = width
        actor_layers.extend((
            nn.Linear(width, action_dim),
            torch.quantization.DeQuantStub(),
            nn.Softmax(dim=-1),
        ))
        self.actor = nn.Sequential(*actor_layers)

        # Critic: 4 x (Linear, ReLU) -> Linear producing one value per state.
        critic_layers = []
        fan_in = state_dim
        for _ in range(n_hidden):
            critic_layers.extend((nn.Linear(fan_in, width), nn.ReLU()))
            fan_in = width
        critic_layers.append(nn.Linear(width, 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self):
        """Not supported; use :meth:`act` or :meth:`evaluate` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for ``state`` from the actor's categorical output.

        Returns:
            ``(action, log_prob)``, both detached from the autograd graph.
        """
        policy = Categorical(self.actor(state))
        sampled = policy.sample()
        return sampled.detach(), policy.log_prob(sampled).detach()

    def evaluate(self, state, action):
        """Score given actions under the current actor and critic.

        Returns:
            ``(action_logprobs, state_values, dist_entropy)``: the log-prob of
            ``action`` under the actor's distribution, the critic output for
            ``state``, and the entropy of that distribution.
        """
        policy = Categorical(self.actor(state))
        return policy.log_prob(action), self.critic(state), policy.entropy()


class PPO:
    """Clipped-surrogate PPO agent built on :class:`ActorCritic`.

    Two copies of the network are kept: ``policy`` is optimized in
    :meth:`update`, while ``policy_old`` is the frozen copy used to collect
    transitions and is re-synchronized after every update.

    Args:
        state_dim: size of the state vector.
        action_dim: number of discrete actions.
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: discount factor for the return computation.
        K_epochs: number of optimization passes over the buffer per update.
        eps_clip: clipping range for the probability ratio.
        action_std_init: forwarded to ``ActorCritic``.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.6):

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        # Separate parameter groups so actor and critic can use different learning rates.
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from ``policy_old`` and record it in the buffer.

        The state tensor, the sampled action and its log-probability are
        appended to the buffer; the caller is expected to append the matching
        reward and terminal flag.

        Returns:
            The sampled action as a Python number (``action.item()``).
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action.item()

    @staticmethod
    def _discounted_returns(rewards, is_terminals, gamma):
        """Return the discounted reward-to-go for every step, in time order.

        The running sum is reset wherever ``is_terminals`` is true, so returns
        do not leak across flagged episode boundaries.  Built back-to-front
        with ``append`` and a single ``reverse`` (linear time) instead of
        repeated ``insert(0, ...)``.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (gamma * running)
            returns.append(running)
        returns.reverse()
        return returns

    def update(self):
        """Run ``K_epochs`` PPO optimization passes over the buffered rollout.

        Afterwards ``policy_old`` is synchronized with ``policy`` and the
        buffer is cleared.
        """
        # Monte Carlo estimate of returns
        returns = self._discounted_returns(self.buffer.rewards, self.buffer.is_terminals, self.gamma)

        # Normalizing the rewards
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss; the advantage is detached so the policy
            # term does not back-propagate into the critic.
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO:
            # policy term + 0.5 * value MSE - 0.01 * entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write ``policy_old``'s state dict to ``checkpoint_path``."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load a state dict from ``checkpoint_path`` into both networks.

        The file is deserialized once; ``load_state_dict`` copies the values
        into each module's own parameters, so sharing the loaded dict between
        the two networks is safe.
        """
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)
        
        
       



Device set to : NVIDIA GeForce RTX 3070


################################################################################
> # **Part - II**

*   train PPO algorithm on environments
*   save preTrained networks weights and log files

################################################################################

In [None]:

print("============================================================================================")


################################### Training ###################################


####### initialize environment hyperparameters ######

# Used below to name the log and checkpoint sub-directories and files.
env_name = "pong_game"

max_ep_len = 60*120                    # max timesteps in one episode
# Intended training budget in timesteps.
# NOTE(review): this value is only printed; the training loop in this file
# stops on an episode count, not on this timestep limit.
max_training_timesteps = int(1e5)

print_freq = max_ep_len * 3     # print avg reward in the interval (in num timesteps)
log_freq = max_ep_len * 2       # log avg reward in the interval (in num timesteps)
# NOTE(review): only printed; checkpoints in this file are written on an
# episode-count schedule rather than at this timestep frequency.
save_model_freq = int(2e4)      # save model frequency (in num timesteps)

# Passed to PPO as action_std_init; the discrete ActorCritic in this file does not use it.
action_std = None


#####################################################


## Note : print/log frequencies should be greater than max_ep_len


################ PPO hyperparameters ################


update_timestep = max_ep_len * 10      # update policy every n timesteps
K_epochs = 60*10               # update policy for K epochs
# K_epochs = 4
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr_actor = 0.0001       # learning rate for actor network
lr_critic = 0.001       # learning rate for critic network

random_seed = 910426         # set random seed if required (0 = no random seed)

#####################################################



print("training environment name : " + env_name)

# Project-local environment from the pongGym module.
env = pongGym.DoublePong()

# state space dimension (length of the observation vector)
state_dim = env.observation_space.shape[0]

# action space dimension (number of discrete actions)

action_dim = env.action_space.n



###################### logging ######################

#### log files for multiple runs are NOT overwritten

# Per-environment log directory.  makedirs creates the parent as well and, with
# exist_ok=True, avoids the check-then-create race of exists() + makedirs().
log_dir = "PPO_logs" + '/' + env_name + '/'
os.makedirs(log_dir, exist_ok=True)


#### get number of log files in log directory
current_num_files = next(os.walk(log_dir))[2]
run_num = len(current_num_files)


#### create new log file for each run
# The file count alone can point at an existing log (e.g. after an older log
# was deleted), and the log is later opened in a truncating mode, so advance
# until the name is unused.  log_dir already ends with '/', so none is added.
log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"
while os.path.exists(log_f_name):
    run_num += 1
    log_f_name = log_dir + 'PPO_' + env_name + "_log_" + str(run_num) + ".csv"

print("current logging run number for " + env_name + " : ", run_num)
print("logging at : " + log_f_name)

#####################################################


################### checkpointing ###################

run_num_pretrained = 1      #### change this to prevent overwriting weights in same env_name folder

# Per-environment checkpoint directory.  makedirs creates the parent as well
# and, with exist_ok=True, avoids the check-then-create race of
# exists() + makedirs().
directory = "PPO_preTrained" + '/' + env_name + '/'
os.makedirs(directory, exist_ok=True)

#####################################################


############# print all hyperparameters #############

# Echo the run configuration so it is captured alongside the training output.
print("--------------------------------------------------------------------------------------------")

print("max training timesteps : ", max_training_timesteps)
print("max timesteps per episode : ", max_ep_len)

print(f"model saving frequency : {save_model_freq} timesteps")
print(f"log frequency : {log_freq} timesteps")
print(f"printing average reward over episodes in last : {print_freq} timesteps")

print("--------------------------------------------------------------------------------------------")

print("state space dimension : ", state_dim)
print("action space dimension : ", action_dim)

print("--------------------------------------------------------------------------------------------")

print("Initializing a discrete action space policy")

print("--------------------------------------------------------------------------------------------")

print(f"PPO update frequency : {update_timestep} timesteps")
print("PPO K epochs : ", K_epochs)
print("PPO epsilon clip : ", eps_clip)
print("discount factor (gamma) : ", gamma)

print("--------------------------------------------------------------------------------------------")

print("optimizer learning rate actor : ", lr_actor)
print("optimizer learning rate critic : ", lr_critic)

# A falsy seed (0) leaves the torch and numpy generators unseeded.
if random_seed:
    print("--------------------------------------------------------------------------------------------")
    print("setting random seed to ", random_seed)
    torch.manual_seed(random_seed)
    # env.seed(random_seed)
    np.random.seed(random_seed)

#####################################################

print("============================================================================================")

################# training procedure ################

# initialize a PPO agent
ppo_agent = PPO(state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std)


# track total training time
# NOTE: datetime.now() returns naive local time, so the "(GMT)" label in the
# messages below is only accurate when the system clock is set to UTC.
start_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)

print("============================================================================================")


# logging file
log_f = open(log_f_name,"w+")

# try/finally guarantees the log file and the environment are closed even when
# training is interrupted (e.g. stopping the notebook cell) or raises.
try:
    log_f.write('episode,timestep,reward\n')

    # printing and logging variables
    print_running_reward = 0
    print_running_episodes = 0

    log_running_reward = 0
    log_running_episodes = 0

    time_step = 0
    i_episode = 0

    Max_reward = 0
    Max_episode = 0      # episode index at which the last checkpoint was written

    # training loop (bounded by episode count)
    # env = pongGym.DoublePong()
    while i_episode < 100000:

        state = env.reset()
        current_ep_reward = 0
        for t in range(1, max_ep_len+1):

            # select action with policy
            action = ppo_agent.select_action(state)
            state, reward, done, _ = env.step(action)
            # if i_episode > 100:
            #     env.render()

            # saving reward and is_terminals
            # NOTE(review): an episode cut off by max_ep_len is stored with
            # is_terminal=False unless the env itself reports done, in which
            # case PPO.update's return accumulation runs across the episode
            # boundary; confirm whether DoublePong sets done at the time limit.
            ppo_agent.buffer.rewards.append(reward)
            ppo_agent.buffer.is_terminals.append(done)

            time_step +=1
            current_ep_reward += reward

            # update PPO agent
            if time_step % update_timestep == 0:
                ppo_agent.update()

            # log in logging file
            if time_step % log_freq == 0:

                # log average reward till last episode
                log_avg_reward = log_running_reward / log_running_episodes
                log_avg_reward = round(log_avg_reward, 4)

                log_f.write('{},{},{}\n'.format(i_episode, time_step, log_avg_reward))
                log_f.flush()

                log_running_reward = 0
                log_running_episodes = 0

            # printing average reward
            if time_step % print_freq == 0:

                # print average reward till last episode
                print_avg_reward = print_running_reward / print_running_episodes
                print_avg_reward = round(print_avg_reward, 2)

                print("Episode : {} \t\t Timestep : {} \t\t Average Reward : {}".format(i_episode, time_step, print_avg_reward))
                print_running_reward = 0
                print_running_episodes = 0
            # break; if the episode is over
            if done:
                break

        # save model weights once every 1000 episodes
        # print("Episode : {} \t\t Totol reward : {}".format(i_episode, current_ep_reward))
        if i_episode >= Max_episode + 1000:
            Max_episode = i_episode
            checkpoint_path = directory + "PPO_{}_{}_{}.pth".format(env_name, int(Max_episode), run_num_pretrained)
            print("--------------------------------------------------------------------------------------------")
            print("saving model at : " + checkpoint_path)
            ppo_agent.save(checkpoint_path)
            print("model saved")
            print("Elapsed Time  : ", datetime.now().replace(microsecond=0) - start_time)
            print("--------------------------------------------------------------------------------------------")
        print_running_reward += current_ep_reward
        print_running_episodes += 1

        log_running_reward += current_ep_reward
        log_running_episodes += 1

        i_episode += 1
finally:
    log_f.close()
    env.close()




# print total training time
print("============================================================================================")
end_time = datetime.now().replace(microsecond=0)
print("Started training at (GMT) : ", start_time)
print("Finished training at (GMT) : ", end_time)
print("Total training time  : ", end_time - start_time)
print("============================================================================================")







training environment name : pong_game
current logging run number for pong_game :  1
logging at : PPO_logs/pong_game//PPO_pong_game_log_1.csv
--------------------------------------------------------------------------------------------
max training timesteps :  100000
max timesteps per episode :  7200
model saving frequency : 20000 timesteps
log frequency : 14400 timesteps
printing average reward over episodes in last : 21600 timesteps
--------------------------------------------------------------------------------------------
state space dimension :  24
action space dimension :  4
--------------------------------------------------------------------------------------------
Initializing a discrete action space policy
--------------------------------------------------------------------------------------------
PPO update frequency : 72000 timesteps
PPO K epochs :  600
PPO epsilon clip :  0.2
discount factor (gamma) :  0.99
--------------------------------------------------------------------

Episode : 326 		 Timestep : 2354400 		 Average Reward : 75.76
Episode : 329 		 Timestep : 2376000 		 Average Reward : 141.45
Episode : 332 		 Timestep : 2397600 		 Average Reward : 129.46
Episode : 335 		 Timestep : 2419200 		 Average Reward : 40.73
Episode : 338 		 Timestep : 2440800 		 Average Reward : 148.77
Episode : 341 		 Timestep : 2462400 		 Average Reward : 133.83
Episode : 344 		 Timestep : 2484000 		 Average Reward : 93.96
Episode : 347 		 Timestep : 2505600 		 Average Reward : 51.13
Episode : 350 		 Timestep : 2527200 		 Average Reward : 63.3
Episode : 353 		 Timestep : 2548800 		 Average Reward : 58.4
Episode : 356 		 Timestep : 2570400 		 Average Reward : 75.6
Episode : 359 		 Timestep : 2592000 		 Average Reward : 81.21
Episode : 362 		 Timestep : 2613600 		 Average Reward : 57.45
Episode : 365 		 Timestep : 2635200 		 Average Reward : 82.37
Episode : 368 		 Timestep : 2656800 		 Average Reward : 139.51
Episode : 371 		 Timestep : 2678400 		 Average Reward : 49.99
Episod

Episode : 722 		 Timestep : 5205600 		 Average Reward : 73.02
Episode : 725 		 Timestep : 5227200 		 Average Reward : -56.69
Episode : 728 		 Timestep : 5248800 		 Average Reward : 128.18
Episode : 731 		 Timestep : 5270400 		 Average Reward : 237.11
Episode : 734 		 Timestep : 5292000 		 Average Reward : 123.0
Episode : 737 		 Timestep : 5313600 		 Average Reward : 296.93
Episode : 740 		 Timestep : 5335200 		 Average Reward : 225.63
Episode : 743 		 Timestep : 5356800 		 Average Reward : 111.48
Episode : 746 		 Timestep : 5378400 		 Average Reward : 279.94
Episode : 749 		 Timestep : 5400000 		 Average Reward : 134.12
Episode : 752 		 Timestep : 5421600 		 Average Reward : 264.96
Episode : 755 		 Timestep : 5443200 		 Average Reward : 156.47
Episode : 758 		 Timestep : 5464800 		 Average Reward : 226.11
Episode : 761 		 Timestep : 5486400 		 Average Reward : 72.31
Episode : 764 		 Timestep : 5508000 		 Average Reward : 326.16
Episode : 767 		 Timestep : 5529600 		 Average Reward : 13

Episode : 1100 		 Timestep : 7927200 		 Average Reward : 295.67
Episode : 1103 		 Timestep : 7948800 		 Average Reward : 157.19
Episode : 1106 		 Timestep : 7970400 		 Average Reward : 292.64
Episode : 1109 		 Timestep : 7992000 		 Average Reward : 254.5
Episode : 1112 		 Timestep : 8013600 		 Average Reward : 141.27
Episode : 1115 		 Timestep : 8035200 		 Average Reward : 240.94
Episode : 1118 		 Timestep : 8056800 		 Average Reward : 161.49
Episode : 1121 		 Timestep : 8078400 		 Average Reward : 194.73
Episode : 1124 		 Timestep : 8100000 		 Average Reward : 169.62
Episode : 1127 		 Timestep : 8121600 		 Average Reward : 295.61
Episode : 1130 		 Timestep : 8143200 		 Average Reward : 199.95
Episode : 1133 		 Timestep : 8164800 		 Average Reward : 217.35
Episode : 1136 		 Timestep : 8186400 		 Average Reward : 164.03
Episode : 1139 		 Timestep : 8208000 		 Average Reward : 260.24
Episode : 1142 		 Timestep : 8229600 		 Average Reward : 329.21
Episode : 1145 		 Timestep : 8251200 		 A

Episode : 1484 		 Timestep : 10692000 		 Average Reward : 211.41
Episode : 1487 		 Timestep : 10713600 		 Average Reward : 325.74
Episode : 1490 		 Timestep : 10735200 		 Average Reward : 424.56
Episode : 1493 		 Timestep : 10756800 		 Average Reward : 358.94
Episode : 1496 		 Timestep : 10778400 		 Average Reward : 325.05
Episode : 1499 		 Timestep : 10800000 		 Average Reward : 282.88
Episode : 1502 		 Timestep : 10821600 		 Average Reward : 267.02
Episode : 1505 		 Timestep : 10843200 		 Average Reward : 439.16
Episode : 1508 		 Timestep : 10864800 		 Average Reward : 345.09
Episode : 1511 		 Timestep : 10886400 		 Average Reward : 419.05
Episode : 1514 		 Timestep : 10908000 		 Average Reward : 457.33
Episode : 1517 		 Timestep : 10929600 		 Average Reward : 389.27
Episode : 1520 		 Timestep : 10951200 		 Average Reward : 331.32
Episode : 1523 		 Timestep : 10972800 		 Average Reward : 229.29
Episode : 1526 		 Timestep : 10994400 		 Average Reward : 299.87
Episode : 1529 		 Timeste

Episode : 1865 		 Timestep : 13435200 		 Average Reward : 370.53
Episode : 1868 		 Timestep : 13456800 		 Average Reward : 344.58
Episode : 1871 		 Timestep : 13478400 		 Average Reward : 366.2
Episode : 1874 		 Timestep : 13500000 		 Average Reward : 322.93
Episode : 1877 		 Timestep : 13521600 		 Average Reward : 282.55
Episode : 1880 		 Timestep : 13543200 		 Average Reward : 215.76
Episode : 1883 		 Timestep : 13564800 		 Average Reward : 476.05
Episode : 1886 		 Timestep : 13586400 		 Average Reward : 250.73
Episode : 1889 		 Timestep : 13608000 		 Average Reward : 386.2
Episode : 1892 		 Timestep : 13629600 		 Average Reward : 301.06
Episode : 1895 		 Timestep : 13651200 		 Average Reward : 404.44
Episode : 1898 		 Timestep : 13672800 		 Average Reward : 388.81
Episode : 1901 		 Timestep : 13694400 		 Average Reward : 236.06
Episode : 1904 		 Timestep : 13716000 		 Average Reward : 267.25
Episode : 1907 		 Timestep : 13737600 		 Average Reward : 302.56
Episode : 1910 		 Timestep 

Episode : 2231 		 Timestep : 16070400 		 Average Reward : 471.98
Episode : 2234 		 Timestep : 16092000 		 Average Reward : 468.17
Episode : 2237 		 Timestep : 16113600 		 Average Reward : 297.28
Episode : 2240 		 Timestep : 16135200 		 Average Reward : 481.83
Episode : 2243 		 Timestep : 16156800 		 Average Reward : 501.41
Episode : 2246 		 Timestep : 16178400 		 Average Reward : 385.16
Episode : 2249 		 Timestep : 16200000 		 Average Reward : 453.55
Episode : 2252 		 Timestep : 16221600 		 Average Reward : 326.87
Episode : 2255 		 Timestep : 16243200 		 Average Reward : 373.63
Episode : 2258 		 Timestep : 16264800 		 Average Reward : 386.55
Episode : 2261 		 Timestep : 16286400 		 Average Reward : 326.05
Episode : 2264 		 Timestep : 16308000 		 Average Reward : 358.72
Episode : 2267 		 Timestep : 16329600 		 Average Reward : 358.53
Episode : 2270 		 Timestep : 16351200 		 Average Reward : 250.62
Episode : 2273 		 Timestep : 16372800 		 Average Reward : 365.06
Episode : 2276 		 Timeste

Episode : 2612 		 Timestep : 18813600 		 Average Reward : 363.11
Episode : 2615 		 Timestep : 18835200 		 Average Reward : 438.73
Episode : 2618 		 Timestep : 18856800 		 Average Reward : 424.16
Episode : 2621 		 Timestep : 18878400 		 Average Reward : 347.31
Episode : 2624 		 Timestep : 18900000 		 Average Reward : 430.61
Episode : 2627 		 Timestep : 18921600 		 Average Reward : 309.55
Episode : 2630 		 Timestep : 18943200 		 Average Reward : 394.26
Episode : 2633 		 Timestep : 18964800 		 Average Reward : 282.68
Episode : 2636 		 Timestep : 18986400 		 Average Reward : 373.93
Episode : 2639 		 Timestep : 19008000 		 Average Reward : 329.89
Episode : 2642 		 Timestep : 19029600 		 Average Reward : 409.32
Episode : 2645 		 Timestep : 19051200 		 Average Reward : 358.93
Episode : 2648 		 Timestep : 19072800 		 Average Reward : 514.81
Episode : 2651 		 Timestep : 19094400 		 Average Reward : 495.18
Episode : 2654 		 Timestep : 19116000 		 Average Reward : 445.07
Episode : 2657 		 Timeste

Episode : 2993 		 Timestep : 21556800 		 Average Reward : 410.33
Episode : 2996 		 Timestep : 21578400 		 Average Reward : 438.19
Episode : 2999 		 Timestep : 21600000 		 Average Reward : 446.29
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/pong_game/PPO_pong_game_3000_1.pth
model saved
Elapsed Time  :  1:59:37
--------------------------------------------------------------------------------------------
Episode : 3002 		 Timestep : 21621600 		 Average Reward : 386.36
Episode : 3005 		 Timestep : 21643200 		 Average Reward : 483.98
Episode : 3008 		 Timestep : 21664800 		 Average Reward : 282.21
Episode : 3011 		 Timestep : 21686400 		 Average Reward : 402.06
Episode : 3014 		 Timestep : 21708000 		 Average Reward : 430.25
Episode : 3017 		 Timestep : 21729600 		 Average Reward : 374.26
Episode : 3020 		 Timestep : 21751200 		 Average Reward : 537.1
Episode : 3023 		 Timestep : 21772800 		 Average Reward : 35

Episode : 3359 		 Timestep : 24192000 		 Average Reward : 522.24
Episode : 3362 		 Timestep : 24213600 		 Average Reward : 445.81
Episode : 3365 		 Timestep : 24235200 		 Average Reward : 409.35
Episode : 3368 		 Timestep : 24256800 		 Average Reward : 521.31
Episode : 3371 		 Timestep : 24278400 		 Average Reward : 512.04
Episode : 3374 		 Timestep : 24300000 		 Average Reward : 419.24
Episode : 3377 		 Timestep : 24321600 		 Average Reward : 515.76
Episode : 3380 		 Timestep : 24343200 		 Average Reward : 421.38
Episode : 3383 		 Timestep : 24364800 		 Average Reward : 556.95
Episode : 3386 		 Timestep : 24386400 		 Average Reward : 456.13
Episode : 3389 		 Timestep : 24408000 		 Average Reward : 162.07
Episode : 3392 		 Timestep : 24429600 		 Average Reward : 575.13
Episode : 3395 		 Timestep : 24451200 		 Average Reward : 491.83
Episode : 3398 		 Timestep : 24472800 		 Average Reward : 370.06
Episode : 3401 		 Timestep : 24494400 		 Average Reward : 360.39
Episode : 3404 		 Timeste

Episode : 3740 		 Timestep : 26935200 		 Average Reward : 446.26
Episode : 3743 		 Timestep : 26956800 		 Average Reward : 563.79
Episode : 3746 		 Timestep : 26978400 		 Average Reward : 383.47
Episode : 3749 		 Timestep : 27000000 		 Average Reward : 397.65
Episode : 3752 		 Timestep : 27021600 		 Average Reward : 390.94
Episode : 3755 		 Timestep : 27043200 		 Average Reward : 404.58
Episode : 3758 		 Timestep : 27064800 		 Average Reward : 452.53
Episode : 3761 		 Timestep : 27086400 		 Average Reward : 530.49
Episode : 3764 		 Timestep : 27108000 		 Average Reward : 342.93
Episode : 3767 		 Timestep : 27129600 		 Average Reward : 425.31
Episode : 3770 		 Timestep : 27151200 		 Average Reward : 443.98
Episode : 3773 		 Timestep : 27172800 		 Average Reward : 326.25
Episode : 3776 		 Timestep : 27194400 		 Average Reward : 430.84
Episode : 3779 		 Timestep : 27216000 		 Average Reward : 357.31
Episode : 3782 		 Timestep : 27237600 		 Average Reward : 378.34
Episode : 3785 		 Timeste

Episode : 4106 		 Timestep : 29570400 		 Average Reward : 539.48
Episode : 4109 		 Timestep : 29592000 		 Average Reward : 439.9
Episode : 4112 		 Timestep : 29613600 		 Average Reward : 484.4
Episode : 4115 		 Timestep : 29635200 		 Average Reward : 400.13
Episode : 4118 		 Timestep : 29656800 		 Average Reward : 466.06
Episode : 4121 		 Timestep : 29678400 		 Average Reward : 521.95
Episode : 4124 		 Timestep : 29700000 		 Average Reward : 365.8
Episode : 4127 		 Timestep : 29721600 		 Average Reward : 471.11
Episode : 4130 		 Timestep : 29743200 		 Average Reward : 447.99
Episode : 4133 		 Timestep : 29764800 		 Average Reward : 586.92
Episode : 4136 		 Timestep : 29786400 		 Average Reward : 356.07
Episode : 4139 		 Timestep : 29808000 		 Average Reward : 594.23
Episode : 4142 		 Timestep : 29829600 		 Average Reward : 530.27
Episode : 4145 		 Timestep : 29851200 		 Average Reward : 421.67
Episode : 4148 		 Timestep : 29872800 		 Average Reward : 449.54
Episode : 4151 		 Timestep :

Episode : 4487 		 Timestep : 32313600 		 Average Reward : 528.33
Episode : 4490 		 Timestep : 32335200 		 Average Reward : 471.27
Episode : 4493 		 Timestep : 32356800 		 Average Reward : 350.95
Episode : 4496 		 Timestep : 32378400 		 Average Reward : 521.47
Episode : 4499 		 Timestep : 32400000 		 Average Reward : 624.63
Episode : 4502 		 Timestep : 32421600 		 Average Reward : 411.68
Episode : 4505 		 Timestep : 32443200 		 Average Reward : 512.05
Episode : 4508 		 Timestep : 32464800 		 Average Reward : 600.37
Episode : 4511 		 Timestep : 32486400 		 Average Reward : 444.19
Episode : 4514 		 Timestep : 32508000 		 Average Reward : 576.01
Episode : 4517 		 Timestep : 32529600 		 Average Reward : 512.99
Episode : 4520 		 Timestep : 32551200 		 Average Reward : 588.11
Episode : 4523 		 Timestep : 32572800 		 Average Reward : 560.02
Episode : 4526 		 Timestep : 32594400 		 Average Reward : 611.52
Episode : 4529 		 Timestep : 32616000 		 Average Reward : 467.24
Episode : 4532 		 Timeste

Episode : 4868 		 Timestep : 35056800 		 Average Reward : 435.4
Episode : 4871 		 Timestep : 35078400 		 Average Reward : 405.37
Episode : 4874 		 Timestep : 35100000 		 Average Reward : 436.31
Episode : 4877 		 Timestep : 35121600 		 Average Reward : 475.67
Episode : 4880 		 Timestep : 35143200 		 Average Reward : 600.24
Episode : 4883 		 Timestep : 35164800 		 Average Reward : 431.59
Episode : 4886 		 Timestep : 35186400 		 Average Reward : 834.41
Episode : 4889 		 Timestep : 35208000 		 Average Reward : 325.26
Episode : 4892 		 Timestep : 35229600 		 Average Reward : 437.26
Episode : 4895 		 Timestep : 35251200 		 Average Reward : 600.21
Episode : 4898 		 Timestep : 35272800 		 Average Reward : 519.11
Episode : 4901 		 Timestep : 35294400 		 Average Reward : 432.13
Episode : 4904 		 Timestep : 35316000 		 Average Reward : 462.52
Episode : 4907 		 Timestep : 35337600 		 Average Reward : 554.08
Episode : 4910 		 Timestep : 35359200 		 Average Reward : 602.44
Episode : 4913 		 Timestep

Episode : 5234 		 Timestep : 37692000 		 Average Reward : 397.73
Episode : 5237 		 Timestep : 37713600 		 Average Reward : 516.95
Episode : 5240 		 Timestep : 37735200 		 Average Reward : 474.2
Episode : 5243 		 Timestep : 37756800 		 Average Reward : 555.76
Episode : 5246 		 Timestep : 37778400 		 Average Reward : 509.01
Episode : 5249 		 Timestep : 37800000 		 Average Reward : 395.23
Episode : 5252 		 Timestep : 37821600 		 Average Reward : 399.27
Episode : 5255 		 Timestep : 37843200 		 Average Reward : 374.47
Episode : 5258 		 Timestep : 37864800 		 Average Reward : 582.38
Episode : 5261 		 Timestep : 37886400 		 Average Reward : 456.18
Episode : 5264 		 Timestep : 37908000 		 Average Reward : 635.72
Episode : 5267 		 Timestep : 37929600 		 Average Reward : 448.24
Episode : 5270 		 Timestep : 37951200 		 Average Reward : 490.67
Episode : 5273 		 Timestep : 37972800 		 Average Reward : 511.59
Episode : 5276 		 Timestep : 37994400 		 Average Reward : 629.82
Episode : 5279 		 Timestep

Episode : 5615 		 Timestep : 40435200 		 Average Reward : 637.28
Episode : 5618 		 Timestep : 40456800 		 Average Reward : 505.58
Episode : 5621 		 Timestep : 40478400 		 Average Reward : 496.54
Episode : 5624 		 Timestep : 40500000 		 Average Reward : 418.72
Episode : 5627 		 Timestep : 40521600 		 Average Reward : 429.65
Episode : 5630 		 Timestep : 40543200 		 Average Reward : 451.13
Episode : 5633 		 Timestep : 40564800 		 Average Reward : 401.28
Episode : 5636 		 Timestep : 40586400 		 Average Reward : 438.33
Episode : 5639 		 Timestep : 40608000 		 Average Reward : 451.07
Episode : 5642 		 Timestep : 40629600 		 Average Reward : 368.28
Episode : 5645 		 Timestep : 40651200 		 Average Reward : 560.9
Episode : 5648 		 Timestep : 40672800 		 Average Reward : 476.69
Episode : 5651 		 Timestep : 40694400 		 Average Reward : 482.79
Episode : 5654 		 Timestep : 40716000 		 Average Reward : 562.02
Episode : 5657 		 Timestep : 40737600 		 Average Reward : 481.78
Episode : 5660 		 Timestep

Episode : 5996 		 Timestep : 43178400 		 Average Reward : 346.9
Episode : 5999 		 Timestep : 43200000 		 Average Reward : 550.7
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/pong_game/PPO_pong_game_6000_1.pth
model saved
Elapsed Time  :  3:57:38
--------------------------------------------------------------------------------------------
Episode : 6002 		 Timestep : 43221600 		 Average Reward : 640.47
Episode : 6005 		 Timestep : 43243200 		 Average Reward : 346.91
Episode : 6008 		 Timestep : 43264800 		 Average Reward : 415.98
Episode : 6011 		 Timestep : 43286400 		 Average Reward : 577.4
Episode : 6014 		 Timestep : 43308000 		 Average Reward : 566.67
Episode : 6017 		 Timestep : 43329600 		 Average Reward : 441.26
Episode : 6020 		 Timestep : 43351200 		 Average Reward : 455.63
Episode : 6023 		 Timestep : 43372800 		 Average Reward : 445.27
Episode : 6026 		 Timestep : 43394400 		 Average Reward : 458.

Episode : 6362 		 Timestep : 45813600 		 Average Reward : 513.24
Episode : 6365 		 Timestep : 45835200 		 Average Reward : 510.17
Episode : 6368 		 Timestep : 45856800 		 Average Reward : 362.56
Episode : 6371 		 Timestep : 45878400 		 Average Reward : 379.48
Episode : 6374 		 Timestep : 45900000 		 Average Reward : 492.22
Episode : 6377 		 Timestep : 45921600 		 Average Reward : 414.39
Episode : 6380 		 Timestep : 45943200 		 Average Reward : 563.58
Episode : 6383 		 Timestep : 45964800 		 Average Reward : 591.97
Episode : 6386 		 Timestep : 45986400 		 Average Reward : 417.73
Episode : 6389 		 Timestep : 46008000 		 Average Reward : 406.35
Episode : 6392 		 Timestep : 46029600 		 Average Reward : 374.12
Episode : 6395 		 Timestep : 46051200 		 Average Reward : 623.11
Episode : 6398 		 Timestep : 46072800 		 Average Reward : 474.62
Episode : 6401 		 Timestep : 46094400 		 Average Reward : 561.32
Episode : 6404 		 Timestep : 46116000 		 Average Reward : 391.17
Episode : 6407 		 Timeste

Episode : 6743 		 Timestep : 48556800 		 Average Reward : 507.99
Episode : 6746 		 Timestep : 48578400 		 Average Reward : 483.03
Episode : 6749 		 Timestep : 48600000 		 Average Reward : 578.73
Episode : 6752 		 Timestep : 48621600 		 Average Reward : 526.45
Episode : 6755 		 Timestep : 48643200 		 Average Reward : 361.98
Episode : 6758 		 Timestep : 48664800 		 Average Reward : 412.24
Episode : 6761 		 Timestep : 48686400 		 Average Reward : 541.71
Episode : 6764 		 Timestep : 48708000 		 Average Reward : 455.73
Episode : 6767 		 Timestep : 48729600 		 Average Reward : 580.49
Episode : 6770 		 Timestep : 48751200 		 Average Reward : 551.91
Episode : 6773 		 Timestep : 48772800 		 Average Reward : 551.24
Episode : 6776 		 Timestep : 48794400 		 Average Reward : 554.91
Episode : 6779 		 Timestep : 48816000 		 Average Reward : 513.13
Episode : 6782 		 Timestep : 48837600 		 Average Reward : 451.15
Episode : 6785 		 Timestep : 48859200 		 Average Reward : 461.19
Episode : 6788 		 Timeste

Episode : 7109 		 Timestep : 51192000 		 Average Reward : 496.27
Episode : 7112 		 Timestep : 51213600 		 Average Reward : 479.98
Episode : 7115 		 Timestep : 51235200 		 Average Reward : 424.84
Episode : 7118 		 Timestep : 51256800 		 Average Reward : 356.43
Episode : 7121 		 Timestep : 51278400 		 Average Reward : 381.42
Episode : 7124 		 Timestep : 51300000 		 Average Reward : 463.8
Episode : 7127 		 Timestep : 51321600 		 Average Reward : 413.09
Episode : 7130 		 Timestep : 51343200 		 Average Reward : 530.5
Episode : 7133 		 Timestep : 51364800 		 Average Reward : 445.66
Episode : 7136 		 Timestep : 51386400 		 Average Reward : 441.0
Episode : 7139 		 Timestep : 51408000 		 Average Reward : 566.49
Episode : 7142 		 Timestep : 51429600 		 Average Reward : 464.01
Episode : 7145 		 Timestep : 51451200 		 Average Reward : 348.89
Episode : 7148 		 Timestep : 51472800 		 Average Reward : 546.95
Episode : 7151 		 Timestep : 51494400 		 Average Reward : 563.33
Episode : 7154 		 Timestep :

Episode : 7490 		 Timestep : 53935200 		 Average Reward : 414.47
Episode : 7493 		 Timestep : 53956800 		 Average Reward : 603.69
Episode : 7496 		 Timestep : 53978400 		 Average Reward : 464.8
Episode : 7499 		 Timestep : 54000000 		 Average Reward : 617.86
Episode : 7502 		 Timestep : 54021600 		 Average Reward : 476.16
Episode : 7505 		 Timestep : 54043200 		 Average Reward : 488.13
Episode : 7508 		 Timestep : 54064800 		 Average Reward : 548.5
Episode : 7511 		 Timestep : 54086400 		 Average Reward : 426.2
Episode : 7514 		 Timestep : 54108000 		 Average Reward : 444.71
Episode : 7517 		 Timestep : 54129600 		 Average Reward : 472.09
Episode : 7520 		 Timestep : 54151200 		 Average Reward : 366.93
Episode : 7523 		 Timestep : 54172800 		 Average Reward : 502.77
Episode : 7526 		 Timestep : 54194400 		 Average Reward : 645.18
Episode : 7529 		 Timestep : 54216000 		 Average Reward : 463.82
Episode : 7532 		 Timestep : 54237600 		 Average Reward : 604.1
Episode : 7535 		 Timestep : 

Episode : 7871 		 Timestep : 56678400 		 Average Reward : 428.5
Episode : 7874 		 Timestep : 56700000 		 Average Reward : 456.54
Episode : 7877 		 Timestep : 56721600 		 Average Reward : 562.12
Episode : 7880 		 Timestep : 56743200 		 Average Reward : 396.72
Episode : 7883 		 Timestep : 56764800 		 Average Reward : 541.59
Episode : 7886 		 Timestep : 56786400 		 Average Reward : 339.3
Episode : 7889 		 Timestep : 56808000 		 Average Reward : 517.66
Episode : 7892 		 Timestep : 56829600 		 Average Reward : 384.08
Episode : 7895 		 Timestep : 56851200 		 Average Reward : 437.61
Episode : 7898 		 Timestep : 56872800 		 Average Reward : 601.58
Episode : 7901 		 Timestep : 56894400 		 Average Reward : 598.54
Episode : 7904 		 Timestep : 56916000 		 Average Reward : 477.68
Episode : 7907 		 Timestep : 56937600 		 Average Reward : 406.92
Episode : 7910 		 Timestep : 56959200 		 Average Reward : 520.06
Episode : 7913 		 Timestep : 56980800 		 Average Reward : 698.8
Episode : 7916 		 Timestep :

Episode : 8237 		 Timestep : 59313600 		 Average Reward : 596.82
Episode : 8240 		 Timestep : 59335200 		 Average Reward : 598.42
Episode : 8243 		 Timestep : 59356800 		 Average Reward : 547.41
Episode : 8246 		 Timestep : 59378400 		 Average Reward : 475.89
Episode : 8249 		 Timestep : 59400000 		 Average Reward : 558.46
Episode : 8252 		 Timestep : 59421600 		 Average Reward : 441.01
Episode : 8255 		 Timestep : 59443200 		 Average Reward : 484.78
Episode : 8258 		 Timestep : 59464800 		 Average Reward : 420.61
Episode : 8261 		 Timestep : 59486400 		 Average Reward : 499.78
Episode : 8264 		 Timestep : 59508000 		 Average Reward : 559.76
Episode : 8267 		 Timestep : 59529600 		 Average Reward : 419.12
Episode : 8270 		 Timestep : 59551200 		 Average Reward : 378.79
Episode : 8273 		 Timestep : 59572800 		 Average Reward : 710.44
Episode : 8276 		 Timestep : 59594400 		 Average Reward : 357.2
Episode : 8279 		 Timestep : 59616000 		 Average Reward : 477.0
Episode : 8282 		 Timestep 

Episode : 8618 		 Timestep : 62056800 		 Average Reward : 567.33
Episode : 8621 		 Timestep : 62078400 		 Average Reward : 367.23
Episode : 8624 		 Timestep : 62100000 		 Average Reward : 363.99
Episode : 8627 		 Timestep : 62121600 		 Average Reward : 492.3
Episode : 8630 		 Timestep : 62143200 		 Average Reward : 460.12
Episode : 8633 		 Timestep : 62164800 		 Average Reward : 562.57
Episode : 8636 		 Timestep : 62186400 		 Average Reward : 386.01
Episode : 8639 		 Timestep : 62208000 		 Average Reward : 598.44
Episode : 8642 		 Timestep : 62229600 		 Average Reward : 375.18
Episode : 8645 		 Timestep : 62251200 		 Average Reward : 228.92
Episode : 8648 		 Timestep : 62272800 		 Average Reward : 542.17
Episode : 8651 		 Timestep : 62294400 		 Average Reward : 445.93
Episode : 8654 		 Timestep : 62316000 		 Average Reward : 434.08
Episode : 8657 		 Timestep : 62337600 		 Average Reward : 363.52
Episode : 8660 		 Timestep : 62359200 		 Average Reward : 503.54
Episode : 8663 		 Timestep

Episode : 8999 		 Timestep : 64800000 		 Average Reward : 655.17
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/pong_game/PPO_pong_game_9000_1.pth
model saved
Elapsed Time  :  6:05:07
--------------------------------------------------------------------------------------------
Episode : 9002 		 Timestep : 64821600 		 Average Reward : 380.04
Episode : 9005 		 Timestep : 64843200 		 Average Reward : 418.3
Episode : 9008 		 Timestep : 64864800 		 Average Reward : 550.12
Episode : 9011 		 Timestep : 64886400 		 Average Reward : 474.9
Episode : 9014 		 Timestep : 64908000 		 Average Reward : 449.1
Episode : 9017 		 Timestep : 64929600 		 Average Reward : 456.91
Episode : 9020 		 Timestep : 64951200 		 Average Reward : 482.75
Episode : 9023 		 Timestep : 64972800 		 Average Reward : 516.2
Episode : 9026 		 Timestep : 64994400 		 Average Reward : 449.53
Episode : 9029 		 Timestep : 65016000 		 Average Reward : 337.2

Episode : 9365 		 Timestep : 67435200 		 Average Reward : 418.68
Episode : 9368 		 Timestep : 67456800 		 Average Reward : 626.07
Episode : 9371 		 Timestep : 67478400 		 Average Reward : 430.31
Episode : 9374 		 Timestep : 67500000 		 Average Reward : 399.77
Episode : 9377 		 Timestep : 67521600 		 Average Reward : 406.1
Episode : 9380 		 Timestep : 67543200 		 Average Reward : 421.29
Episode : 9383 		 Timestep : 67564800 		 Average Reward : 544.2
Episode : 9386 		 Timestep : 67586400 		 Average Reward : 479.85
Episode : 9389 		 Timestep : 67608000 		 Average Reward : 333.35
Episode : 9392 		 Timestep : 67629600 		 Average Reward : 365.95
Episode : 9395 		 Timestep : 67651200 		 Average Reward : 470.6
Episode : 9398 		 Timestep : 67672800 		 Average Reward : 553.22
Episode : 9401 		 Timestep : 67694400 		 Average Reward : 501.59
Episode : 9404 		 Timestep : 67716000 		 Average Reward : 439.57
Episode : 9407 		 Timestep : 67737600 		 Average Reward : 282.13
Episode : 9410 		 Timestep :

Episode : 9746 		 Timestep : 70178400 		 Average Reward : 335.35
Episode : 9749 		 Timestep : 70200000 		 Average Reward : 429.46
Episode : 9752 		 Timestep : 70221600 		 Average Reward : 342.79
Episode : 9755 		 Timestep : 70243200 		 Average Reward : 411.12
Episode : 9758 		 Timestep : 70264800 		 Average Reward : 441.98
Episode : 9761 		 Timestep : 70286400 		 Average Reward : 482.84
Episode : 9764 		 Timestep : 70308000 		 Average Reward : 523.36
Episode : 9767 		 Timestep : 70329600 		 Average Reward : 417.41
Episode : 9770 		 Timestep : 70351200 		 Average Reward : 408.92
Episode : 9773 		 Timestep : 70372800 		 Average Reward : 392.18
Episode : 9776 		 Timestep : 70394400 		 Average Reward : 460.56
Episode : 9779 		 Timestep : 70416000 		 Average Reward : 336.96
Episode : 9782 		 Timestep : 70437600 		 Average Reward : 458.71
Episode : 9785 		 Timestep : 70459200 		 Average Reward : 503.38
Episode : 9788 		 Timestep : 70480800 		 Average Reward : 435.33
Episode : 9791 		 Timeste

Episode : 10112 		 Timestep : 72813600 		 Average Reward : 644.2
Episode : 10115 		 Timestep : 72835200 		 Average Reward : 417.57
Episode : 10118 		 Timestep : 72856800 		 Average Reward : 626.43
Episode : 10121 		 Timestep : 72878400 		 Average Reward : 408.5
Episode : 10124 		 Timestep : 72900000 		 Average Reward : 448.61
Episode : 10127 		 Timestep : 72921600 		 Average Reward : 391.81
Episode : 10130 		 Timestep : 72943200 		 Average Reward : 491.7
Episode : 10133 		 Timestep : 72964800 		 Average Reward : 538.76
Episode : 10136 		 Timestep : 72986400 		 Average Reward : 581.91
Episode : 10139 		 Timestep : 73008000 		 Average Reward : 314.79
Episode : 10142 		 Timestep : 73029600 		 Average Reward : 565.51
Episode : 10145 		 Timestep : 73051200 		 Average Reward : 439.91
Episode : 10148 		 Timestep : 73072800 		 Average Reward : 327.66
Episode : 10151 		 Timestep : 73094400 		 Average Reward : 642.01
Episode : 10154 		 Timestep : 73116000 		 Average Reward : 532.76
Episode : 101

Episode : 10487 		 Timestep : 75513600 		 Average Reward : 607.85
Episode : 10490 		 Timestep : 75535200 		 Average Reward : 567.04
Episode : 10493 		 Timestep : 75556800 		 Average Reward : 502.24
Episode : 10496 		 Timestep : 75578400 		 Average Reward : 431.21
Episode : 10499 		 Timestep : 75600000 		 Average Reward : 526.77
Episode : 10502 		 Timestep : 75621600 		 Average Reward : 553.2
Episode : 10505 		 Timestep : 75643200 		 Average Reward : 578.2
Episode : 10508 		 Timestep : 75664800 		 Average Reward : 447.07
Episode : 10511 		 Timestep : 75686400 		 Average Reward : 537.06
Episode : 10514 		 Timestep : 75708000 		 Average Reward : 427.58
Episode : 10517 		 Timestep : 75729600 		 Average Reward : 576.86
Episode : 10520 		 Timestep : 75751200 		 Average Reward : 437.74
Episode : 10523 		 Timestep : 75772800 		 Average Reward : 433.8
Episode : 10526 		 Timestep : 75794400 		 Average Reward : 414.01
Episode : 10529 		 Timestep : 75816000 		 Average Reward : 574.05
Episode : 105

Episode : 10862 		 Timestep : 78213600 		 Average Reward : 417.17
Episode : 10865 		 Timestep : 78235200 		 Average Reward : 565.34
Episode : 10868 		 Timestep : 78256800 		 Average Reward : 405.29
Episode : 10871 		 Timestep : 78278400 		 Average Reward : 376.54
Episode : 10874 		 Timestep : 78300000 		 Average Reward : 484.93
Episode : 10877 		 Timestep : 78321600 		 Average Reward : 378.27
Episode : 10880 		 Timestep : 78343200 		 Average Reward : 483.37
Episode : 10883 		 Timestep : 78364800 		 Average Reward : 405.79
Episode : 10886 		 Timestep : 78386400 		 Average Reward : 581.32
Episode : 10889 		 Timestep : 78408000 		 Average Reward : 411.53
Episode : 10892 		 Timestep : 78429600 		 Average Reward : 291.47
Episode : 10895 		 Timestep : 78451200 		 Average Reward : 562.4
Episode : 10898 		 Timestep : 78472800 		 Average Reward : 452.27
Episode : 10901 		 Timestep : 78494400 		 Average Reward : 338.47
Episode : 10904 		 Timestep : 78516000 		 Average Reward : 537.73
Episode : 1

Episode : 11222 		 Timestep : 80805600 		 Average Reward : 510.37
Episode : 11225 		 Timestep : 80827200 		 Average Reward : 495.96
Episode : 11228 		 Timestep : 80848800 		 Average Reward : 479.46
Episode : 11231 		 Timestep : 80870400 		 Average Reward : 557.7
Episode : 11234 		 Timestep : 80892000 		 Average Reward : 462.82
Episode : 11237 		 Timestep : 80913600 		 Average Reward : 503.34
Episode : 11240 		 Timestep : 80935200 		 Average Reward : 408.92
Episode : 11243 		 Timestep : 80956800 		 Average Reward : 439.21
Episode : 11246 		 Timestep : 80978400 		 Average Reward : 393.49
Episode : 11249 		 Timestep : 81000000 		 Average Reward : 398.02
Episode : 11252 		 Timestep : 81021600 		 Average Reward : 514.27
Episode : 11255 		 Timestep : 81043200 		 Average Reward : 605.66
Episode : 11258 		 Timestep : 81064800 		 Average Reward : 353.68
Episode : 11261 		 Timestep : 81086400 		 Average Reward : 542.04
Episode : 11264 		 Timestep : 81108000 		 Average Reward : 605.31
Episode : 1

Episode : 11597 		 Timestep : 83505600 		 Average Reward : 243.93
Episode : 11600 		 Timestep : 83527200 		 Average Reward : 446.9
Episode : 11603 		 Timestep : 83548800 		 Average Reward : 464.77
Episode : 11606 		 Timestep : 83570400 		 Average Reward : 484.51
Episode : 11609 		 Timestep : 83592000 		 Average Reward : 363.65
Episode : 11612 		 Timestep : 83613600 		 Average Reward : 543.51
Episode : 11615 		 Timestep : 83635200 		 Average Reward : 423.03
Episode : 11618 		 Timestep : 83656800 		 Average Reward : 623.07
Episode : 11621 		 Timestep : 83678400 		 Average Reward : 527.07
Episode : 11624 		 Timestep : 83700000 		 Average Reward : 300.53
Episode : 11627 		 Timestep : 83721600 		 Average Reward : 479.87
Episode : 11630 		 Timestep : 83743200 		 Average Reward : 574.41
Episode : 11633 		 Timestep : 83764800 		 Average Reward : 479.96
Episode : 11636 		 Timestep : 83786400 		 Average Reward : 637.06
Episode : 11639 		 Timestep : 83808000 		 Average Reward : 437.36
Episode : 1

Episode : 11972 		 Timestep : 86205600 		 Average Reward : 531.05
Episode : 11975 		 Timestep : 86227200 		 Average Reward : 532.81
Episode : 11978 		 Timestep : 86248800 		 Average Reward : 533.32
Episode : 11981 		 Timestep : 86270400 		 Average Reward : 415.31
Episode : 11984 		 Timestep : 86292000 		 Average Reward : 378.21
Episode : 11987 		 Timestep : 86313600 		 Average Reward : 539.36
Episode : 11990 		 Timestep : 86335200 		 Average Reward : 331.14
Episode : 11993 		 Timestep : 86356800 		 Average Reward : 350.17
Episode : 11996 		 Timestep : 86378400 		 Average Reward : 534.82
Episode : 11999 		 Timestep : 86400000 		 Average Reward : 383.82
--------------------------------------------------------------------------------------------
saving model at : PPO_preTrained/pong_game/PPO_pong_game_12000_1.pth
model saved
Elapsed Time  :  9:52:36
--------------------------------------------------------------------------------------------
Episode : 12002 		 Timestep : 86421600 		 Averag

Episode : 12332 		 Timestep : 88797600 		 Average Reward : 402.63
Episode : 12335 		 Timestep : 88819200 		 Average Reward : 377.42
Episode : 12338 		 Timestep : 88840800 		 Average Reward : 382.42
Episode : 12341 		 Timestep : 88862400 		 Average Reward : 411.95
Episode : 12344 		 Timestep : 88884000 		 Average Reward : 455.01
Episode : 12347 		 Timestep : 88905600 		 Average Reward : 330.41
Episode : 12350 		 Timestep : 88927200 		 Average Reward : 405.74
Episode : 12353 		 Timestep : 88948800 		 Average Reward : 325.98
Episode : 12356 		 Timestep : 88970400 		 Average Reward : 505.07
Episode : 12359 		 Timestep : 88992000 		 Average Reward : 512.6
Episode : 12362 		 Timestep : 89013600 		 Average Reward : 679.01
Episode : 12365 		 Timestep : 89035200 		 Average Reward : 669.94
Episode : 12368 		 Timestep : 89056800 		 Average Reward : 599.78
Episode : 12371 		 Timestep : 89078400 		 Average Reward : 469.24
Episode : 12374 		 Timestep : 89100000 		 Average Reward : 341.66
Episode : 1

Episode : 12707 		 Timestep : 91497600 		 Average Reward : 382.0
Episode : 12710 		 Timestep : 91519200 		 Average Reward : 464.56
Episode : 12713 		 Timestep : 91540800 		 Average Reward : 442.59
Episode : 12716 		 Timestep : 91562400 		 Average Reward : 565.52
Episode : 12719 		 Timestep : 91584000 		 Average Reward : 501.65
Episode : 12722 		 Timestep : 91605600 		 Average Reward : 423.0
Episode : 12725 		 Timestep : 91627200 		 Average Reward : 386.12
Episode : 12728 		 Timestep : 91648800 		 Average Reward : 542.52
Episode : 12731 		 Timestep : 91670400 		 Average Reward : 441.53
Episode : 12734 		 Timestep : 91692000 		 Average Reward : 506.1
Episode : 12737 		 Timestep : 91713600 		 Average Reward : 482.23
Episode : 12740 		 Timestep : 91735200 		 Average Reward : 443.37
Episode : 12743 		 Timestep : 91756800 		 Average Reward : 489.05
Episode : 12746 		 Timestep : 91778400 		 Average Reward : 618.69
Episode : 12749 		 Timestep : 91800000 		 Average Reward : 444.15
Episode : 127

Episode : 13067 		 Timestep : 94089600 		 Average Reward : 507.91
Episode : 13070 		 Timestep : 94111200 		 Average Reward : 363.16
Episode : 13073 		 Timestep : 94132800 		 Average Reward : 459.99
Episode : 13076 		 Timestep : 94154400 		 Average Reward : 585.48
Episode : 13079 		 Timestep : 94176000 		 Average Reward : 384.05
Episode : 13082 		 Timestep : 94197600 		 Average Reward : 526.21
Episode : 13085 		 Timestep : 94219200 		 Average Reward : 508.66
Episode : 13088 		 Timestep : 94240800 		 Average Reward : 514.91
Episode : 13091 		 Timestep : 94262400 		 Average Reward : 453.04
Episode : 13094 		 Timestep : 94284000 		 Average Reward : 432.67
Episode : 13097 		 Timestep : 94305600 		 Average Reward : 504.52
Episode : 13100 		 Timestep : 94327200 		 Average Reward : 427.97
Episode : 13103 		 Timestep : 94348800 		 Average Reward : 497.52
Episode : 13106 		 Timestep : 94370400 		 Average Reward : 521.82
Episode : 13109 		 Timestep : 94392000 		 Average Reward : 411.48
Episode : 

Episode : 13442 		 Timestep : 96789600 		 Average Reward : 415.31
Episode : 13445 		 Timestep : 96811200 		 Average Reward : 596.85
Episode : 13448 		 Timestep : 96832800 		 Average Reward : 682.02
Episode : 13451 		 Timestep : 96854400 		 Average Reward : 627.8
Episode : 13454 		 Timestep : 96876000 		 Average Reward : 431.02
Episode : 13457 		 Timestep : 96897600 		 Average Reward : 557.52
Episode : 13460 		 Timestep : 96919200 		 Average Reward : 390.93
Episode : 13463 		 Timestep : 96940800 		 Average Reward : 548.63
Episode : 13466 		 Timestep : 96962400 		 Average Reward : 521.86
Episode : 13469 		 Timestep : 96984000 		 Average Reward : 544.06
Episode : 13472 		 Timestep : 97005600 		 Average Reward : 512.48
Episode : 13475 		 Timestep : 97027200 		 Average Reward : 616.55
Episode : 13478 		 Timestep : 97048800 		 Average Reward : 507.84
Episode : 13481 		 Timestep : 97070400 		 Average Reward : 574.75
Episode : 13484 		 Timestep : 97092000 		 Average Reward : 383.27
Episode : 1

Episode : 13817 		 Timestep : 99489600 		 Average Reward : 489.0
Episode : 13820 		 Timestep : 99511200 		 Average Reward : 342.91
Episode : 13823 		 Timestep : 99532800 		 Average Reward : 478.3
Episode : 13826 		 Timestep : 99554400 		 Average Reward : 476.15
Episode : 13829 		 Timestep : 99576000 		 Average Reward : 404.87
Episode : 13832 		 Timestep : 99597600 		 Average Reward : 503.14
Episode : 13835 		 Timestep : 99619200 		 Average Reward : 345.8
Episode : 13838 		 Timestep : 99640800 		 Average Reward : 578.3
Episode : 13841 		 Timestep : 99662400 		 Average Reward : 440.85
Episode : 13844 		 Timestep : 99684000 		 Average Reward : 365.86
Episode : 13847 		 Timestep : 99705600 		 Average Reward : 496.16
Episode : 13850 		 Timestep : 99727200 		 Average Reward : 336.18
Episode : 13853 		 Timestep : 99748800 		 Average Reward : 452.81
Episode : 13856 		 Timestep : 99770400 		 Average Reward : 648.35
Episode : 13859 		 Timestep : 99792000 		 Average Reward : 419.88
Episode : 1386

Episode : 14174 		 Timestep : 102060000 		 Average Reward : 525.3
Episode : 14177 		 Timestep : 102081600 		 Average Reward : 434.61
Episode : 14180 		 Timestep : 102103200 		 Average Reward : 451.91
Episode : 14183 		 Timestep : 102124800 		 Average Reward : 642.18
Episode : 14186 		 Timestep : 102146400 		 Average Reward : 465.48
Episode : 14189 		 Timestep : 102168000 		 Average Reward : 404.42
Episode : 14192 		 Timestep : 102189600 		 Average Reward : 461.63
Episode : 14195 		 Timestep : 102211200 		 Average Reward : 394.26
Episode : 14198 		 Timestep : 102232800 		 Average Reward : 581.36
Episode : 14201 		 Timestep : 102254400 		 Average Reward : 326.7
Episode : 14204 		 Timestep : 102276000 		 Average Reward : 459.88
Episode : 14207 		 Timestep : 102297600 		 Average Reward : 472.43
Episode : 14210 		 Timestep : 102319200 		 Average Reward : 475.12
Episode : 14213 		 Timestep : 102340800 		 Average Reward : 549.74
Episode : 14216 		 Timestep : 102362400 		 Average Reward : 499.

Episode : 14543 		 Timestep : 104716800 		 Average Reward : 498.86
Episode : 14546 		 Timestep : 104738400 		 Average Reward : 452.06
Episode : 14549 		 Timestep : 104760000 		 Average Reward : 477.6
Episode : 14552 		 Timestep : 104781600 		 Average Reward : 511.81
Episode : 14555 		 Timestep : 104803200 		 Average Reward : 528.12
Episode : 14558 		 Timestep : 104824800 		 Average Reward : 499.31
Episode : 14561 		 Timestep : 104846400 		 Average Reward : 312.82
Episode : 14564 		 Timestep : 104868000 		 Average Reward : 517.88
Episode : 14567 		 Timestep : 104889600 		 Average Reward : 614.87
Episode : 14570 		 Timestep : 104911200 		 Average Reward : 444.45
Episode : 14573 		 Timestep : 104932800 		 Average Reward : 387.88
Episode : 14576 		 Timestep : 104954400 		 Average Reward : 468.72
Episode : 14579 		 Timestep : 104976000 		 Average Reward : 585.77
Episode : 14582 		 Timestep : 104997600 		 Average Reward : 383.66
Episode : 14585 		 Timestep : 105019200 		 Average Reward : 423

Episode : 14912 		 Timestep : 107373600 		 Average Reward : 468.36
Episode : 14915 		 Timestep : 107395200 		 Average Reward : 502.79
Episode : 14918 		 Timestep : 107416800 		 Average Reward : 474.25
Episode : 14921 		 Timestep : 107438400 		 Average Reward : 428.11
Episode : 14924 		 Timestep : 107460000 		 Average Reward : 556.91
Episode : 14927 		 Timestep : 107481600 		 Average Reward : 483.55
Episode : 14930 		 Timestep : 107503200 		 Average Reward : 519.78
Episode : 14933 		 Timestep : 107524800 		 Average Reward : 573.21
Episode : 14936 		 Timestep : 107546400 		 Average Reward : 367.71
Episode : 14939 		 Timestep : 107568000 		 Average Reward : 467.88
Episode : 14942 		 Timestep : 107589600 		 Average Reward : 594.14
Episode : 14945 		 Timestep : 107611200 		 Average Reward : 500.35
Episode : 14948 		 Timestep : 107632800 		 Average Reward : 511.81
Episode : 14951 		 Timestep : 107654400 		 Average Reward : 510.38
Episode : 14954 		 Timestep : 107676000 		 Average Reward : 43

Episode : 15269 		 Timestep : 109944000 		 Average Reward : 358.38
Episode : 15272 		 Timestep : 109965600 		 Average Reward : 562.57
Episode : 15275 		 Timestep : 109987200 		 Average Reward : 572.45
Episode : 15278 		 Timestep : 110008800 		 Average Reward : 385.92
Episode : 15281 		 Timestep : 110030400 		 Average Reward : 513.58
Episode : 15284 		 Timestep : 110052000 		 Average Reward : 478.5
Episode : 15287 		 Timestep : 110073600 		 Average Reward : 502.52
Episode : 15290 		 Timestep : 110095200 		 Average Reward : 550.8
Episode : 15293 		 Timestep : 110116800 		 Average Reward : 521.01
Episode : 15296 		 Timestep : 110138400 		 Average Reward : 422.5
Episode : 15299 		 Timestep : 110160000 		 Average Reward : 598.98
Episode : 15302 		 Timestep : 110181600 		 Average Reward : 547.28
Episode : 15305 		 Timestep : 110203200 		 Average Reward : 603.64
Episode : 15308 		 Timestep : 110224800 		 Average Reward : 575.9
Episode : 15311 		 Timestep : 110246400 		 Average Reward : 488.97

Episode : 15638 		 Timestep : 112600800 		 Average Reward : 490.28
Episode : 15641 		 Timestep : 112622400 		 Average Reward : 481.69
Episode : 15644 		 Timestep : 112644000 		 Average Reward : 596.98
Episode : 15647 		 Timestep : 112665600 		 Average Reward : 462.66
Episode : 15650 		 Timestep : 112687200 		 Average Reward : 395.46
Episode : 15653 		 Timestep : 112708800 		 Average Reward : 418.98
Episode : 15656 		 Timestep : 112730400 		 Average Reward : 343.57
Episode : 15659 		 Timestep : 112752000 		 Average Reward : 380.93
Episode : 15662 		 Timestep : 112773600 		 Average Reward : 464.79
Episode : 15665 		 Timestep : 112795200 		 Average Reward : 523.54
Episode : 15668 		 Timestep : 112816800 		 Average Reward : 426.64
Episode : 15671 		 Timestep : 112838400 		 Average Reward : 495.97
Episode : 15674 		 Timestep : 112860000 		 Average Reward : 376.35
Episode : 15677 		 Timestep : 112881600 		 Average Reward : 658.08
Episode : 15680 		 Timestep : 112903200 		 Average Reward : 53

In [None]:


# Replay one episode with a saved policy, rendering every step.
# Depends on names created by earlier cells: ppo_agent, max_ep_len,
# time_step and log_f.
checkpoint_path = "PPO_pong_game_24000_0.pth"
# checkpoint_path = "./PPO_preTrained/pong_game/PPO_pong_game_1798_0.pth"
ppo_agent.load(checkpoint_path)
env = pongGym.DoublePong()
# Use the initial observation as returned by the environment. The previous
# code added a torch.quantization.DeQuantStub() module instance to it, which
# is not a valid arithmetic operand and fails before the first step.
state = env.reset()
current_ep_reward = 0
for t in range(1, max_ep_len+1):

    # select action with policy
    action = ppo_agent.select_action(state)
    state, reward, done, _ = env.step(action)
    env.render()
    # pause between frames so playback does not exceed ~60 steps per second
    time.sleep(1/60)
    # saving reward and is_terminals
    # NOTE(review): these accumulate in ppo_agent.buffer and are never cleared
    # in this cell -- confirm whether buffer.clear() should follow a replay.
    ppo_agent.buffer.rewards.append(reward)
    ppo_agent.buffer.is_terminals.append(done)

    time_step += 1
    current_ep_reward += reward

    # stop as soon as the environment reports the episode is over
    if done:
        break


# release the log file handle opened in an earlier cell and the environment
log_f.close()
env.close()


