################################################################################
> # **Part - I**

*   define actor-critic networks
*   define PPO algorithm

################################################################################

In [1]:


############################### Import libraries ###############################


import os
import glob
import time
from datetime import datetime

import torch
import torch.nn as nn
from torch.distributions import MultivariateNormal
from torch.distributions import Categorical

import numpy as np

import gym
import pongGym

################################## set device ##################################

print("============================================================================================")


# Choose the compute device: the first CUDA GPU when one is available,
# otherwise the CPU. `device` is read by the PPO code further down.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    torch.cuda.empty_cache()
    print("Device set to : " + str(torch.cuda.get_device_name(device)))
else:
    device = torch.device('cpu')
    print("Device set to : cpu")

print("============================================================================================")




################################## PPO Policy ##################################


class RolloutBuffer:
    """Plain container for one rollout's worth of transitions.

    Five parallel lists are kept; callers append to them directly and call
    `clear()` once the data has been consumed.
    """

    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear(self):
        """Empty every list in place (the list objects themselves are kept)."""
        for stored in (self.actions, self.states, self.logprobs,
                       self.rewards, self.is_terminals):
            stored.clear()


class ActorCritic(nn.Module):
    """Actor and critic MLPs for a discrete action space.

    `actor` maps a state to a softmax distribution over `action_dim` actions;
    `critic` maps a state to a single value estimate. Both use two hidden
    layers of 64 ReLU units.

    `action_std_init` is accepted for signature compatibility but is not used
    by this class.
    """

    def __init__(self, state_dim, action_dim, action_std_init):
        super().__init__()

        hidden = 64

        # actor: state -> action probabilities
        actor_layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
            nn.Softmax(dim=-1),
        ]
        self.actor = nn.Sequential(*actor_layers)

        # critic: state -> scalar value
        critic_layers = [
            nn.Linear(state_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.critic = nn.Sequential(*critic_layers)

    def forward(self):
        """Not supported; use `act` or `evaluate` instead."""
        raise NotImplementedError

    def act(self, state):
        """Sample an action for `state`.

        Returns a `(action, log_prob)` pair, both detached from the graph.
        """
        policy = Categorical(self.actor(state))
        chosen = policy.sample()
        chosen_logprob = policy.log_prob(chosen)
        return chosen.detach(), chosen_logprob.detach()

    def evaluate(self, state, action):
        """Score previously taken actions under the current networks.

        Returns `(log_probs, state_values, entropy)` where `log_probs` and
        `entropy` come from the actor's categorical distribution and
        `state_values` is the raw critic output.
        """
        policy = Categorical(self.actor(state))
        logprobs = policy.log_prob(action)
        entropy = policy.entropy()
        values = self.critic(state)
        return logprobs, values, entropy


class PPO:
    """Clipped-surrogate PPO agent built on `ActorCritic`.

    Two copies of the network are kept: `policy_old` is used to act and to
    produce the log-probabilities stored in the buffer, while `policy` is the
    copy being optimised. After every `update()` the optimised weights are
    copied back into `policy_old`.

    Args:
        state_dim: size of the observation vector.
        action_dim: number of discrete actions.
        lr_actor: Adam learning rate for the actor parameters.
        lr_critic: Adam learning rate for the critic parameters.
        gamma: discount factor used for the Monte Carlo returns.
        K_epochs: number of optimisation passes over the buffer per update.
        eps_clip: clipping range for the probability ratio.
        action_std_init: forwarded to `ActorCritic` unchanged.
    """

    def __init__(self, state_dim, action_dim, lr_actor, lr_critic, gamma, K_epochs, eps_clip, action_std_init=0.6):

        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.buffer = RolloutBuffer()

        self.policy = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        # separate learning rates for the two sub-networks
        self.optimizer = torch.optim.Adam([
                        {'params': self.policy.actor.parameters(), 'lr': lr_actor},
                        {'params': self.policy.critic.parameters(), 'lr': lr_critic}
                    ])

        self.policy_old = ActorCritic(state_dim, action_dim, action_std_init).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from `policy_old` and record it in the buffer.

        The state tensor, the action and its log-probability are appended to
        the buffer; the caller is responsible for appending the matching
        reward and terminal flag.

        Returns:
            The sampled action as a Python scalar.
        """
        with torch.no_grad():
            state = torch.FloatTensor(state).to(device)
            action, action_logprob = self.policy_old.act(state)

        self.buffer.states.append(state)
        self.buffer.actions.append(action)
        self.buffer.logprobs.append(action_logprob)

        return action.item()

    @staticmethod
    def _discounted_returns(rewards, is_terminals, gamma):
        """Return the discounted return for every step, oldest first.

        The running sum is reset whenever a step is flagged terminal, so
        returns never leak across a flagged episode boundary. Built by
        appending and reversing once, which is linear in the rollout length.
        """
        returns = []
        running = 0
        for reward, is_terminal in zip(reversed(rewards), reversed(is_terminals)):
            if is_terminal:
                running = 0
            running = reward + (gamma * running)
            returns.append(running)
        returns.reverse()
        return returns

    def update(self):
        """Run `K_epochs` PPO optimisation steps on the buffered rollout.

        Does nothing when no rewards have been buffered. On completion the
        new weights are copied into `policy_old` and the buffer is cleared.
        """
        # nothing collected yet: stacking empty lists below would raise
        if not self.buffer.rewards:
            return

        # Monte Carlo estimate of returns
        returns = self._discounted_returns(
            self.buffer.rewards, self.buffer.is_terminals, self.gamma)

        # Normalizing the rewards. The sample std of a single value is NaN,
        # which would propagate into the loss and the weights, so
        # normalisation is skipped for rollouts with fewer than two steps.
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        if rewards.numel() > 1:
            rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-7)

        # convert list to tensor
        old_states = torch.squeeze(torch.stack(self.buffer.states, dim=0)).detach().to(device)
        old_actions = torch.squeeze(torch.stack(self.buffer.actions, dim=0)).detach().to(device)
        old_logprobs = torch.squeeze(torch.stack(self.buffer.logprobs, dim=0)).detach().to(device)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):

            # Evaluating old actions and values
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # match state_values tensor dimensions with rewards tensor
            state_values = torch.squeeze(state_values)

            # Finding the ratio (pi_theta / pi_theta__old)
            ratios = torch.exp(logprobs - old_logprobs.detach())

            # Finding Surrogate Loss
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages

            # final loss of clipped objective PPO:
            # policy term + 0.5 * value MSE - 0.01 * entropy bonus
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy
        self.policy_old.load_state_dict(self.policy.state_dict())

        # clear buffer
        self.buffer.clear()

    def save(self, checkpoint_path):
        """Write the weights of `policy_old` to `checkpoint_path`."""
        torch.save(self.policy_old.state_dict(), checkpoint_path)

    def load(self, checkpoint_path):
        """Load weights from `checkpoint_path` into both policy copies.

        The checkpoint is read once; `load_state_dict` copies the values into
        each network, so sharing the loaded dict between the two is safe.
        """
        state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
        self.policy_old.load_state_dict(state_dict)
        self.policy.load_state_dict(state_dict)
        
        
       



pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
Hello from the pygame community. https://www.pygame.org/contribute.html
Device set to : GeForce RTX 2060


################################################################################
> # **Part - II**

*   train PPO algorithm on environments
*   save pretrained network weights and log files

################################################################################

In [5]:


# Episode / reporting schedule; every interval is counted in environment timesteps.
max_ep_len = 400                    # hard cap on the length of one episode
print_freq = 4 * max_ep_len         # interval at which the average reward is recomputed
log_freq = 2 * max_ep_len           # interval for logging the average reward
save_model_freq = 20_000            # save model frequency

# Passed to PPO as `action_std_init`.
action_std = None

def TrainPong(config):
    """Ray Tune trainable: train a PPO agent on `pongGym.DoublePong`.

    Runs 251 episodes (`i_episode` 0 through 250), each capped at
    `max_ep_len` steps, updating the agent every
    `config["update_timestep"]` timesteps. When training ends, a single
    `avreward` value is reported to Tune.

    Args:
        config: dict with the keys "lr_actor", "lr_critic", "gamma",
            "K_epochs", "eps_clip" and "update_timestep".

    The reported `avreward` is the rounded mean episode reward of the most
    recently completed print window (`print_freq` timesteps). If no window
    completed, the mean over all finished episodes is reported instead.
    """
    # Imported here so the trainable does not rely on a later notebook cell
    # having been executed to bind `tune`.
    from ray import tune

    env = pongGym.DoublePong()
    try:
        # state space dimension
        state_dim = env.observation_space.shape[0]

        # action space dimension
        action_dim = env.action_space.n

        # initialize a PPO agent
        ppo_agent = PPO(state_dim, action_dim, config["lr_actor"], config["lr_critic"], config["gamma"],
                        config["K_epochs"], config["eps_clip"], action_std)

        # running totals for the current print window
        print_running_reward = 0
        print_running_episodes = 0
        print_avg_reward = None     # set once the first print window completes

        time_step = 0
        i_episode = 0

        # training loop
        while i_episode <= 250:

            state = env.reset()
            current_ep_reward = 0
            for _ in range(1, max_ep_len+1):

                # select action with policy
                action = ppo_agent.select_action(state)
                state, reward, done, _info = env.step(action)

                # saving reward and is_terminals
                # NOTE(review): an episode cut off by max_ep_len is stored with
                # done=False, so discounted returns can run across that
                # boundary into the next episode -- confirm this is intended.
                ppo_agent.buffer.rewards.append(reward)
                ppo_agent.buffer.is_terminals.append(done)

                time_step += 1
                current_ep_reward += reward

                # update PPO agent
                if time_step % config["update_timestep"] == 0:
                    ppo_agent.update()

                # refresh the windowed average reward; skipped when no episode
                # has finished in this window, to avoid dividing by zero
                if time_step % print_freq == 0 and print_running_episodes > 0:
                    print_avg_reward = round(print_running_reward / print_running_episodes, 2)
                    print_running_reward = 0
                    print_running_episodes = 0

                # break; if the episode is over
                if done:
                    break

            print_running_reward += current_ep_reward
            print_running_episodes += 1

            i_episode += 1
    finally:
        # release the environment even when training raises
        env.close()

    # No print window ever completed: fall back to the mean over the episodes
    # accumulated so far instead of failing on an unbound variable.
    if print_avg_reward is None:
        print_avg_reward = round(print_running_reward / print_running_episodes, 2)

    tune.report(avreward=print_avg_reward)





In [7]:
from ray import tune
from ray.tune.suggest import ConcurrencyLimiter
from ray.tune.suggest.bayesopt import BayesOptSearch

# Hyper-parameter search over the PPO settings read by TrainPong.
# Every entry uses a native Tune domain, so the bounds are visible to Tune and
# sampling does not depend on NumPy's global random state.
analysis = tune.run(
    TrainPong,
    config={
        "update_timestep": tune.randint(max_ep_len * 2, max_ep_len * 4),      # update policy every n timesteps
        "K_epochs": tune.randint(1, 60),               # update policy for K epochs
        "eps_clip": tune.uniform(0, 1),             # clip parameter for PPO
        "gamma": tune.uniform(0, 1),               # discount factor

        "lr_actor": tune.uniform(0.000001, 0.1),       # learning rate for actor network
        "lr_critic": tune.uniform(0.000001, 0.1),       # learning rate for critic network

        "random_seed": 0         # set random seed if required (0 = no random seed)
    },
    # Optional search algorithm, currently disabled.
    # NOTE(review): BayesOptSearch may reject the integer (randint) domains
    # above -- confirm before re-enabling.
#     search_alg=ConcurrencyLimiter(
#         BayesOptSearch(random_search_steps=4),
#         max_concurrent=2),
    stop={"avreward": 20},      # stop a trial once it reports avreward of 20
    num_samples=10,             # number of configurations to sample
)

print("Best config: ", analysis.get_best_config(
    metric="avreward", mode="max"))

# Get a dataframe for analyzing trial results.
df = analysis.results_df



[2m[36m(bundle_reservation_check_func pid=19072)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[36m(bundle_reservation_check_func pid=19072)[0m Hello from the pygame community. https://www.pygame.org/contribute.html


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,PENDING,,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,PENDING,,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,PENDING,,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,PENDING,,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,PENDING,,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,PENDING,,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,PENDING,,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,PENDING,,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,PENDING,,9,0.554034,0.747747,0.0491182,0.0566373,975


 pid=3956)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
 pid=3956)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
 pid=12760)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
 pid=12760)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
[2m[36m(pid=12060)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[36m(pid=12060)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
[2m[36m(pid=14744)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[36m(pid=14744)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
[2m[36m(pid=8940)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[36m(pid=8940)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
[2m[36m(pid=5924)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[36m(pid=5924)[0m Hello from the pygame community. https://www.pygame.org/contribute.html
[2m[36m(pid=4332)[0m pygame 2.1.0 (SDL 2.0.16, Python 3.9.6)
[2m[

Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975




Trial name,status,loc,K_epochs,eps_clip,gamma,lr_actor,lr_critic,update_timestep
TrainPong_0ea25_00000,RUNNING,127.0.0.1:19072,33,0.595794,0.0678902,0.0398071,0.0946692,861
TrainPong_0ea25_00001,RUNNING,127.0.0.1:12760,3,0.957571,0.12243,0.0161546,0.00714115,1128
TrainPong_0ea25_00002,RUNNING,127.0.0.1:3956,50,0.661534,0.653304,0.0317257,0.0508752,1490
TrainPong_0ea25_00003,RUNNING,127.0.0.1:8940,23,0.187536,0.159415,0.0803501,0.00198785,835
TrainPong_0ea25_00004,RUNNING,127.0.0.1:5924,5,0.123324,0.971315,0.0694425,0.00180382,944
TrainPong_0ea25_00005,RUNNING,127.0.0.1:4332,50,0.819511,0.780882,0.0413326,0.0864021,1337
TrainPong_0ea25_00006,RUNNING,127.0.0.1:14744,55,0.295955,0.0749162,0.000133926,0.0380577,1107
TrainPong_0ea25_00007,RUNNING,127.0.0.1:12060,16,0.723108,0.596551,0.0106558,0.0276353,937
TrainPong_0ea25_00008,RUNNING,127.0.0.1:10396,11,0.0558435,0.555242,0.0512582,0.0440808,1527
TrainPong_0ea25_00009,RUNNING,127.0.0.1:16252,9,0.554034,0.747747,0.0491182,0.0566373,975


KeyboardInterrupt: 