<a href="https://colab.research.google.com/github/dude123studios/AdvancedReinforcementLearning/blob/main/PPO_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch as T
from torch import nn
import torch.distributions as D 
from torch import optim as O
import numpy as np
import gym

In [2]:
device = T.device('cpu')

if(T.cuda.is_available()): 
    device = T.device('cuda:0') 
    T.cuda.empty_cache()
    print("Device set to : " + str(T.cuda.get_device_name(device)))
else:
    print("Device set to : cpu")

Device set to : Tesla K80


In [3]:
env = gym.make('Pendulum-v0')
state_shape = env.observation_space.shape[0]
action_shape = env.observation_space.shape[0]
action_bound = [env.action_space.low, env.action_space.high]
print(f'state shape: {state_shape}')
print(f'action shape: {action_shape}')
print(f'action bound: {action_bound}')

state shape: 3
action shape: 3
action bound: [array([-2.], dtype=float32), array([2.], dtype=float32)]


In [4]:
class PolicyNW(nn.Module):

    def __init__(self, state_shape, action_shape):
        super(PolicyNW, self).__init__()
        self.fc1 = nn.Linear(state_shape, 128)
        self.relu1 = nn.ReLU()
        self.fcmu = nn.Linear(128, action_shape)
        self.mu_act = nn.Tanh()
        self.fcsigma = nn.Linear(128, action_shape)
        self.sigma_act = nn.Softplus()

    def forward(self, x):

        x = self.relu1(self.fc1(x))
        mu = 2 * self.mu_act(self.fcmu(x))
        sigma = self.sigma_act(self.fcsigma(x))

        return mu, sigma

pi = PolicyNW(state_shape, action_shape).to(device)
old_pi = PolicyNW(state_shape, action_shape).to(device)

In [5]:
def update_old_pi():
    old_pi.load_state_dict(pi.state_dict())

update_old_pi()

In [6]:
v = nn.Sequential(nn.Linear(state_shape, 128), nn.ReLU(), nn.Linear(128, 1)).to(device)

In [7]:
pi_optim = O.Adam(pi.parameters(), lr=1e-3)
v_optim = O.Adam(v.parameters(), lr=2e-3)

In [8]:
#Hyper Parameters
num_episodes = 2000
num_timesteps = 200
gamma = 0.9
delta = 0.3
beta = 0.2
epsilon = 0.2
batch_size = 32
epochs = 10

In [9]:
def policy(state):
    with T.no_grad():
        state = T.tensor(state, dtype=T.float32).unsqueeze(0).to(device)
        mu, sigma = pi(state)
        dist = D.Normal(mu[0], sigma[0])
        action = dist.sample().unsqueeze(0)
        clipped = T.clip(action, min=-2, max=2)[0]
    
    return clipped.cpu().numpy()

In [10]:
def value(state):
    with T.no_grad():
        state = T.tensor(state, dtype=T.float32).unsqueeze(0).to(device)
        out = v(state)[0, 0]
    return out.cpu().numpy()

In [11]:
def train_step(state, action, reward, _beta):

    update_old_pi()

    state = T.tensor(state, dtype=T.float32).to(device)
    action = T.tensor(action, dtype=T.float32).to(device)
    reward = T.tensor(reward, dtype=T.float32).to(device)

    with T.no_grad():
        advantage_cost = reward - v(state)

    for _ in range(epochs):

        mu, sigma = pi(state)
        dist = D.Normal(mu, sigma)
        pi_prob = dist.log_prob(action)

        with T.no_grad():
            mu_, sigma_ = old_pi(state)
            dist_ = D.Normal(mu_, sigma_)
            old_pi_prob = dist_.log_prob(action)

        kl_div = nn.KLDivLoss()(pi_prob, old_pi_prob)

        ratio = pi_prob/(old_pi_prob + 1e-8)
        objective = ratio * advantage_cost

        clipped = T.minimum(objective, T.clip(ratio, 1 - epsilon, 1+epsilon) * advantage_cost)

        pi_loss = -T.mean(clipped - _beta * kl_div)

        pi_optim.zero_grad()
        pi_loss.backward()
        pi_optim.step()
    

    mean_kl = T.mean(kl_div)
    if mean_kl > 1.5 * delta:
        _beta *= 2.0
    elif mean_kl < delta/1.5:
        _beta *= 0.5
    
    advantage = reward - v(state)
    v_loss = T.mean(T.square(advantage))

    v_optim.zero_grad()
    v_loss.backward()
    v_optim.step()

    return _beta

In [12]:
for i in range(1, num_episodes + 1):

    state = env.reset()
    episode_states, episode_actions, episode_rewards = [], [], []
    Return = 0

    for t in range(num_timesteps):

        action = policy(state)

        next_state, reward, done, _ = env.step(action)

        episode_states.append(state)
        episode_rewards.append(reward)
        episode_actions.append(action)

        state = next_state

        Return += reward

        if (t+1) % batch_size == 0 or t == num_timesteps - 1:

            v_s_ = value(state)

            discounted_r = []
            for reward in episode_rewards[::-1]:
                v_s_ = reward + gamma * v_s_
                discounted_r.append(v_s_)
            
            discounted_r.reverse()

            es, ea, er = np.vstack(episode_states), np.vstack(episode_actions), np.array(discounted_r, np.float32)[:, np.newaxis]

            beta = train_step(es, ea, er, beta)

            episode_states, episode_actions, episode_rewards = [], [], []
    
    if i % 10 == 0:
        print('Episode: {}, Return: {}'.format(i, Return))

  "reduction: 'mean' divides the total loss by both the batch size and the support size."


Episode: 10, Return: -1522.7318403153251
Episode: 20, Return: -1436.8637394367115
Episode: 30, Return: -1342.8536676300869
Episode: 40, Return: -1345.5870603192936
Episode: 50, Return: -1274.18784320794
Episode: 60, Return: -1190.8840683438277
Episode: 70, Return: -1422.1157097649468
Episode: 80, Return: -1183.722822864579
Episode: 90, Return: -1200.5849702249411
Episode: 100, Return: -1204.150827137444
Episode: 110, Return: -1365.8312773984346
Episode: 120, Return: -1310.4954128822549
Episode: 130, Return: -1182.0792397321698
Episode: 140, Return: -1295.447985118234
Episode: 150, Return: -1227.4772388815893
Episode: 160, Return: -1205.4716990149443
Episode: 170, Return: -1517.9987064175868
Episode: 180, Return: -1206.5832777223195
Episode: 190, Return: -1310.6899235531148
Episode: 200, Return: -1591.7302249432098
Episode: 210, Return: -1508.1832235734726
Episode: 220, Return: -1512.0088363106863
Episode: 230, Return: -1431.5759301405528
Episode: 240, Return: -1489.595996586204
Episode