In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.distributions import Categorical



In [2]:
%matplotlib notebook

In [3]:
class PPO(nn.Module):
    def __init__(self, state_dim, action_dim, gamma=0.99, batch_size=128, 
                 epsilon=0.4, epoch_n=100, pi_lr=1e-4, v_lr=1e-3):

        super().__init__()
        
        self.action_dim = action_dim
        
        self.device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')
        
        self.pi_model_base = nn.Sequential(nn.Linear(state_dim, 64), nn.LeakyReLU(0.1),
                                           nn.Linear(64, 64), nn.LeakyReLU(0.1),
                                      nn.Linear(64, 64), nn.LeakyReLU(0.1)).to(self.device)
        self.pi_model_m = nn.Sequential(nn.Linear(64, action_dim), nn.Softmax())
        
        self.v_model = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                                     nn.Linear(128, 128), nn.ReLU(),
                                     nn.Linear(128, 1)).to(self.device)
        
        self.gamma = gamma
        self.batch_size = batch_size
        self.l_epsilon = 1
        self.epsilon = epsilon
        self.epoch_n = epoch_n
        self.pi_optimizer = torch.optim.Adam(
            list(self.pi_model_base.parameters())+list(self.pi_model_m.parameters()),
            lr=pi_lr)
        self.v_optimizer = torch.optim.Adam(self.v_model.parameters(), lr=v_lr)
        
    def forward_pi(self, state):
        x = self.pi_model_base(state)
        
        return self.pi_model_m(x)

    def get_action(self, state, rand_factor = 1):
        logits = self.forward_pi(torch.FloatTensor(state))
        norm = torch.ones((self.action_dim,))/self.action_dim
        logits_ = (1-self.l_epsilon)*logits + self.l_epsilon*norm
        dist = Categorical(logits_)
        action = dist.sample()
        return action.numpy()
    
    def to_cpu(self):
        self.pi_model_base.to('cpu')
        self.pi_model_m.to('cpu')
        self.v_model.to('cpu')
        
    def to_dev(self):
        self.pi_model_base.to(self.device)
        self.pi_model_m.to(self.device)
        self.v_model.to(self.device)

    def fit(self, states, actions, rewards, dones):
        
        states, actions, rewards, dones = map(np.array, [states, actions, rewards, dones])
        rewards, dones = rewards.reshape(-1, 1), dones.reshape(-1, 1)

        returns = np.zeros(rewards.shape)
        returns[-1] = rewards[-1]
        for t in range(returns.shape[0] - 2, -1, -1):
            returns[t] = rewards[t] + (1 - dones[t]) * self.gamma * returns[t + 1]

        states, actions, returns = map(torch.FloatTensor, [states, actions, returns])
        states, actions, returns = states.to(self.device), actions.to(self.device), returns.to(self.device)

        logits = self.forward_pi(states)
        norm = torch.ones((self.action_dim,))/self.action_dim
        logits_ = (1-self.l_epsilon)*logits + self.l_epsilon*norm
        dist = Categorical(logits_)
        old_log_probs = dist.log_prob(actions).detach()

        for epoch in range(self.epoch_n):
            
            idxs = np.random.permutation(returns.shape[0])
            for i in range(0, returns.shape[0], self.batch_size):
                b_idxs = idxs[i: i + self.batch_size]
                b_states = states[b_idxs]
                b_actions = actions[b_idxs]
                b_returns = returns[b_idxs]
                b_old_log_probs = old_log_probs[b_idxs]
    
                b_advantage = b_returns.detach() - self.v_model(b_states)
                
                b_logits = self.forward_pi(b_states)
                b_dist = Categorical(b_logits)
                b_new_log_probs = b_dist.log_prob(b_actions)
    
                b_ratio = torch.exp(b_new_log_probs - b_old_log_probs)
                pi_loss_1 = b_ratio * b_advantage.detach()
                pi_loss_2 = torch.clamp(b_ratio, 1. - self.epsilon,  1. + self.epsilon) * b_advantage.detach()
                pi_loss = - torch.mean(torch.min(pi_loss_1, pi_loss_2))
                
                pi_loss.backward()
                self.pi_optimizer.step()
                self.pi_optimizer.zero_grad()
                
                v_loss = torch.mean(b_advantage ** 2)
    
                v_loss.backward()
                self.v_optimizer.step()
                self.v_optimizer.zero_grad()

In [4]:
%%time

env = gym.make("Acrobot-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = PPO(state_dim, action_dim)

total_rewards = []


CPU times: total: 15.6 ms
Wall time: 49 ms


In [16]:
%%time

episode_n = 60
trajectory_n = 50

for episode in range(episode_n):

    states, actions, rewards, dones = [], [], [], []
    
    agent.to_cpu()
    for _ in range(trajectory_n):
        total_reward = 0

        state = env.reset()
        for t in range(500):
            states.append(state)
            
            action = agent.get_action(state)
            actions.append(action)
            
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            dones.append(done)

            total_reward += reward
            if done:
                break
            
        total_rewards.append(total_reward)
    agent.to_dev()
    agent.fit(states, actions, rewards, dones)
    agent.l_epsilon -= 0.02
    if agent.l_epsilon <= 0:
        agent.l_epsilon = 0
    
    print(episode,total_reward,end='\r')

CPU times: total: 27min 26s
Wall time: 10min 23s


In [19]:
plt.plot(np.convolve(total_rewards,np.ones(10)/10,mode='valid'), label = r'$target = G_t$')
plt.title('sliding window = 10 trajectories')
plt.legend()
plt.ylabel('reward')
plt.xlabel('# trajectories')
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

In [15]:
plt.savefig('chubar6_3.png',dpi=300, bbox_inches='tight', facecolor='w')

In [132]:
runs = 100
do_render = False
rews = []
agent.to_cpu()
for i in range(runs):
    state = env.reset()
    total_reward = 0
    for t in range(500):
        states.append(state)
        action = agent.get_action(state,1)
        actions.append(action)

        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        dones.append(done)
        if do_render:
            env.render()
        total_reward += reward
        if done:
            break
    rews.append(total_reward)
print(np.mean(rews),np.min(rews),np.median(rews))
agent.to_dev()

-159.05 -401.0 -149.5


In [131]:
agent.l_epsilon = 0.5

In [68]:
class PPO(nn.Module):
    def __init__(self, state_dim, action_dim, gamma=0.99, batch_size=128, 
                 epsilon=0.4, epoch_n=40, pi_lr=1e-4, v_lr=1e-3):

        super().__init__()
        
        self.action_dim = action_dim
        
        self.device = torch.device('cpu' if torch.cuda.is_available() else 'cpu')
        
        self.pi_model_base = nn.Sequential(nn.Linear(state_dim, 64), nn.LeakyReLU(0.1),
                                           nn.Linear(64, 64), nn.LeakyReLU(0.1),
                                      nn.Linear(64, 64), nn.LeakyReLU(0.1)).to(self.device)
        self.pi_model_m = nn.Sequential(nn.Linear(64, action_dim), nn.Softmax())
        
        self.v_model = nn.Sequential(nn.Linear(state_dim, 128), nn.ReLU(),
                                     nn.Linear(128, 128), nn.ReLU(),
                                     nn.Linear(128, 1)).to(self.device)
        
        self.gamma = gamma
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.epoch_n = epoch_n
        self.pi_optimizer = torch.optim.Adam(
            list(self.pi_model_base.parameters())+list(self.pi_model_m.parameters()),
            lr=pi_lr)
        self.v_optimizer = torch.optim.Adam(self.v_model.parameters(), lr=v_lr)
        
    def forward_pi(self, state):
        x = self.pi_model_base(state)
        
        return self.pi_model_m(x)

    def get_action(self, state, rand_factor = 1):
        logits = self.forward_pi(torch.FloatTensor(state))
        dist = Categorical(logits)
        action = dist.sample()
        return action.numpy()
    
    def to_cpu(self):
        self.pi_model_base.to('cpu')
        self.pi_model_m.to('cpu')
        self.v_model.to('cpu')
        
    def to_dev(self):
        self.pi_model_base.to(self.device)
        self.pi_model_m.to(self.device)
        self.v_model.to(self.device)

    def fit(self, states, actions, rewards, dones):
        
        states, actions, rewards, dones = map(np.array, [states, actions, rewards, dones])
        rewards, dones = rewards.reshape(-1, 1), dones.reshape(-1, 1)

        returns = np.zeros(rewards.shape)
        returns[-1] = rewards[-1]
        for t in range(returns.shape[0] - 2, -1, -1):
            returns[t] = rewards[t] + (1 - dones[t]) * self.gamma * returns[t + 1]

        states, actions, returns = map(torch.FloatTensor, [states, actions, returns])
        states, actions, returns = states.to(self.device), actions.to(self.device), returns.to(self.device)

        logits = self.forward_pi(states)
        dist = Categorical(logits)
        old_log_probs = dist.log_prob(actions).detach()

        for epoch in range(self.epoch_n):
            
            idxs = np.random.permutation(returns.shape[0])
            for i in range(0, returns.shape[0], self.batch_size):
                b_idxs = idxs[i: i + self.batch_size]
                b_states = states[b_idxs]
                b_actions = actions[b_idxs]
                b_returns = returns[b_idxs]
                b_old_log_probs = old_log_probs[b_idxs]
    
                b_advantage = b_returns.detach() - self.v_model(b_states)
                
                b_logits = self.forward_pi(b_states)
                b_dist = Categorical(b_logits)
                b_new_log_probs = b_dist.log_prob(b_actions)
    
                b_ratio = torch.exp(b_new_log_probs - b_old_log_probs)
                pi_loss_1 = b_ratio * b_advantage.detach()
                pi_loss_2 = torch.clamp(b_ratio, 1. - self.epsilon,  1. + self.epsilon) * b_advantage.detach()
                pi_loss = - torch.mean(torch.min(pi_loss_1, pi_loss_2))
                
                pi_loss.backward()
                self.pi_optimizer.step()
                self.pi_optimizer.zero_grad()
                
                v_loss = torch.mean(b_advantage ** 2)
    
                v_loss.backward()
                self.v_optimizer.step()
                self.v_optimizer.zero_grad()

In [69]:
%%time

env = gym.make("LunarLander-v2")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

agent = PPO(state_dim, action_dim)

total_rewards = []


CPU times: total: 0 ns
Wall time: 4 ms


In [72]:
%%time

episode_n = 20
trajectory_n = 40

for episode in range(episode_n):

    states, actions, rewards, dones = [], [], [], []
    
    agent.to_cpu()
    for _ in range(trajectory_n):
        total_reward = 0

        state = env.reset()
        for t in range(1000):
            states.append(state)
            
            action = agent.get_action(state)
            actions.append(action)
            
            state, reward, done, _ = env.step(action)
            rewards.append(reward)
            dones.append(done)

            total_reward += reward
            if done:
                break
            
        total_rewards.append(total_reward)
    agent.to_dev()
    agent.fit(states, actions, rewards, dones)
    
    print(episode,total_reward,end='\r')

CPU times: total: 28min 56s
Wall time: 10min 59s


In [73]:
plt.plot(total_rewards)
plt.title('Total Rewards')
plt.grid()
plt.show()

<IPython.core.display.Javascript object>

In [75]:
runs = 1
do_render = True
rews = []
agent.to_cpu()
for i in range(runs):
    state = env.reset()
    total_reward = 0
    for t in range(1000):
        states.append(state)
        action = agent.get_action(state)
        actions.append(action)

        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        dones.append(done)
        if do_render:
            env.render()
        total_reward += reward
        if done:
            break
    rews.append(total_reward)
print(np.mean(rews),np.min(rews),np.median(rews))
agent.to_dev()

112.93157733763553 112.93157733763553 112.93157733763553
