In [None]:
!pip install torch
!pip install gym[box2d]

In [27]:
import gym
import torch
import numpy as np
import time
import random
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

# pytorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import MultivariateNormal

# Run on the first CUDA device when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

# Environment shared by every cell below.
env = gym.make('LunarLanderContinuous-v2')
print('State shape: ', env.observation_space.shape)
print('Number of Actions: ', env.action_space)

cpu
State shape:  (8,)
Number of Actions:  Box([-1. -1.], [1. 1.], (2,), float32)


In [28]:
# Length of the observation vector and of the continuous action vector.
STATE_SIZE, ACTION_SIZE = env.observation_space.shape[0], env.action_space.shape[0]

# Action bounds are read from the first action dimension only; this assumes that
# every action dimension has the same high/low limits.
ACTION_HIGH = env.action_space.high[0]
ACTION_LOW = env.action_space.low[0]
print(STATE_SIZE, ACTION_SIZE, ACTION_HIGH, ACTION_LOW)

# Agent hyperparameters, including the replay-memory settings.
BUFFER_SIZE = 100_000  # replay buffer size
BATCH_SIZE = 64        # minibatch size
GAMMA = 0.99           # discount factor
TAU = 0.001            # for soft update of target parameters
LR = 0.0005            # learning rate
UPDATE_EVERY = 4       # how often to update the network

8 2 1.0 -1.0


In [29]:
class Memory:
    """Rollout storage: five parallel lists that are appended to step by step.

    Attributes: ``actions``, ``states``, ``logprobs``, ``rewards`` and
    ``is_terminals`` -- each one a plain Python list.
    """

    # Names of the list attributes owned by every instance.
    _FIELDS = ("actions", "states", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        for field in self._FIELDS:
            setattr(self, field, [])

    def clear_memory(self):
        """Empty every list in place; the list objects themselves are kept."""
        for field in self._FIELDS:
            getattr(self, field).clear()

class ActorCriticContinuous(nn.Module):
    """Actor and critic networks for continuous actions with a fixed-variance Gaussian policy.

    The actor maps a state to the mean of a multivariate normal distribution over
    actions. The covariance is diagonal and constant (``action_std ** 2`` on every
    action dimension). The critic maps a state to a single value estimate.

    Params
    ======
        state_dim (int): size of the state vector
        action_dim (int): number of action dimensions
        action_std (float): constant standard deviation of the action distribution
    """

    def __init__(self, state_dim, action_dim, action_std):
        super(ActorCriticContinuous, self).__init__()
        # Actor: the final Tanh bounds the action mean to [-1, 1]. A ReLU in this
        # position would clamp every mean to >= 0, so negative actions could only
        # ever come from sampling noise.
        self.actor = nn.Sequential(
                nn.Linear(state_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, action_dim),
                nn.Tanh()
                )
        # critic
        self.critic = nn.Sequential(
                nn.Linear(state_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Linear(64, 1)
                )
        # Per-dimension action variance. Registered as a non-persistent buffer so
        # that it follows the module through .to(...) without depending on a
        # notebook-level `device`, while staying out of state_dict() (checkpoint
        # keys are unchanged).
        self.register_buffer(
            "action_var",
            torch.full((action_dim,), float(action_std * action_std)),
            persistent=False,
        )

    def act(self, state, memory):
        """Sample an action for ``state`` and record the step in ``memory``.

        Appends ``state``, the sampled action and its log-probability to
        ``memory.states``, ``memory.actions`` and ``memory.logprobs``.

        Returns the sampled action, detached from the autograd graph.
        """
        action_mean = self.actor(state)
        cov_mat = torch.diag(self.action_var)

        dist = MultivariateNormal(action_mean, cov_mat)
        action = dist.sample()
        action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.detach()

    def evaluate(self, state, action):
        """Score ``action`` under the current policy for the given ``state``.

        Returns a tuple ``(action_logprobs, state_value, dist_entropy)``: the
        log-probability of ``action``, the critic output with size-1 dimensions
        squeezed away, and the entropy of the action distribution.
        """
        action_mean = self.actor(state)

        # One diagonal covariance matrix per row of action_mean.
        action_var = self.action_var.expand_as(action_mean)
        cov_mat = torch.diag_embed(action_var)

        dist = MultivariateNormal(action_mean, cov_mat)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        state_value = self.critic(state)

        return action_logprobs, torch.squeeze(state_value), dist_entropy

class PPOContinuousAgent:
    """PPO agent (clipped surrogate objective) for continuous action spaces.

    Actions are sampled from ``policy_old`` while data is collected; ``policy``
    is the network being optimised and is copied into ``policy_old`` after
    every update.

    Params
    ======
        state_dim (int): size of the state vector
        action_dim (int): number of action dimensions
        action_std (float): constant std of the action distribution
        lr (float): Adam learning rate
        betas (tuple): Adam betas
        gamma (float): discount factor
        K_epochs (int): number of optimisation passes per update
        eps_clip (float): clip range for the probability ratio
        update_timestep (int or None): run an update every this many calls to
            step(); when None, the notebook-level global ``update_timestep``
            is read at step() time (the original behaviour)
    """

    def __init__(self, state_dim, action_dim, action_std, lr, betas, gamma, K_epochs, eps_clip,
                 update_timestep=None):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        self.update_timestep = update_timestep
        self.time_step = 0
        self.memory = Memory()

        self.policy = ActorCriticContinuous(state_dim, action_dim, action_std).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)

        # Behaviour policy used for acting; starts as an exact copy of `policy`.
        self.policy_old = ActorCriticContinuous(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state):
        """Sample an action from the old policy for a single state.

        ``state`` is reshaped to one row before being fed to the network; the
        state, action and log-probability are stored in ``self.memory``.
        Returns the action as a flat NumPy array.
        """
        state = torch.FloatTensor(state.reshape(1, -1)).to(device)
        # Everything stored during a rollout is detached again in update(), so
        # there is no point in building autograd graphs here.
        with torch.no_grad():
            action = self.policy_old.act(state, self.memory)
        return action.cpu().data.numpy().flatten()

    def update(self):
        """Optimise the policy for ``K_epochs`` passes over the stored rollout."""
        # Monte Carlo estimate of the discounted returns, accumulated from the
        # last stored step backwards; the running sum restarts at every terminal
        # flag. Appending and reversing once keeps this linear in the rollout
        # length.
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(self.memory.rewards), reversed(self.memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        returns.reverse()

        # Normalizing the returns:
        rewards = torch.tensor(returns, dtype=torch.float32).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert lists to tensors (each stored entry carries a leading size-1 dim)
        old_states = torch.squeeze(torch.stack(self.memory.states).to(device), 1).detach()
        old_actions = torch.squeeze(torch.stack(self.memory.actions).to(device), 1).detach()
        old_logprobs = torch.squeeze(torch.stack(self.memory.logprobs), 1).to(device).detach()

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values under the current policy:
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Probability ratio (pi_theta / pi_theta_old):
            ratios = torch.exp(logprobs - old_logprobs)

            # Clipped surrogate loss + value loss - entropy bonus:
            advantages = rewards - state_values.detach()
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy

            # take gradient step
            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())

    def step(self, reward, done):
        """Record the outcome of the last action and update when enough steps are stored.

        Params
        ======
            reward (float): reward received for the last selected action
            done (bool): whether that step ended the episode
        """
        self.time_step += 1
        # Saving reward and is_terminals:
        self.memory.rewards.append(reward)
        self.memory.is_terminals.append(done)

        interval = self.update_timestep
        if interval is None:
            # No interval was given to the constructor: fall back to the
            # notebook-level global of the same name.
            interval = update_timestep

        # update if its time
        if self.time_step % interval == 0:
            self.update()
            self.memory.clear_memory()
            self.time_step = 0

    def act(self, state):
        """Alias of select_action(), so the agent matches the act()/step() interface used by train()."""
        return self.select_action(state)

In [30]:
def train(agent, n_episodes=100, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995,
          agent_type = "PPO_CONTINUOUS", environment=None):
    """Run the agent/environment interaction loop and record per-episode scores.

    Training stops early once the mean score over the last 100 episodes reaches
    200; the check is only made after 100 episodes have actually been played.

    Params
    ======
        agent: object exposing act(state) and step(...)
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of timesteps per episode
        eps_start (float): unused; kept so existing calls keep working
        eps_end (float): unused; kept so existing calls keep working
        eps_decay (float): unused; kept so existing calls keep working
        agent_type (str): "PPO_CONTINUOUS" (agent.step(reward, done)) or
            "SAC" (agent.step(state, action, reward, next_state, done))
        environment: object with reset() -> state and
            step(action) -> (next_state, reward, done, info); defaults to the
            notebook-level ``env``

    Returns
    =======
        (scores, avg_scores): the score of every episode, and the running mean
        over the last (up to) 100 episodes after each episode

    Raises
    ======
        ValueError: if agent_type is not one of the supported values
    """
    if agent_type not in ("SAC", "PPO_CONTINUOUS"):
        raise ValueError("Unsupported agent_type: {!r}".format(agent_type))
    if environment is None:
        environment = env

    scores = []  # list containing scores from each episode
    avg_scores = []  # list containing the running average after each episode
    scores_window = deque(maxlen=100)  # last 100 scores
    for i_episode in range(1, n_episodes + 1):
        state = environment.reset()
        score = 0
        for t in range(max_t):
            # choose an action and apply it in the environment
            action = agent.act(state)
            next_state, reward, done, _ = environment.step(action)

            # observe and learn (by the agent)
            if agent_type == "SAC":
                agent.step(state, action, reward, next_state, done)
            else:
                # The PPO agent only uses this flag to cut its discounted-return
                # sums, so a max_t cut-off must be reported as an episode
                # boundary too; otherwise the next episode's rewards would be
                # folded into this episode's returns.
                agent.step(reward, bool(done) or t == max_t - 1)

            # accumulate score and move to next state
            state = next_state
            score += reward

            # stop episode if done
            if done:
                break

        scores_window.append(score)  # save most recent score
        scores.append(score)  # save most recent score
        window_mean = np.mean(scores_window)
        avg_scores.append(window_mean)  # save current avg score

        print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean), end="")

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, window_mean))

        # Only a full 100-episode window counts; the message below reports
        # i_episode - 100, which is meaningless before that.
        if len(scores_window) == scores_window.maxlen and window_mean >= 200.0:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode - 100,
                                                                                         window_mean))
            break

    return scores,avg_scores

In [31]:
TOTAL_TIMESTEPS = 1500 # NOTE(review): despite the name, the training cell passes this to train() as n_episodes (an episode cap); it should be high enough for convergence to happen.
RUNS = 3 # number of training runs per agent, each with a different seed
#seed the environment.
def init_seed(seed):
    """Seed every random source the notebook uses with the same value.

    Covers the environment, Python's ``random`` module, NumPy and PyTorch
    (CPU and CUDA), and switches cuDNN to deterministic mode. Call this
    before creating and training an agent.
    """
    # Environment first.
    env.seed(seed)
    # Python-level and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)
    # PyTorch generators and cuDNN behaviour.
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed(seed)
    torch.manual_seed(seed)

In [None]:
# Per-run results: one entry is appended to each list for every seed.
ppo_scores = []
ppo_avg_scores = []
ppo_times = []

# NOTE: these four settings are not passed to train() below; train() uses its own
# hard-coded 200.0 threshold and its default max_t.
solved_reward = 200        # stop training if avg_reward > solved_reward
log_interval = 1           # print avg reward in the interval
max_episodes = 2000        # max training episodes
max_timesteps = 300        # max timesteps in one episode

update_timestep = 1200      # update policy every n timesteps (PPOContinuousAgent.step reads this module-level name)
action_std = 0.5            # constant std for action distribution (Multivariate Normal)
K_epochs = 80               # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
gamma = 0.99                # discount factor

lr = 0.0003                 # parameters for Adam optimizer
betas = (0.9, 0.999)

random_seed = 1             # not used below: each run is seeded with its run index instead

# Train one fresh agent per seed, timing each run and saving its policy weights.
for i in range(1, RUNS + 1):
    init_seed(i)
    PPO = PPOContinuousAgent(STATE_SIZE, ACTION_SIZE, action_std, lr, betas, gamma, K_epochs, eps_clip)
    start = time.time()
    score, avg_score = train(PPO, n_episodes=TOTAL_TIMESTEPS, agent_type="PPO_CONTINUOUS")
    end = time.time()
    ppo_scores.append(score)
    ppo_avg_scores.append(avg_score)
    ppo_times.append(end - start)
    torch.save(PPO.policy.state_dict(), "PPO_" + str(i) + ".pt")

In [33]:
def print_graph(name,i,clr='blue'):
    """Plot the per-episode scores and their running average for one stored PPO run.

    Params
    ======
        name (str): legend label; the averaged curve is labelled name + ' average'
        i (int): selector; the curves are read from index 3*i of the
            notebook-level ppo_scores / ppo_avg_scores lists
        clr (str): matplotlib colour used for both curves
    """
    # NOTE(review): with one list entry per run (three runs in the training
    # cell), only i == 0 is in range for a stride of 3 -- confirm the intended
    # indexing before calling this with other values.
    idx = 3 * i

    raw_curve = ppo_scores[idx]
    plt.plot(np.arange(len(raw_curve)), raw_curve, label=name, color=clr, alpha=0.3)

    mean_curve = ppo_avg_scores[idx]
    plt.plot(np.arange(len(mean_curve)), mean_curve, label=name + ' average', color=clr)

    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.legend()
    plt.show()

In [None]:
# Label list; not referenced anywhere else in this cell.
names = ['DQN', 'DQN_DUELING', 'DDQN', 'DDQN_DUELING']

# Plot the curves selected by index 0.
i = 0
print_graph('PPO_CONTINUOUS', i)