In [None]:
import torch    
import torch.autograd as autograd         
import torch.nn as nn                     
import torch.nn.functional as F           
import torch.optim as optim               
import torch.distributions

import pandas as pd
import numpy as np

import holoviews as hv
from holoviews import opts
from holoviews.streams import Pipe, Buffer

import streamz
import streamz.dataframe

import pdb, gym

hv.extension('bokeh')

In [None]:
# Mini-batch index sampler, adapted from Shangtong Zhang's code
def random_sample(indices, batch_size):
    """Yield randomly shuffled mini-batches drawn from ``indices``.

    ``indices`` is shuffled once with ``np.random.permutation`` (so the global
    NumPy seed controls the order). Full batches of ``batch_size`` entries are
    yielded first; if the total is not a multiple of ``batch_size`` the
    remaining entries are yielded as one final, shorter batch.
    """
    shuffled = np.asarray(np.random.permutation(indices))
    total = len(shuffled)
    leftover = total % batch_size

    # Everything before the leftover tail divides evenly into full batches
    yield from shuffled[:total - leftover].reshape(-1, batch_size)

    # Whatever does not fill a complete batch goes out last
    if leftover:
        yield shuffled[-leftover:]

In [None]:
# Setup training progress output
def init_training_progress():
    """Create the streaming objects used to plot training progress.

    Returns a tuple ``(stream, raw_dmap, smooth_dmap)``:

    * ``stream`` - the ``streamz.Stream`` backing a streaming dataframe whose
      example frame has a single column ``x``.
    * ``raw_dmap`` - a ``hv.DynamicMap`` of ``hv.Curve`` fed by a buffer over
      the streaming dataframe, labelled ``'raw'``.
    * ``smooth_dmap`` - the same, but fed by a rolling median of column ``x``,
      labelled ``'smooth'``.
    """
    history_length = 1000000   # length passed to the raw buffer
    window = 25                # rolling window used for the smoothed curve

    stream = streamz.Stream()
    template = pd.DataFrame({'x': [0]}, index=[0])
    sdf = streamz.dataframe.DataFrame(stream, example=template)

    raw_buffer = Buffer(sdf, length=history_length)
    smooth_buffer = Buffer(sdf.x.rolling(window).median())

    def curve_map(buffer, label):
        # One live curve per buffer, relabelled so the plots can be told apart
        return hv.DynamicMap(hv.Curve, streams=[buffer]).relabel(label)

    raw_dmap = curve_map(raw_buffer, 'raw')
    smooth_dmap = curve_map(smooth_buffer, 'smooth')

    return stream, raw_dmap, smooth_dmap

In [None]:
class PolicyNet(nn.Module):
    """Gaussian policy: maps states to per-action means and standard deviations.

    Two leaky-ReLU hidden layers feed a tanh output layer, so every action mean
    lies in (-1, 1). The standard deviations do not depend on the state: they
    come from a single learnable ``log_sigma`` vector (one entry per action),
    initialised to zero, i.e. sigma = 1.

    Args:
        state_size: number of features in a state vector.
        action_size: number of action dimensions.
        hidden_size: width of both hidden layers (defaults to 32).
    """

    def __init__(self, state_size, action_size, hidden_size=32):
        super(PolicyNet, self).__init__()

        # Hidden layers
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # Output layer of action means
        self.fc3 = nn.Linear(hidden_size, action_size)

        # Standard deviations are learned separately from the means.
        # Assigning an nn.Parameter registers it with the module, so no
        # explicit register_parameter call is needed.
        self.log_sigma = nn.Parameter(torch.zeros(action_size))

    def forward(self, x):
        """Return ``(means, sigmas)`` for the batch of states ``x``.

        Both tensors have the shape of the output layer's result; ``sigmas`` is
        ``exp(log_sigma)`` broadcast to that shape.
        """
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))

        means = torch.tanh(self.fc3(x))
        sigmas = torch.exp(self.log_sigma).expand(means.shape)

        return means, sigmas

In [None]:
class ValueNet(nn.Module):
    """State-value network: maps a state to a single scalar estimate.

    Two leaky-ReLU hidden layers feed a one-unit output passed through a
    sigmoid, so the estimate always lies in (0, 1).

    Args:
        state_size: number of features in a state vector.
        hidden_size: width of both hidden layers (defaults to 32).
    """

    def __init__(self, state_size, hidden_size=32):
        super(ValueNet, self).__init__()

        # Hidden layers
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)

        # Output layer - single state value
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the value estimate for each state in ``x`` (last dim of size 1)."""
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        # NOTE(review): the sigmoid bounds the value to (0, 1); confirm this
        # matches the scale of the returns the critic is meant to predict.
        x = torch.sigmoid(self.fc3(x))
        return x

In [None]:
class Rollout():
    """Buffer of per-step experience gathered from a batch of parallel trajectories.

    Each step contributes one state, action and log-probability tensor (via
    ``record_decision``) and one reward entry (via ``record_outcome``).
    ``flatten_trajectories`` indexes dimension 1 of the stacked rewards and
    flattens on the last dimension of states/actions, so it requires each
    reward to be a 1-D array (one value per trajectory) and each
    state/action/log-prob to be a 2-D tensor (trajectory x feature).
    """

    def __init__(self):
        self.start_rollout()

    def start_rollout(self):
        """Discard everything recorded so far and start with empty lists."""
        self.states = []
        self.actions = []
        self.rewards = []
        self.log_probs = []

    def __len__(self):
        """Number of decisions recorded since the rollout was started."""
        return len(self.states)

    def record_decision(self, state, action, log_prob):
        """Store the state seen, the action taken and its log-probability."""
        self.states.append(state)
        self.actions.append(action)
        self.log_probs.append(log_prob)

    def record_outcome(self, reward):
        """Store the reward(s) received for the most recent decision."""
        self.rewards.append(reward)

    def flatten_trajectories(self):
        """Turn the recorded steps into flat training tensors.

        Returns:
            A tuple ``(states, actions, normalised_future_returns,
            original_policy_probs)``. States and actions are reshaped to
            ``(-1, last_dim)``; the other two are 1-D. All four are ordered
            step-major (every trajectory of step 0, then step 1, ...).

        Future returns are the reward-to-go at each step, normalised at every
        step across the trajectories (mean and standard deviation over
        dimension 1). With a single trajectory there is no spread to measure,
        so the deviation is taken as zero and the normalised returns come out
        as 0 rather than NaN.
        """
        # Create tensors from states and actions
        states_tensors = torch.stack(self.states)
        actions_tensors = torch.stack(self.actions)

        # Reward-to-go: cumulative sum of rewards taken backwards from the end
        future_returns = np.cumsum(self.rewards[::-1], axis=0)[::-1].copy()
        future_returns = torch.tensor(future_returns).float()

        # Normalise across trajectories at each step
        mean = future_returns.mean(dim=1, keepdim=True)
        if future_returns.shape[1] > 1:
            sigma = future_returns.std(dim=1, keepdim=True)
        else:
            # torch.std of a single element is NaN, which would turn every
            # normalised return (and then the policy weights) into NaN.
            sigma = torch.zeros_like(mean)
        normalised_future_returns = (future_returns - mean) / (sigma + 1.0e-10)

        # Joint log-probability of each action vector (sum over action dims).
        # Detached: the data-collecting policy is treated as a constant.
        original_policy_log_probs = torch.stack(self.log_probs).sum(-1).detach()
        original_policy_probs = torch.exp(original_policy_log_probs)

        # Flatten trajectories
        return (states_tensors.view(-1, states_tensors.shape[-1]),
                actions_tensors.view(-1, actions_tensors.shape[-1]),
                normalised_future_returns.view(-1),
                original_policy_probs.view(-1))


In [None]:
class Agent():
    """PPO-style agent with a Gaussian policy and a clipped surrogate objective.

    Decisions are recorded into a ``Rollout``; once ``rollout_length`` decisions
    have accumulated (or the episode ends) the actor is updated from them.

    Args:
        state_size: number of features in a state vector.
        action_size: number of action dimensions.
        lr: learning rate handed to the Adam optimiser.
        clipping_epsilon: half-width of the clipping interval around 1 for the
            probability ratio.
        ppo_epochs: number of passes over a rollout per update.
        minibatch_size: number of flattened samples per gradient step.
        rollout_length: number of decisions collected before an update.
    """

    def __init__(self, state_size, action_size, lr=1e-3, clipping_epsilon=0.1,
                 ppo_epochs=10, minibatch_size=32, rollout_length=1000):
        self.lr = lr
        self.clipping_epsilon = clipping_epsilon
        self.ppo_epochs = ppo_epochs
        self.minibatch_size = minibatch_size
        self.rollout_length = rollout_length

        self.actor = PolicyNet(state_size, action_size)
        # NOTE(review): the critic is constructed but no method of this class
        # uses it; learn() weights the surrogate with normalised returns only.
        self.critic = ValueNet(state_size)

        self.rollout = Rollout()

    def start_episode(self):
        """Reset the per-episode reward log and the rollout buffer."""
        self.episode_rewards = []
        self.rollout.start_rollout()

    def act(self, state):
        """Sample an action for ``state`` from the current policy and record it.

        If the rollout already holds ``rollout_length`` decisions, the agent
        first learns from it and starts a fresh one.

        Returns:
            The sampled action tensor.
        """
        # Check if the rollout is full and needs processing
        if len(self.rollout) == self.rollout_length:
            self.learn()
            self.rollout.start_rollout()

        # Sampling relies on the global torch RNG as seeded by the caller; it
        # must not be reseeded here, otherwise every step draws the same noise.
        # No autograd graph is needed: the stored log-prob is only ever used as
        # a detached constant when the rollout is flattened.
        with torch.no_grad():
            m = self.action_distribution_for_states(state)
            action = m.sample()
            log_prob = m.log_prob(action)

        # Record decision and return sampled action
        self.rollout.record_decision(state, action, log_prob)
        return action

    def finish_episode(self):
        """Learn from whatever is left in the rollout at the end of an episode."""
        self.learn()

    def record_outcome(self, reward):
        """Record the reward(s) that followed the most recent action."""
        self.episode_rewards.append(reward)
        self.rollout.record_outcome(reward)

    def action_distribution_for_states(self, states):
        """Return the Normal distribution the actor assigns to ``states``."""
        means, sigmas = self.actor(states)
        return torch.distributions.Normal(means, sigmas)

    def average_episode_return(self):
        """Sum over the episode's steps of the mean of each recorded reward."""
        return np.sum([np.mean(r) for r in self.episode_rewards])

    def get_current_policy_probs(self, states, actions):
        """Probability of each action vector in ``actions`` under the current policy."""
        # Per-dimension log probs from the current policy
        current_policy_log_probs = self.action_distribution_for_states(states).log_prob(actions)

        # Sum log probs over the action dimensions to get the joint log prob
        current_policy_log_probs = current_policy_log_probs.sum(-1)

        return torch.exp(current_policy_log_probs)

    def learn(self):
        """Update the actor from the current rollout using the clipped PPO surrogate.

        Does nothing when the rollout is empty (e.g. an episode that ended
        before any action was taken).
        """
        if len(self.rollout) == 0:
            return

        (states, actions, normalised_future_returns, original_policy_probs) = \
            self.rollout.flatten_trajectories()

        # NOTE(review): a fresh Adam optimiser is created on every call, so its
        # moment estimates do not carry over from one rollout to the next.
        optimiser = optim.Adam(self.actor.parameters(), lr=self.lr, eps=1e-5)

        # Run through PPO epochs
        for ppo_epoch in range(self.ppo_epochs):

            # Sample the flattened steps randomly in mini-batches
            for indices in random_sample(np.arange(states.shape[0]), self.minibatch_size):

                states_sample = states[indices]
                actions_sample = actions[indices]
                normalised_future_returns_sample = normalised_future_returns[indices]
                original_policy_probs_sample = original_policy_probs[indices]

                # Probabilities of the sampled actions under the *current*
                # policy; their ratio to the original ones re-weights the
                # objective, which is what allows the rollout to be reused.
                current_policy_probs_sample = self.get_current_policy_probs(states_sample, actions_sample)

                # Define PPO surrogate and clip
                sampling_ratio = current_policy_probs_sample / original_policy_probs_sample
                clip = torch.clamp(sampling_ratio, 1 - self.clipping_epsilon, 1 + self.clipping_epsilon)
                clipped_surrogate = torch.min(
                    sampling_ratio * normalised_future_returns_sample,
                    clip * normalised_future_returns_sample)

                # Mean over the mini-batch; negated because the optimiser
                # minimises while the surrogate should be maximised.
                loss = -torch.mean(clipped_surrogate)

                # Update model
                optimiser.zero_grad()
                loss.backward()
                optimiser.step()
    


In [None]:
def play_episode_and_learn(env, agent, max_episode_length=1000):
    """Play one episode in ``env`` with ``agent``, letting the agent learn as it goes.

    The environment is reset in training mode, then stepped with the agent's
    actions for at most ``max_episode_length`` steps. The loop stops early as
    soon as any entry of the environment's ``local_done`` flags is true. The
    agent's ``finish_episode`` hook is always called at the end.

    Relies on a module-level ``brain_name`` to index the result of
    ``env.reset`` / ``env.step``; it must be defined before calling this.
    """
    # Reset and read the initial observations
    info = env.reset(train_mode=True)[brain_name]
    state = torch.from_numpy(info.vector_observations).float()

    agent.start_episode()

    for _step in range(max_episode_length):

        # Ask the agent for actions and apply them to the environment
        chosen = agent.act(state)
        info = env.step(chosen.numpy())[brain_name]

        observations = torch.from_numpy(info.vector_observations).float()
        step_rewards, finished = info.rewards, info.local_done

        # Let the agent see what its actions earned, then move on
        agent.record_outcome(step_rewards)
        state = observations

        # The networks expect rectangular input, so one early finish ends all
        if np.any(finished):
            print("Someone finished")
            break

    # Finalise episode
    agent.finish_episode()

In [None]:
# Smoke test: drive the agent through one short episode against dummy data,
# without a real environment. Seeds are fixed so the run is reproducible.
torch.manual_seed(0)
np.random.seed(0)
agent = Agent(33, 4, lr=3e-4, rollout_length=3)

agent.start_episode()

# Dummy initial observations: 20 rows of 33-dimensional zero states
states = np.zeros((20,33))

for i in range(3):

    # Get actions (clamped to [-1, 1]; the clamped values are not used further)
    actions = agent.act(torch.from_numpy(states).float())
    actions = torch.clamp(actions, -1, 1)

    # Dummy interact with env
    next_states = (i+1) * 0.1 * np.ones((20,33))
    rewards = 0.1 * np.ones(20)
    dones = [False] * 20

    # Teach agent
    agent.record_outcome(rewards)

    # Advance
    states = next_states

agent.finish_episode()