In [None]:
import torch    
import torch.autograd as autograd         
import torch.nn as nn                     
import torch.nn.functional as F           
import torch.optim as optim               
import torch.distributions

import pandas as pd
import numpy as np

import holoviews as hv
from holoviews import opts
from holoviews.streams import Pipe, Buffer

import streamz
import streamz.dataframe

import pdb, gym

hv.extension('bokeh')

In [None]:
# Setup training progress output
def init_training_progress(max_length=1000000, rolling_size=25):
    """Build the live training-progress curves and the stream that feeds them.

    Args:
        max_length: Maximum number of rows retained by each plot buffer.
        rolling_size: Window size (in emitted rows) of the rolling median
            used for the smoothed curve.

    Returns:
        A ``(training_stream, training_raw_dmap, training_smooth_dmap)``
        tuple, in that order: the ``streamz.Stream`` that callers ``emit``
        DataFrames with a single ``'x'`` column into, the DynamicMap of the
        raw values (labelled ``'raw'``) and the DynamicMap of their rolling
        median (labelled ``'smooth'``).
    """
    training_stream = streamz.Stream()

    # The example frame declares the schema of the frames that will be emitted.
    example = pd.DataFrame({'x': [0]}, index=[0])
    training_sdf = streamz.dataframe.DataFrame(training_stream, example=example)

    # Give both buffers the same explicit length so the smoothed curve keeps
    # as much history as the raw one instead of relying on Buffer's own default.
    training_raw_buffer = Buffer(training_sdf, length=max_length)
    training_smooth_buffer = Buffer(
        training_sdf.x.rolling(rolling_size).median(),
        length=max_length,
    )

    training_raw_dmap = hv.DynamicMap(hv.Curve, streams=[training_raw_buffer]).relabel('raw')
    training_smooth_dmap = hv.DynamicMap(hv.Curve, streams=[training_smooth_buffer]).relabel('smooth')

    return training_stream, training_raw_dmap, training_smooth_dmap

In [None]:
class PolicyNet(nn.Module):
    """Gaussian policy: maps a batch of states to action means and std. devs.

    The means come from a small fully connected network squashed into
    ``[-1, 1]`` by ``tanh``. The standard deviations do not depend on the
    state: they are ``exp(log_sigma)`` for a learned ``log_sigma`` parameter.

    Args:
        state_size: Number of continuous input state variables.
        actions_size: Number of continuous output actions.
        hidden_size: Width of each of the three hidden layers.
    """

    def __init__(self, state_size=2, actions_size=1, hidden_size=32):
        super(PolicyNet, self).__init__()

        # Hidden layers
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)

        # Output layer of action means
        self.fc4 = nn.Linear(hidden_size, actions_size)

        # Standard deviations are learned separately from the means, as one
        # log-sigma per action. Assigning an nn.Parameter registers it, so no
        # prior register_parameter() call is needed.
        self.log_sigma = nn.Parameter(torch.ones(actions_size), requires_grad=True)

    def forward(self, x):
        """Return ``(means, sigmas)`` for a batch of states.

        Args:
            x: Float tensor whose last dimension is ``state_size``.

        Returns:
            Two tensors of identical shape ``(..., actions_size)``: the
            tanh-squashed action means and the standard deviations
            (``exp(log_sigma)`` broadcast to the shape of the means).
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))

        # torch.tanh is the supported spelling; F.tanh has been flagged as
        # deprecated by PyTorch.
        means = torch.tanh(self.fc4(x))
        sigmas = torch.exp(self.log_sigma).expand(means.shape)

        return means, sigmas

In [None]:
class Agent():
    """REINFORCE agent wrapping a Gaussian policy and its Adam optimiser.

    Args:
        policy: Module that, when called on a batch of states, returns a
            ``(means, sigmas)`` pair of tensors.
        lr: Learning rate handed to ``optim.Adam``.
    """

    def __init__(self, policy, lr=1e-2):
        self.policy = policy
        self.optimiser = optim.Adam(policy.parameters(), lr=lr)

    def act(self, states):
        """Sample one action per state from the current policy.

        Args:
            states: Batch of states accepted by the policy.

        Returns:
            ``(actions, log_probs)``: the sampled actions and the log
            probability of each sampled action under the policy.
        """
        # Call the module itself rather than .forward() so that any hooks
        # registered on the policy are honoured.
        means, sigmas = self.policy(states)
        m = torch.distributions.Normal(means, sigmas)

        # Sample the distribution to select an action (for each environment)
        actions = m.sample()

        return actions, m.log_prob(actions)

    def learn(self, per_trajectory_log_prob, per_trajectory_rewards):
        """Take one REINFORCE gradient step from a batch of trajectories.

        Args:
            per_trajectory_log_prob: List with one tensor per time step, each
                holding the log probabilities returned by ``act`` for that
                step (one row per trajectory).
            per_trajectory_rewards: List with one sequence per time step,
                each holding one reward per trajectory.
        """
        eps = 1e-8

        # Sum log probabilities over any action dimensions and over time,
        # leaving one value per trajectory.
        stacked_log_prob = torch.stack(per_trajectory_log_prob)
        if stacked_log_prob.dim() > 2:
            stacked_log_prob = stacked_log_prob.flatten(start_dim=2).sum(dim=2)
        per_trajectory_log_prob_sum = stacked_log_prob.sum(dim=0)

        # Undiscounted return per trajectory, forced to the log-prob dtype so
        # integer rewards do not yield an integer tensor.
        per_trajectory_return = torch.as_tensor(
            per_trajectory_rewards,
            dtype=per_trajectory_log_prob_sum.dtype,
            device=per_trajectory_log_prob_sum.device,
        ).sum(dim=0)

        # Normalise the returns across trajectories. With a single trajectory
        # the standard deviation is undefined, so the raw return is used; eps
        # keeps the division finite when every trajectory has the same return.
        if per_trajectory_return.numel() > 1:
            centred = per_trajectory_return - per_trajectory_return.mean()
            per_trajectory_return = centred / (per_trajectory_return.std() + eps)

        # Define loss based on REINFORCE
        per_trajectory_loss = -per_trajectory_return * per_trajectory_log_prob_sum
        loss = torch.mean(per_trajectory_loss)

        # Update model
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()

In [None]:
def collect_trajectories(envs, agent, time_max=999):
    """Run one episode on every environment in lock-step and record it.

    Args:
        envs: Sequence of environments exposing ``reset()`` (returning an
            observation) and ``step(action)`` (returning an
            ``(observation, reward, done, info)`` 4-tuple).
        agent: Object whose ``act(states)`` returns ``(actions, log_probs)``
            tensors with one row per environment.
        time_max: Maximum number of time steps to take.

    Returns:
        ``(per_trajectory_log_prob, per_trajectory_rewards)``: two lists with
        one entry per time step taken. Each log-prob entry is the tensor
        returned by ``agent.act``; each reward entry is a list holding one
        reward per environment.
    """
    per_trajectory_log_prob = []
    per_trajectory_rewards = []

    # Run concurrent episode on all environments. Stacking through numpy
    # first avoids building a tensor from a Python list of arrays.
    states = torch.as_tensor(
        np.asarray([env.reset() for env in envs]), dtype=torch.float32
    )
    for _ in range(time_max):

        # Calculate all next actions for all envs
        actions, log_probs = agent.act(states)

        # Environments work with numpy actions, so hand each one a plain
        # ndarray row instead of a torch tensor.
        env_actions = actions.detach().cpu().numpy()

        # Step every env once with its own action
        next_states, rewards, dones = [], [], []
        for env, action in zip(envs, env_actions):
            next_state, reward, done, _info = env.step(action)
            next_states.append(next_state)
            rewards.append(reward)
            dones.append(done)

        # Pack up the next_states ready to send back to the agent
        states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32)

        # Record result
        per_trajectory_log_prob.append(log_probs)
        per_trajectory_rewards.append(rewards)

        # We want rectangular input to network, so if any finish early we finish all early
        if any(dones):
            break

    return (per_trajectory_log_prob, per_trajectory_rewards)

In [None]:
# Build the pool of environments that are stepped side by side
num_envs = 10


def make_env(env_id, rank, seed=0):
    """Create the gym environment ``env_id`` and seed it with ``seed + rank``."""
    environment = gym.make(env_id)
    environment.seed(seed + rank)
    return environment


envs = [
    make_env('MountainCarContinuous-v0', env_rank)
    for env_rank in range(num_envs)
]

In [None]:
# Bookkeeping for the returns record. It lives in its own cell so that
# re-running the training cell continues from the existing values.
average_returns = list()
time_index = 0

# Policy network and the agent that trains it
policy = PolicyNet()
agent = Agent(policy=policy, lr=1e-2)

In [None]:
# Training progress
# init_training_progress() returns (stream, raw curve, smoothed curve) in that
# order, so unpack with matching names.
stream, raw, smooth = init_training_progress()

# Overlay with the raw curve first and the smoothed curve second.
layout = (raw * smooth)
layout.opts(
    opts.Curve(width=900, height=300, show_grid=True, tools=['hover'])
)

In [None]:
episode_max = 1000
time_max = 999

for i_episode in range(episode_max):

    # Roll out one episode on every env, then take one policy-gradient step
    (per_trajectory_log_prob, per_trajectory_rewards) = collect_trajectories(envs, agent, time_max=time_max)
    agent.learn(per_trajectory_log_prob, per_trajectory_rewards)

    # Total reward collected across all envs, averaged per env
    average_episode_return = np.sum(per_trajectory_rewards) / len(envs)

    # Keep the value in the returns record as well as pushing it to the plot;
    # the record was declared but never filled.
    average_returns.append(average_episode_return)

    stream.emit( pd.DataFrame({'x': average_episode_return}, index=[time_index]) )
    time_index += 1

In [None]:
# Show only the first parameter tensor the policy yields
x = next(iter(policy.parameters()), None)
if x is not None:
    print(x)

In [None]:
# Play through episode
time_max = 999

env = envs[0]
states = torch.as_tensor(np.asarray([env.reset()]), dtype=torch.float32)
try:
    for i in range(time_max):

        env.render()

        # Only sampling here, so skip building the autograd graph
        with torch.no_grad():
            actions, _log_probs = agent.act(states)

        # Step the single env with a numpy action rather than a torch tensor
        next_state, reward, done, _info = env.step(actions[0].cpu().numpy())
        print(reward)

        # Stop once the episode is over instead of stepping a finished env
        if done:
            break

        # Pack up the next state ready to send back to the agent
        states = torch.as_tensor(np.asarray([next_state]), dtype=torch.float32)
finally:
    # Release the render window even if the loop is interrupted
    env.close()