In [1]:
# to log experiments
# sort of cloud Tensorboard to 
# https://www.comet.ml/emergent-dynamics/projects

from comet_ml import Experiment
COMET_ENABLED = False


In [2]:
import os
import random
import time
import resource
import pickle
import math
import json
import multiprocessing

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow.keras as keras

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import glorot_uniform

keras.backend.set_floatx('float64')

import plotly
import plotly.graph_objects as go

import seaborn as sns

import matplotlib.pyplot as plt

# requires python 3.6
# conda install -c akode gym
import gym

# set seeds for reproducibility
# np.random.uniform(0,10000) 4465
GLOBAL_SEED = 4465
random.seed(GLOBAL_SEED)
np.random.seed(GLOBAL_SEED)
#tf.random.set_seed(GLOBAL_SEED)

print("TensorFlow %s" % tf.__version__)
print("Keras %s" % keras.__version__)
print("gym %s" % gym.__version__)
print("plotly %s" % plotly.__version__)
print("pandas %s" % pd.__version__)
print("numpy %s" % np.__version__)


TensorFlow 2.1.0
Keras 2.2.4-tf
gym 0.15.6
plotly 4.5.0
pandas 1.0.1
numpy 1.18.1


In [3]:
STATE_SIZE = 4
ACTION_SIZE = 2

MAX_TIMESTEPS = 500
N_EPISODES = 2000
WIN_REWARD = 100
DISCOUNT_RATE = 0.98
SAMPLE_SIZE = 128
BATCH_SIZE = 1
OUTPUT_DIR = 'model_output/cartpole/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

N_HIDDEN_LAYERS = 1
HIDDEN_LAYER_SIZE = 32

CARTPOLE_EPISODES = 200
LUNARLANDER_EPISODES = 500
RENDER = False


In [4]:
params = {
    "discount_rate": DISCOUNT_RATE,
    "n_hidden_layers": N_HIDDEN_LAYERS,
    "hidden_layer_size": HIDDEN_LAYER_SIZE,
}

if COMET_ENABLED:
    experiment.log_parameters(params)

In [5]:
# show memory usage (some versions of TensorFlow gave memory issues)
def sizeof_fmt(num, suffix='B'):
    """given memory as int format as memory units eg KB"""
    for unit in ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Y', suffix)

def memusage():
    """print memory usage"""
    return sizeof_fmt(int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss))

memusage()


'359.1 KB'

In [6]:
class Agent:
    """abstract base class for agents"""

    def __init__(self, state_size, action_size, filename="model",
                 *args, **kwargs):
        self.state_size = state_size
        self.action_size = action_size
        self.filename = filename
        self.timestep = 0
        self.total_reward = 0
        self.save_interval = 10
        self.max_score = -9999
        self.max_avg = -9999
        self.state = None

    def run_episode(self, env, render=RENDER):
        """run a full episode"""

        self.reset()
        self.state = env.reset()
        self.done = False

        while not self.done:
            if render:
                env.render()
            self.action = self.act(self.state.reshape([1, self.state_size]))
            self.next_state, self.reward, self.done, _ = env.step(self.action)
            self.total_reward += self.reward
            # should get extra reward for max + not done vs. max + done
            if self.done and self.timestep == (MAX_TIMESTEPS - 1):
                self.reward += WIN_REWARD

            self.remember()
            self.state = self.next_state
            self.increment_time()
            
        if render:
            env.render()
            
        self.train()    
        
    def build_model(self, *args, **kwargs):
        """build a model"""
        raise NotImplementedError

    def reset(self):
        """reset agent for start of an episode"""
        self.timestep = 0
        self.total_reward = 0
        self.max_score = -9999
        self.max_avg = -9999

    def act(self, *args, **kwargs):
        """pick an action using model"""
        raise NotImplementedError

    def increment_time(self):
        """increment timestep counter"""
        self.timestep += 1

    def remember(self, *args, **kwargs):
        """store the states and rewards needed to fit the model"""
        raise NotImplementedError

    def train(self, *args, **kwargs):
        """train the model on experience stored by remember"""
        raise NotImplementedError

    def save_score(self):
        """save score of each episode"""
        self.results.append(self.total_reward)

    def score_episode(self, episode_num, n_episodes):
        """output results and save"""
        self.save_score()
        avglen = min(episode_num+1, self.save_interval)
        self.avgscore = sum(self.results[-avglen:])/avglen
        self.max_score = max(self.total_reward, self.max_score)
        self.max_avg = max(self.avgscore, self.max_avg)

        formatstr = "{} episode {}/{}:, score: {} (max {}), {}-episode avg: {:.1f} (max {:.1f}) Memory: {}             "
        print(formatstr.format(time.strftime("%H:%M:%S"), episode_num,
                               n_episodes, self.total_reward, self.max_score, avglen,
                               self.avgscore, self.max_avg, memusage()),
              end="\r", flush=False)

    def view(self, render=True):
        """Run an episode without training, with rendering"""
        
        if not render:
            return
        state = env.reset()
        state = np.reshape(state, [1, self.state_size])
        done = False

        # run an episode
        self.timestep = 0
        r = 0
        while not done:
            env.render()
            action = self.act(state, argmax=True)
            state, reward, done, _ = env.step(action)
            r += reward
            state = np.reshape(state, [1, self.state_size])
            self.timestep += 1
        env.render()
        print(r)
        env.close()
        return self.timestep

    def rlplot(self, title='Cartpole Agent Training Progress'):
        """plot training progress"""
        df = pd.DataFrame({'timesteps': self.results})
        df['avg'] = df['timesteps'].rolling(10).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['timesteps'],
                                 mode='markers',
                                 name='timesteps',
                                 marker=dict(
                                     color='mediumblue',
                                     size=4,
                                 ),
                                ))

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['avg'],
                                 mode='lines',
                                 line_width=3,
                                 name='moving average'))

        fig.update_layout(
            title=dict(text=title,
                       x=0.5,
                       xanchor='center'),
            xaxis=dict(
                title="Episodes",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            yaxis=dict(
                title="Total Reward per Episode",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            legend=go.layout.Legend(
                x=0.01,
                y=0.99,
                traceorder="normal",
                font=dict(
                    family="sans-serif",
                    size=12,
                    color="black"
                ),
                #bgcolor="LightSteelBlue",
                bordercolor="Black",
                borderwidth=1,
            ),
        )

        return fig.show()
    
    def save_agent(self, *args, **kwargs):
        """save agent to disk"""
        raise NotImplementedError

    def load_agent(*args, **kwargs):
        """load agent from disk"""
        raise NotImplementedError


In [7]:
# Deep Q Network
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN
# to speed training, put memory in dataframe and train in batch
# still slow and so-so performing

class PolicyModel(Model):
            
    def __init__(self, 
                 state_size, 
                 action_size,
                 n_hidden_layers=N_HIDDEN_LAYERS,
                 hidden_layer_size=HIDDEN_LAYER_SIZE,
                 activation='relu',
                 reg_penalty=0.001,
                 dropout=0.0625,
                 verbose=True
                ):

        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.hidden_layer_size = hidden_layer_size
        self.activation = activation
        self.reg_penalty = reg_penalty
        self.dropout = dropout
        self.verbose = verbose
                
        self.rllayers = []

        for i in range(n_hidden_layers):
            if verbose:
                formatstr = "layer %d size %d, %s, reg_penalty %.8f, dropout %.3f"
                print(formatstr % (i + 1, hidden_layer_size, activation, reg_penalty, dropout))
                
            # add dropout, only between hidden layers
            if i and dropout:
                self.rllayers.append(Dropout(dropout, name="Dropout%02d" % i))
                
            self.rllayers.append(Dense(hidden_layer_size, activation=activation, name="Dense%02d" % i))

        self.rllayers.append(Dense(self.action_size, activation='linear', name="Output"))

    def call(self, x):
        # Forward pass
        for layer in self.rllayers:
            x = layer(x)
        return x  

    def get_config(self):
        return {'state_size': self.state_size,
                'action_size': self.action_size,
                'n_hidden_layers': self.n_hidden_layers,
                'hidden_layer_size': self.hidden_layer_size,
                'activation': self.activation,
                'reg_penalty': self.reg_penalty,
                'dropout': self.dropout,
                'verbose': self.verbose,
               }


In [8]:
# verify the model does something

env = gym.make('CartPole-v1')
testmodel = PolicyModel(env.observation_space.shape[0], env.action_space.n)
obs = env.reset()
print(obs)
# make n x action_size array
obs = obs[None, :]
print(obs)
z = testmodel.predict(obs)
z

layer 1 size 32, relu, reg_penalty 0.00100000, dropout 0.062
[0.00832242 0.02016691 0.01829497 0.01116533]
[[0.00832242 0.02016691 0.01829497 0.01116533]]



[33mWARN: Box bound precision lowered by casting to float32[0m



array([[-0.00032636,  0.00070215]])

In [9]:
testmodel.predict(np.random.uniform(size=(1, env.observation_space.shape[0]))/10)


array([[ 0.02943283, -0.01134082]])

In [10]:
class DQN_Agent(Agent):
    def __init__(self, state_size, action_size, filename="dqn",
                 discount_rate=DISCOUNT_RATE,
                 learning_rate=None,
                 epsilon=1.0,
                 epsilon_decay=0.995,
                 epsilon_min=0.01,
                 n_hidden_layers=N_HIDDEN_LAYERS,
                 hidden_layer_size=HIDDEN_LAYER_SIZE,
                ):

        self.state_size = state_size
        self.action_size = action_size
        self.filename = filename
        self.discount_rate = discount_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.model = PolicyModel(state_size, action_size,
                                 n_hidden_layers=n_hidden_layers,
                                 hidden_layer_size=hidden_layer_size)
        
        self.loss_function = keras.losses.MSE
        self.optimizer = keras.optimizers.Adam()        

        self.memory = pd.DataFrame(columns=["state", "action", "next_state",
                                            "reward", "done"])
        self.memory_size = 20000
        self.results = []
        self.train_batch_size = BATCH_SIZE
        self.timestep = 0
        self.save_interval = 10
        
        self.max_score = 0
        self.max_avg = 0

    def remember(self):
        """store the states and rewards needed to fit the model"""
        # append in place
        self.memory.loc[self.memory.shape[0]] = [self.state,
                                                 self.action,
                                                 self.next_state,
                                                 self.reward,
                                                 self.done]
        
    @tf.function
    def train_step(self, X, Y):
        train_ds = tf.data.Dataset.from_tensor_slices((X, Y)).batch(BATCH_SIZE)
        for X_batch, Y_batch in train_ds:
            with tf.GradientTape() as tape:
                predictions = self.model(X_batch)
                loss = self.loss_function(Y_batch, predictions)
            gradients = tape.gradient(loss, self.model.trainable_variables)
            self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables))

    def train(self):
        """train the model on experience stored by remember
        warning: strange loop-y magic"""

        # need at least SAMPLE_SIZE observations
        if self.memory.shape[0] < SAMPLE_SIZE:
            return

        # truncate memory
        self.memory = self.memory[-self.memory_size:]
        # sample sample_size observations from memory
        minibatch = self.memory.sample(n=SAMPLE_SIZE)

        # target is our best estimate of value of each action
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((SAMPLE_SIZE, self.state_size))
        Y_pred = self.model.predict(X_fit)

        # we don't just fit model against model's own prediction, gets us nowhere
        # we improve the target by what we learned about the action we actually took
        # value is reward obtained + predicted value of the observed next state
        # this is the strange loop-y magic of temporal difference RL
        minibatch['target_observed'] = minibatch['reward']
        # if done, target is the reward
        # reward by gym env is only 1 for each timestep of survival
        # (but we also added a reward of for reaching the end successfully)
        # if not done, add discount_rate  * Q-value prediction for  observed next state
        not_done = minibatch.loc[minibatch['done'] == False]
        X_observed = np.concatenate(not_done['next_state'].values)
        X_observed = X_observed.reshape((not_done.shape[0], self.state_size))
        # run all predictions at once
        # iterates faster but does not train after each prediction
        y_observed_pred = np.amax(self.model.predict(X_observed), axis=1)
        minibatch.loc[minibatch['done'] == False, 'target_observed'] \
            += self.discount_rate * y_observed_pred
        # vectorized vlookup - update col specified by action with target_observed
        np.put_along_axis(Y_pred,
                          minibatch['action'].astype(int).values.reshape(SAMPLE_SIZE, 1),
                          minibatch['target_observed'].values.reshape(SAMPLE_SIZE, 1),
                          axis=1)
        # fit model against improved target
        self.train_step(tf.convert_to_tensor(X_fit, dtype=tf.float64),
                        Y_pred)
        # self.model.fit(X_fit, Y_pred
                       #epochs=1,
                       #batch_size=self.train_batch_size,
                       #verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state, argmax=False):
        """pick an action using model"""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def get_config(self):
        return {'state_size': self.state_size,
                'action_size': self.action_size,
                'filename': self.filename,
                'discount_rate': self.discount_rate,
                'learning_rate': self.learning_rate,
                'epsilon': self.epsilon,
                'epsilon_decay': self.epsilon_decay,
                'epsilon_min': self.epsilon_min,
                'loss_function': self.loss_function,
                'optimizer': self.optimizer,
                'memory': self.memory,
                'memory_size': self.memory_size,
                'results': self.results,
                'train_batch_size': self.train_batch_size,
                'timestep': self.timestep,
                'save_interval': self.save_interval,
               }

    def save_agent(self):
        """save agent: pickle self and use Keras native save model"""
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        agent_attrs = self.get_config()
        model_attrs = self.model.get_config()
        self.model.save_weights("%s.h5" % fullname)
        pickle.dump({'agent_attrs': agent_attrs, 'model_attrs': model_attrs}, open("%s.p" % fullname, "wb"))

    def load_agent(filename):
        """load saved agent"""
        pickledict = pickle.load(open("%s.p" % filename, "rb"))
        new = DQN_Agent(state_size=pickledict['agent_attrs']['state_size'],
                        action_size=pickledict['agent_attrs']['action_size'],
                        discount_rate=pickledict['agent_attrs']['discount_rate'],
                        learning_rate=pickledict['agent_attrs']['learning_rate'],
                        epsilon=pickledict['agent_attrs']['epsilon'],
                        epsilon_decay=pickledict['agent_attrs']['epsilon_decay'],
                        epsilon_min=pickledict['agent_attrs']['epsilon_min']                        
                       )
        for name, value in pickledict['agent_attrs'].items():
            setattr(new, name, value)
        # make a prediction to fix input size
        new.model.predict(np.random.uniform(size=(1, new.state_size))/10)
            
        new.model.load_weights("%s.h5" % filename, by_name=False)           
        for name, value in pickledict['model_attrs'].items():
            setattr(new.model, name, value)
        print("loaded %d results, %d rows of memory, epsilon %.4f" % (len(new.results),
                                                                      len(new.memory),
                                                                      new.epsilon))
        return new

In [11]:
def run_experiment(agent, env, n_episodes):
    print("Start training: %s" % time.strftime("%H:%M:%S"))
    
    for e in range(n_episodes):
        agent.run_episode(env)
        agent.score_episode(e, n_episodes)
        if COMET_ENABLED:
            experiment.log_metrics({
                'episode': len(agent.results),
                'reward': agent.total_reward,
                'avg_reward': agent.avgscore
            })
        
        if e and (e+1) % agent.save_interval == 0:
            agent.save_agent()
            
    print("\nFinish training: %s" % time.strftime("%H:%M:%S"))



In [None]:
# https://gym.openai.com/envs/CartPole-v1/

if COMET_ENABLED:
    experiment = Experiment(project_name="DV_Cartpole_DQN",
                            auto_param_logging=False)

N_EPISODES=CARTPOLE_EPISODES
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)

agent = DQN_Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n)

run_experiment(agent, env, N_EPISODES)



layer 1 size 32, relu, reg_penalty 0.00100000, dropout 0.062
Start training: 17:49:15
17:50:22 episode 138/200:, score: 41.0 (max 41.0), 10-episode avg: 46.2 (max 46.2) Memory: 1.7 MB               

In [None]:
if COMET_ENABLED:
    experiment.end()

In [None]:
# plot training progress

agent.rlplot("DQN Cartpole Agent Training Progress")

In [None]:
# view agent in action
# can use early stopping to pick a good model
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)
# start_epoch = 1730
# loadmodel = '%05d' % start_epoch
# agent = DQN_Agent.load_agent(OUTPUT_DIR + 'dqn' + loadmodel)
agent.view(render=RENDER)


In [None]:
# train additional episodes from a good model

env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)

# load old model
start_epoch = 520

loadmodel = '%05d' % start_epoch
agent = DQN_Agent.load_agent(OUTPUT_DIR + 'dqn' + loadmodel)

z = 10
print("Resume training: %s" % time.strftime("%H:%M:%S"))
for e in range(z):
    agent.run_episode(env)
    agent.score_episode(e, z)
    if e and (e+1) % agent.save_interval == 0:
        agent.save_agent()  
print("\nFinish training: %s" % time.strftime("%H:%M:%S"))

      

In [None]:
# training as above does well up to a point but not very stable.
# sometimes performance goes off a cliff esp with more complex NNs like 2x32.
# continuing to train sometimes results in forgetting what it learned.
# trained repeatedly, when it fell off a cliff restarted using best previous model
# used early stopping after achieving a model that wins many times in a row,

agent = DQN_Agent.load_agent("good")
agent.view(render=RENDER)


In [None]:
# REINFORCE is a policy gradient method
# key changes vs. DQN
# 1) Monte Carlo instead of temporal difference learning:
#    after each episode, compute rewards for trajectory runout
#    train on the full last episode
#    then throw it away (no resampling history)
# 2) Use pure python logistic regression here instead of deep NN
#    logistic regression outputs probabilities
#    act by sampling the predicted probabilities for each action
#    one action may become strongly favored 
#    but you always explore the other a nonzero % of time
#    so no epsilon
# 3) DQN trains against estimate of a Q state-action value
#    REINFORCE trains action prob predictions directly against observed rewards
#    No expectation of Q action-value is computed
#    To update logistic regression theta
#      over all observations (i.e. each action taken)
#        compute gradient of action prob w.r.t. thetas
#        compute standardized discounted reward for each observation (state/action taken)
#        compute gradient of average reward over all observations w.r.t theta
#        update each theta by that amount times learning rate
#    This will tend to update thetas so that
#        actions with above-average rewards become more probable
#        actions with below-average rewards become less probable
# only 4 params, runs fast and solves consistently after about 500 episodes
# could add hidden layers for NN but would need to backprop the gradient (or use Keras)
# also only supports 2 actions (logistic regression/binary classification)
# could add softmax (or use Keras)

# https://mcneela.github.io/math/2018/04/18/A-Tutorial-on-the-REINFORCE-Algorithm.html
# https://karpathy.github.io/2016/05/31/rl/
# code mostly from
# https://github.com/jklaise/personal_website/blob/master/notebooks/rl_policy_gradients.ipynb

In [None]:
class LogisticAgent(Agent):
    """REINFORCE agent (policy gradient) using logistic regression"""
    def __init__(self, theta, learning_rate, discount_rate, filename='logistic'):
        """Initialize parameter vector theta, learning rate and discount_rate"""
        self.theta = theta
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.save_interval = 10
        self.results = []
        self.timestep = 0
        self.state_size = 4
        self.filename = filename

        self.max_score = 0
        self.max_avg = 0
        
    def reset(self):
        """reset agent for start of episode"""
        self.state_history = []
        self.action_history = []
        self.reward_history = []
        self.probs = []
        self.timestep = 0
        self.total_reward = 0

    def logistic(self, y):
        """logistic function, squash -infinity to +infinity to prob between 0 and 1"""
        return 1/(1 + math.exp(-y))

    def remember(self):
        self.state_history.append(self.state)
        self.reward_history.append(self.reward)
        self.action_history.append(self.action)

    def act(self, X, argmax=False):
        """predict probas using theta, sample an action from probabilities"""
        # use same calling convention as Keras predict, which expects array X of n states
        x = X[0]
        y = x @ self.theta
        prob0 = self.logistic(y)
        probs = np.array([prob0, 1-prob0])
        # sample action from predicted probabilities
        if argmax:
            # for play() choose best action
            action = np.argmax(probs)
        else:
            # for train() sample actions
            action = np.random.choice([0, 1], p=probs)
        # save prob history
        self.probs.append(probs[action])
        return action

    def grad_log_p(self, x):
        """calculate gradient vector of log-probas"""
        y = x @ self.theta
        grad_log_p0 = x - x * self.logistic(y)
        grad_log_p1 = - x * self.logistic(y)
        return grad_log_p0, grad_log_p1

    def discount_rewards(self, rewards):
        """calculate discounted rewards"""
        discounted_rewards = np.zeros(len(rewards))
        cumulative_rewards = 0
        for i in reversed(range(len(rewards))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + rewards[i]
            discounted_rewards[i] = cumulative_rewards
        return discounted_rewards

    def train(self):
        """update thetas based on gradients, discounted rewards, learning rate"""
        # calculate gradients for each action you actually took
        # how much to adjust theta to increase prob of that action
        grad_log_p = np.array([self.grad_log_p(ob)[action]
                               for ob, action in zip(self.state_history,
                                                     self.action_history)])

        # calculate discounted rewards
        discounted_rewards = self.discount_rewards(self.reward_history)
        # standardize
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards)

        # gradients times discounted rewards
        # average gradient over all obs weighted by reward
        # how much to update theta to increase reward
        update_target = grad_log_p.T @ discounted_rewards

        # update theta
        self.theta += self.learning_rate*update_target

    def save_agent(self):
        """save agent: pickle self"""
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        pickle.dump(self, open("%s.p" % fullname, "wb"))

    def load_agent(filename):
        """load saved agent"""
        new = pickle.load(open("%s.p" % filename, "rb"))
        print("loaded %d results" % (len(new.results)))
        return new
    

In [None]:
N_EPISODES = CARTPOLE_EPISODES
RENDER = False

def run_experiment(env, theta, learning_rate, discount_rate, AgentClass,
                   N_EPISODES=1000, seed=None, comet_enabled=COMET_ENABLED):

    if comet_enabled:
        experiment = Experiment(project_name="DV_Cartpole_Logistic_PG",
                                auto_param_logging=False)
        
        experiment.log_parameters({
            "discount_rate": DISCOUNT_RATE,
            "learning_rate": learning_rate,
        })

    # initialize environment and policy
    if seed is not None:
        env.seed(seed)
    episode_rewards = []
    agent = AgentClass(theta, learning_rate, discount_rate)

    # train for N_EPISODES
    print("Start training: %s" % time.strftime("%H:%M:%S"))    
    for e in range(N_EPISODES):

        # run an episode
        agent.run_episode(env, render=RENDER)
        agent.score_episode(e, N_EPISODES)
        if comet_enabled:
            experiment.log_metrics({
                'episode': len(agent.results),
                'reward': agent.total_reward,
                'avg_reward': agent.avgscore
            })

        if e and (e+1) % agent.save_interval == 0:
            agent.save_agent()
    print("\nFinish training: %s" % time.strftime("%H:%M:%S"))

    if comet_enabled:
        experiment.end()

    return episode_rewards, agent


env = gym.make('CartPole-v1')

episode_rewards, agent = run_experiment(env,
                                        theta=np.random.randn(4)/100,
                                        learning_rate=0.1,
                                        discount_rate=0.975,
                                        AgentClass=LogisticAgent,
                                        N_EPISODES=N_EPISODES,
                                        seed=GLOBAL_SEED,
                                       )


In [None]:
# plot training progress

agent.rlplot('Cartpole Logistic Policy Gradient Training Progress')

In [None]:
# view trained agent in action
# early stopping is our friend, if last model not best
env = gym.make('CartPole-v1')
# agent = LogisticAgent.load_agent(OUTPUT_DIR + 'logistic01950')
agent.view(render=RENDER)

In [None]:
# Keras REINFORCE policy gradient method
# Same policy gradient algorithm
# Use Keras to define policy network
# Allows use of neural network with multiple hidden layers
# Keras does the backprop, computes gradients 
# Also our logistic is binary, only 2 actions
# Keras softmax generalizes to n actions

In [None]:
class REINFORCE_Agent(Agent):
    """REINFORCE policy gradient method using deep Keras NN"""
    def __init__(self, state_size=STATE_SIZE, action_size=ACTION_SIZE, learning_rate=0.0005,
                 discount_rate=DISCOUNT_RATE, n_hidden_layers=N_HIDDEN_LAYERS, hidden_layer_size=HIDDEN_LAYER_SIZE,
                 activation='relu', reg_penalty=0, dropout=0, filename="kreinforce",
                 verbose=True):
        self.state_size = state_size
        self.action_size = action_size
        self.action_space = list(range(action_size))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.n_hidden_layers = n_hidden_layers
        self.hidden_layer_size = hidden_layer_size
        self.activation = activation
        self.reg_penalty = reg_penalty
        self.dropout = dropout
        self.verbose = verbose
        self.filename = filename

        self.max_score = 0
        self.max_avg = 0

        self.policy_model = PolicyModel(state_size, action_size,
                                        n_hidden_layers=n_hidden_layers,
                                        hidden_layer_size=hidden_layer_size)
        self.optimizer = keras.optimizers.Adam()        
        
        self.results = []
        self.save_interval = 10
        self.reset()

    def reset(self):
        """reset agent for start of episode"""
        self.timestep = 0
        # truncate memory
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.total_reward = 0

    def act(self, state, argmax=False):
        """pick an action using policy_model"""
        logits = self.policy_model.predict(state)
        probabilities = tf.nn.softmax(logits)[0]

        if argmax:
            # for replay choose most likely action
            action = np.argmax(probabilities)
        else:
            # for training sample from actions
            action = np.random.choice(self.action_space, p=probabilities)
        return action

    def remember(self):
        """at each timestep save state, action, reward for future training"""
        self.state_memory.append(self.state)
        self.action_memory.append(self.action)
        self.reward_memory.append(self.reward)
        
    def train_step(self, state_memory, actions, deltas):
        # compute gradient and update
        with tf.GradientTape() as tape:
            pred_logits = self.policy_model(state_memory)
            pred_probs = tf.nn.softmax(pred_logits)
            log_probs = tf.math.log(pred_probs)
            # mask / squeeze log_probs for only actions we actually took
            log_probs = tf.reduce_sum(tf.math.multiply(log_probs, actions), axis=1)
            # multiply by discounted_rewards and calculate sum. 
            # negate because apply does gradient descent(minimizes), we want ascent(maximize)
            target = tf.reduce_mean(tf.math.multiply(log_probs, -deltas))
            # get gradients of prob x discounted_rewards
            gradients = tape.gradient(target, self.policy_model.trainable_variables)
            # apply gradient
            self.optimizer.apply_gradients(zip(gradients, self.policy_model.trainable_variables))
            
    def train(self):
        """train the model on experience stored by remember"""
        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        # one-hot actions
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        disc_rewards = np.zeros_like(reward_memory)
        cumulative_rewards = 0
        for i in reversed(range(len(reward_memory))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + reward_memory[i]
            disc_rewards[i] = cumulative_rewards
            
        # standardize
        disc_rewards -= np.mean(disc_rewards)
        disc_rewards /= np.std(disc_rewards) if np.std(disc_rewards) > 0 else 1
      
        self.train_step(tf.convert_to_tensor(state_memory), 
                        tf.convert_to_tensor(actions), tf.convert_to_tensor(disc_rewards))
        
    def get_config(self):
        return {'state_size': self.state_size,
                'action_size': self.action_size,
                'n_hidden_layers': self.n_hidden_layers,
                'hidden_layer_size': self.hidden_layer_size,
                'activation': self.activation,
                'reg_penalty': self.reg_penalty,
                'dropout': self.dropout,
                'learning_rate': self.learning_rate,
                'filename': self.filename,
                'verbose': self.verbose,
               }
    
    def save_agent(self):
        """save agent: pickle self and use Keras native save model"""
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        agent_attrs = self.get_config()
        model_attrs = self.policy_model.get_config()
        self.policy_model.save_weights("%s.h5" % fullname)
        pickle.dump({'agent_attrs': agent_attrs, 'model_attrs': model_attrs}, open("%s.p" % fullname, "wb"))

    def load_agent(filename):
        """load saved agent"""
        pickledict = pickle.load(open("%s.p" % filename, "rb"))
        new = DQN_Agent(state_size=pickledict['agent_attrs']['state_size'],
                        action_size=pickledict['agent_attrs']['action_size'],
                        discount_rate=pickledict['agent_attrs']['discount_rate'],
                        learning_rate=pickledict['agent_attrs']['learning_rate'],
                        epsilon=pickledict['agent_attrs']['epsilon'],
                        epsilon_decay=pickledict['agent_attrs']['epsilon_decay'],
                        epsilon_min=pickledict['agent_attrs']['epsilon_min']                        
                       )
        for name, value in pickledict['agent_attrs'].items():
            setattr(new, name, value)
        # make a prediction to fix input size
        new.policy_model.predict(np.random.uniform(size=(1, new.state_size))/10)
            
        new.policy_model.load_weights("%s.h5" % filename, by_name=False)           
        for name, value in pickledict['model_attrs'].items():
            setattr(new.policy_model, name, value)
        print("loaded %d results, %d rows of memory, epsilon %.4f" % (len(new.results),
                                                                      len(new.memory),
                                                                      new.epsilon))
        return new


In [None]:
# run faster without rendering
RENDER = False

if COMET_ENABLED:
    experiment = Experiment(project_name="DV_Cartpole_REINFORCE",
                            auto_param_logging=False)    

# https://gym.openai.com/envs/CartPole-v1/
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)

LEARNING_RATE = 0.0005
if COMET_ENABLED:
    experiment.log_parameters({
        "discount_rate": DISCOUNT_RATE,
        "learning_rate": LEARNING_RATE,
    })

agent = REINFORCE_Agent(state_size=env.observation_space.shape[0],
                        action_size=env.action_space.n,
                        n_hidden_layers=0,
                        learning_rate=LEARNING_RATE,
                        discount_rate=DISCOUNT_RATE,)

# train

print("Start training: %s" % time.strftime("%H:%M:%S"))
for e in range(N_EPISODES):
    agent.run_episode(env)
    agent.score_episode(e, N_EPISODES)
    if COMET_ENABLED:
        experiment.log_metrics({
            'episode': len(agent.results),
            'reward': agent.total_reward,
            'avg_reward': agent.avgscore
        })

    if e and (e+1) % agent.save_interval == 0:
        agent.save_agent()
        
print("\nFinish training: %s" % time.strftime("%H:%M:%S"))

if COMET_ENABLED:
    experiment.end()


In [None]:
# plot training progress
agent.rlplot(title='Cartpole REINFORCE Agent Training Progress')

In [None]:
# view agent in action
# agent.predict_model = load_model(OUTPUT_DIR + "kreinforce01760_predict.h5")

agent.view(render=RENDER)

In [None]:
# for science let's try on LunarLander

if COMET_ENABLED:
    experiment = Experiment(project_name="DV_LunarLander_REINFORCE",
                            auto_param_logging=False)    

env = gym.make('LunarLander-v2')
env.seed(GLOBAL_SEED)

N_EPISODES = LUNARLANDER_EPISODES
N_HIDDEN_LAYERS = 2
HIDDEN_LAYER_SIZE = 64
DISCOUNT_RATE = 0.0005
LEARNING_RATE = 0.99

if COMET_ENABLED:
    experiment.log_parameters({
        "n_hidden_layers": N_HIDDEN_LAYERS,
        "hidden_layer_size": HIDDEN_LAYER_SIZE,
        "discount_rate": DISCOUNT_RATE,
        "learning_rate": LEARNING_RATE,
    })

agent = REINFORCE_Agent(state_size=env.observation_space.shape[0],
                        n_hidden_layers=N_HIDDEN_LAYERS,
                        hidden_layer_size=HIDDEN_LAYER_SIZE,
                        action_size=env.action_space.n,
                        learning_rate=LEARNING_RATE,
                        discount_rate=DISCOUNT_RATE,)

print("Start training: %s" % time.strftime("%H:%M:%S"))
for e in range(N_EPISODES):
    agent.run_episode(env)
    agent.score_episode(e, N_EPISODES)
    if COMET_ENABLED:
        experiment.log_metrics({
            'episode': len(agent.results),
            'reward': agent.total_reward,
            'avg_reward': agent.avgscore
        })
    
print("\nFinish training: %s" % time.strftime("%H:%M:%S"))

if COMET_ENABLED:
    experiment.end()

In [None]:
agent.rlplot(title='Lunar Lander Deep Policy Gradient Agent Training Progress')

In [None]:
agent.view(render=RENDER)

In [None]:
# with REINFORCE above, we run out each episode trajectory, then update thetas 
# to get more good/fewer bad outcomes relative to average
# for instance suppose we are training Cartpole, getting scores around 200
# you run out 200 timesteps and fall over
# you want more of the actions at the beginning, fewer of the actions at the end that made it fall over
# fine as long as you gradually improve but suppose you have an episode that scores only 10
# you don't really want more of any of those actions
# maybe we can do better than measuring scores relative to episode average
# we build a 'baseline' NN state-value function estimator 
# after each episode, train the state-value function
# in case of cart-pole, function should learn that high deflections, high speed toward edge = bad
# train policy agent to use state augmented by state-value deltas instead of deviations from mean
# this is 'REINFORCE with baseline'

In [None]:
class ValueModel(Model):
    """model takes state_size inputs, returns a single state value  
    same setup as PolicyModel except last layer is size 1 instead of action_size
    """
    def __init__(self, 
                 state_size, 
                 action_size,
                 n_hidden_layers=N_HIDDEN_LAYERS,
                 hidden_layer_size=HIDDEN_LAYER_SIZE,
                 activation='relu',
                 reg_penalty=0.001,
                 dropout=0.0625,
                 verbose=True
                ):

        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.n_hidden_layers = n_hidden_layers
        self.hidden_layer_size = hidden_layer_size
        self.activation = activation
        self.reg_penalty = reg_penalty
        self.dropout = dropout
        self.verbose = verbose
                
        self.rllayers = []

        for i in range(n_hidden_layers):
            if verbose:
                formatstr = "layer %d size %d, %s, reg_penalty %.8f, dropout %.3f"
                print(formatstr % (i + 1, hidden_layer_size, activation, reg_penalty, dropout))
                
            # add dropout, only between hidden layers
            if i and dropout:
                self.rllayers.append(Dropout(dropout, name="Dropout%02d" % i))
                
            self.rllayers.append(Dense(hidden_layer_size, activation=activation, name="Dense%02d" % i))

        self.rllayers.append(Dense(1, activation='linear', name="Output"))

    def call(self, x):
        # Forward pass
        for layer in self.rllayers:
            x = layer(x)
        return x  

    def get_config(self):
        return {'state_size': self.state_size,
                'action_size': self.action_size,
                'n_hidden_layers': self.n_hidden_layers,
                'hidden_layer_size': self.hidden_layer_size,
                'activation': self.activation,
                'reg_penalty': self.reg_penalty,
                'dropout': self.dropout,
                'verbose': self.verbose,
               }


In [None]:
class ReinforceBaseline(REINFORCE_Agent):
    """REINFORCE with baseline
    inherit from REINFORCE_Agent, add action-value model, reimplement train to use value model"""
    
    def __init__(self, state_size=STATE_SIZE, action_size=ACTION_SIZE, learning_rate=0.0005,
                 discount_rate=DISCOUNT_RATE, 
                 n_hidden_layers=N_HIDDEN_LAYERS,
                 hidden_layer_size=HIDDEN_LAYER_SIZE,
                 activation='relu', reg_penalty=0, dropout=0, filename="krb",):
        super().__init__(state_size=state_size, 
                         action_size=action_size, 
                         learning_rate=learning_rate,
                         discount_rate=discount_rate, 
                         n_hidden_layers=n_hidden_layers, 
                         hidden_layer_size=hidden_layer_size,
                         activation=activation, 
                         reg_penalty=reg_penalty, 
                         dropout=dropout, 
                         filename=filename)
        self.baseline = ValueModel(state_size, action_size, 
                                   n_hidden_layers=n_hidden_layers, hidden_layer_size=hidden_layer_size)
        self.baseline_optimizer = Adam()
        self.loss_function = keras.losses.MSE

    def train_baseline_step(self, X, Y):
        train_ds = tf.data.Dataset.from_tensor_slices((X, Y)).batch(BATCH_SIZE)
        for X_batch, Y_batch in train_ds:
            with tf.GradientTape() as tape:
                predictions = self.baseline(X_batch)
                loss = self.loss_function(Y_batch, predictions)
            gradients = tape.gradient(loss, self.baseline.trainable_variables)
            self.baseline_optimizer.apply_gradients(zip(gradients, self.baseline.trainable_variables))

    def train(self):
        """train action-value model, policy model"""
        
        # convert to numpy ndarrays
        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        reward_memory = np.array(self.reward_memory)

        # one-hot actions
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        # compute discounted rewards
        disc_rewards = np.zeros_like(reward_memory)
        cumulative_rewards = 0
        for i in reversed(range(len(reward_memory))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + reward_memory[i]
            disc_rewards[i] = cumulative_rewards

        # instead of standardizing, compute difference vs. value model
        # can be viewed as 'surprise', how much better/worse outcome was than expected
        state_values = self.baseline.predict(state_memory).reshape(- 1)
        deltas = disc_rewards - state_values
        self.train_step(state_memory, actions, deltas)
        # train value function against observed rewards
        self.train_baseline_step(state_memory, disc_rewards)

    def save_agent(self):
        """save agent: pickle self and use Keras native save model"""
        fullname = "%s%s%05d" % (OUTPUT_DIR, self.filename, len(self.results))
        self.policy_model.save_weights("%s_predict.h5" % fullname)
        self.baseline.save_weights("%s_baseline.h5" % fullname)
        pickledict = {'agent_attrs': self.get_config(), 
                      'policy_model_attrs': self.policy_model.get_config(),
                      'baseline_model_attrs': self.baseline.get_config(),
                     }
        pickle.dump(pickledict, open("%s.p" % fullname, "wb"))

    def load_agent(filename):
        """load saved agent"""
        pickledict = pickle.load(open("%s.p" % filename, "rb"))
        new = DQN_Agent(state_size=pickledict['agent_attrs']['state_size'],
                        action_size=pickledict['agent_attrs']['action_size'],
                        discount_rate=pickledict['agent_attrs']['discount_rate'],
                        learning_rate=pickledict['agent_attrs']['learning_rate'],
                        epsilon=pickledict['agent_attrs']['epsilon'],
                        epsilon_decay=pickledict['agent_attrs']['epsilon_decay'],
                        epsilon_min=pickledict['agent_attrs']['epsilon_min']                        
                       )
        for name, value in pickledict['agent_attrs'].items():
            setattr(new, name, value)
        # make a prediction to fix input size
        new.policy_model.predict(np.random.uniform(size=(1, new.state_size))/10)
        new.policy_model.load_weights("%s_predict.h5" % filename, by_name=False)           
        for name, value in pickledict['policy_model_attrs'].items():
            setattr(new.policy_model, name, value)
        # make a prediction to fix input size
        new.baseline.predict(np.random.uniform(size=(1, new.state_size))/10)
        new.baseline.load_weights("%s_baseline.h5" % filename, by_name=False)           
        for name, value in pickledict['baseline_model_attrs'].items():
            setattr(new.baseline, name, value)
        print("loaded")
        return new


In [None]:
# run faster without rendering
RENDER = False
# https://gym.openai.com/envs/CartPole-v1/
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)

if COMET_ENABLED:
    experiment = Experiment(project_name="DV_Cartpole_REINFORCE_Baseline",
                            auto_param_logging=False)    

N_EPISODES = CARTPOLE_EPISODES
N_HIDDEN_LAYERS = 1
HIDDEN_LAYER_SIZE = 32
DISCOUNT_RATE = 0.99
LEARNING_RATE = 0.0005

if COMET_ENABLED:
    experiment.log_parameters({
        "n_hidden_layers": N_HIDDEN_LAYERS,
        "hidden_layer_size": HIDDEN_LAYER_SIZE,
        "discount_rate": DISCOUNT_RATE,
        "learning_rate": LEARNING_RATE,
    })

agent = ReinforceBaseline(state_size=env.observation_space.shape[0],
                          action_size=env.action_space.n,
                          learning_rate=LEARNING_RATE,
                          discount_rate=DISCOUNT_RATE,
                          n_hidden_layers=N_HIDDEN_LAYERS,
                          hidden_layer_size=HIDDEN_LAYER_SIZE,
                         )

# train
N_EPISODES = CARTPOLE_EPISODES
print("Start training: %s" % time.strftime("%H:%M:%S"))
for e in range(N_EPISODES):
    agent.run_episode(env)
    agent.score_episode(e, N_EPISODES)
    if COMET_ENABLED:
        experiment.log_metrics({
            'episode': len(agent.results),
            'reward': agent.total_reward,
            'avg_reward': agent.avgscore
        })   
    if e and (e+1) % agent.save_interval == 0:
        agent.save_agent()
print("\nFinish training: %s" % time.strftime("%H:%M:%S"))

if COMET_ENABLED:
    experiment.end()

In [None]:
agent.rlplot(title='Cartpole REINFORCE w/Baseline Agent Training Progress')

In [None]:
agent.view(render=RENDER)

In [None]:
# try on LunarLander
N_EPISODES = LUNARLANDER_EPISODES
env = gym.make('LunarLander-v2')
env.seed(GLOBAL_SEED)

agent = ReinforceBaseline(state_size=env.observation_space.shape[0],
                          action_size=env.action_space.n,
                          n_hidden_layers=2,
                          hidden_layer_size=64,
                          learning_rate=0.0005,
                          discount_rate=0.99, 
                          filename='llrb')

print("Start training: %s" % time.strftime("%H:%M:%S"))
for e in range(N_EPISODES):
    agent.run_episode(env)
    agent.score_episode(e, N_EPISODES)
    if e and (e+1) % agent.save_interval == 0:
        agent.save_agent()
print("\nFinish training: %s" % time.strftime("%H:%M:%S"))


In [None]:
agent.rlplot("Lunar Lander REINFORCE with Baseline Training Progress")

In [None]:
agent.view(render=RENDER)

In [None]:
# run faster without rendering
RENDER = False
# https://gym.openai.com/envs/CartPole-v1/
env = gym.make('LunarLander-v2')
env.seed(GLOBAL_SEED)

agent = ReinforceBaseline(state_size=env.observation_space.shape[0],
                          action_size=env.action_space.n,
                          n_hidden_layers=2,
                          hidden_layer_size=64,
                          learning_rate=0.0005,
                          discount_rate=0.98,)
agent.predict_model = load_model("llrb_good_predict.h5")
agent.baseline = load_model("llrb_good_V.h5")
agent.view(render=RENDER)


In [None]:
# use RLlib - state of the art library, instead of rolling our own

In [None]:
import ray

n_cpus = multiprocessing.cpu_count()
n_cpus

In [None]:
n_gpus = len(tf.config.list_physical_devices('GPU'))
n_gpus

In [None]:
ray.init(ignore_reinit_error=True, log_to_driver=False, webui_host='0.0.0.0')
# https://ray.readthedocs.io/en/latest/package-ref.html#ray.init

In [None]:
# https://github.com/ray-project/ray/blob/master/rllib/agents/ppo/ppo.py
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG

env_name = 'CartPole-v1'

ppo_config = DEFAULT_CONFIG.copy()
if n_gpus:
    ppo_config['num_gpus'] = n_gpus
    ppo_config['tf_session_args']['device_count']['GPU'] = n_gpus

ppo_config['num_workers'] = 1
ppo_config['num_sgd_iter'] = 2
ppo_config['sgd_minibatch_size'] = 128
ppo_config['lr'] = 0.0003
ppo_config['gamma'] = 0.99
ppo_config['model']['fcnet_hiddens'] = [64, 64]
ppo_config['timesteps_per_iteration'] = 2000
ppo_config['train_batch_size'] = 8000
ppo_config['num_cpus_per_worker'] = 0  # This avoids running out of resources in the notebook environment when this cell is re-executed

agent = PPOTrainer(ppo_config, env_name)
result = agent.train()

result


In [None]:
# https://github.com/ray-project/ray/blob/master/python/ray/tune/tune.py
from ray import tune
ray.init(ignore_reinit_error=True)
env_name = 'CartPole-v1'
ppo_config = {
    "env": env_name,
    "num_workers": 1,
    'model': {
        'fcnet_hiddens': tune.grid_search([
                                           [16, 16], [32, 32], [64, 64], [128, 128],
                                          ])
    },        
    'train_batch_size': 1000,
    "lr": tune.grid_search([0.0003, 0.0001]),
    'gamma': tune.grid_search([0.99, 0.999]),
    "eager": False,
    'num_gpus': n_gpus  
}
                      
analysis = tune.run(
    "PPO",
    name='cartpole_test',
    verbose=1,

    stop={"episode_reward_mean": 300},  # stop when a parameter set is able to reach 300 timesteps
    config = ppo_config,
    checkpoint_freq=10,
    checkpoint_at_end=True,
    checkpoint_score_attr='episode_reward_mean',
    num_samples=1,  # for grid search, number of times to run each hyperparameter combo
    #     with_server=True,
    #     server_port=8267,
)


In [None]:
dfs = analysis.trial_dataframes

# Plot by epoch
ax = None  # This plots everything on the same plot
for d in dfs.values():
    ax = d.episode_reward_mean.plot(ax=ax, legend=False)
    

In [None]:
analysis.dataframe().sort_values(['timesteps_total','episode_reward_mean'])[['config/lr', 
                                                                             'config/gamma', 
                                                                             'config/model', 
                                                                             'episode_reward_mean', 
                                                                             'timesteps_total']]


In [None]:
analysis.dataframe()[['config/lr', 'timesteps_total']].groupby('config/lr').mean()

In [None]:
analysis.dataframe()[['config/gamma', 'timesteps_total']].groupby('config/gamma').mean()

In [None]:
aframe = analysis.dataframe()
aframe['config/model'] = aframe['config/model'].astype(str)
aframe[['config/model', 'timesteps_total']] \
    .groupby(['config/model']) \
    .mean() \
    .sort_values('timesteps_total') \
    .head(10)

In [None]:
zframe = analysis.dataframe()[['config/lr', 'config/gamma', 'timesteps_total']]
matrix = pd.pivot_table(zframe, 
                        values='timesteps_total', 
                        index=['config/lr'], 
                        columns=['config/gamma'],
                        aggfunc=np.mean)

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(matrix, annot=True, fmt='.0f')
plt.ylabel('lr')
plt.xlabel('gamma')
plt.title("Hyperparameter matrix")
plt.show()

In [None]:
logdir = analysis.get_best_logdir("timesteps_total", mode="min")
logdir

In [None]:
with open('%s/params.json' % logdir) as f:
    data = json.load(f)
data

In [None]:
ray.init(num_cpus=n_cpus, num_gpus=n_gpus, ignore_reinit_error=True, log_to_driver=False, webui_host='0.0.0.0')

ppo_config = {
    "env": env_name,
    "num_workers": n_cpus - 1,
    'model': {
        'fcnet_hiddens': [32, 32]
    },        
    'train_batch_size': 10000,
    "lr": 0.0003,
    'gamma': 0.99,
    "eager": False,
    'num_gpus': n_gpus  
}
                      
analysis = tune.run(
    "PPO",
    name='cartpole_test',
    verbose=1,

    stop={"episode_reward_mean": 500},  # stop when a parameter set is able to reach 500 timesteps
    config = ppo_config,
    checkpoint_freq=10,
    checkpoint_at_end=True,
    checkpoint_score_attr='episode_reward_mean',
    num_samples=1,  # for grid search, number of times to run each hyperparameter combo
    #     with_server=True,
    #     server_port=8267,
)


In [None]:
list(analysis.trial_dataframes.keys())[0]


In [None]:
!ls /home/ubuntu/ray_results/cartpole_test/PPO_CartPole-v1_1c7d6f00_2020-02-13_02-25-33iw9gmucd/checkpoint_18

In [None]:
# load checkpoint and do the runout
ckpoint = '/home/ubuntu/ray_results/cartpole_test/PPO_CartPole-v1_1c7d6f00_2020-02-13_02-25-33iw9gmucd/checkpoint_18/checkpoint-18'
trainer = PPOTrainer(config=ppo_config, env=env_name)
trainer.restore(ckpoint)



In [None]:
env = gym.make(env_name)
#env.seed(GLOBAL_SEED)

state = env.reset()
trainer.compute_action(state)

done = False

# run an episode
timestep = 0
r = 0
while not done:
    env.render()
    action = trainer.compute_action(state)
    state, reward, done, _ = env.step(action)
    r += reward
    timestep += 1
print(r)
env.close()
timestep