In [1]:
import os
import random
import time
import resource
import pickle
import math

import pdb

import numpy as np
import pandas as pd

import tensorflow as tf
# keras bundled with Tensorflow 2.0 ran slower, leaked memory. got latest
# from tensorflow import keras
# from tensorflow.keras.models import Model, Sequential, load_model
# from tensorflow.keras.layers import Input, Dense, Dropout
# from tensorflow.keras.optimizers import Adam
# import tensorflow.keras.backend as K
import keras
from keras.models import Model, load_model
from keras.layers import Input, Dense, Dropout
from keras.optimizers import Adam
from keras.initializers import glorot_uniform
from keras.regularizers import l2
import keras.backend as K

import plotly
import plotly.graph_objects as go

# requires python 3.6
# conda install -c akode gym
import gym

# Pin every random number generator used below to a single seed so that
# repeated runs draw the same numbers.
# (the value appears to have been drawn once via np.random.uniform(0,10000))
GLOBAL_SEED = 4465
random.seed(GLOBAL_SEED)            # Python's stdlib generator
np.random.seed(GLOBAL_SEED)         # numpy's legacy global generator
tf.random.set_seed(GLOBAL_SEED)     # TensorFlow's global seed

# Record the library versions this notebook was executed with, one per line.
print("TensorFlow %s" % tf.__version__,
      "Keras %s" % keras.__version__,
      "gym %s" % gym.__version__,
      "plotly %s" % plotly.__version__,
      "pandas %s" % pd.__version__,
      "numpy %s" % np.__version__,
      sep="\n")


Using TensorFlow backend.


TensorFlow 2.0.0
Keras 2.3.1
gym 0.10.5
plotly 4.1.1
pandas 0.25.2
numpy 1.17.2


In [2]:
# Run configuration shared by the cells below.
MAX_TIMESTEPS = 500      # episode length at which Agent.run_episode grants WIN_REWARD
N_EPISODES = 2000        # not referenced in the visible cells
WIN_REWARD = 10          # bonus added to the final reward of a full-length episode
DISCOUNT_RATE = 0.98     # not referenced in the visible cells
RENDER = False           # default for Agent.run_episode(render=...)
SAMPLE_SIZE = 128        # not referenced in the visible cells
OUTPUT_DIR = 'model_output/cartpole/'   # prefix for files written by the agents' save()

# exist_ok makes this cell safe to re-run and avoids the check-then-create
# race; it still raises if the path exists but is not a directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [3]:
# show memory usage (some versions of TensorFlow gave memory issues)
def sizeof_fmt(num, suffix='B'):
    """Format a number with a 1024-based unit prefix, e.g. 2048 -> '2.0 KB'.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Z' are exhausted, in which case 'Y' is used whatever the
    remaining magnitude is. `suffix` is appended after the prefix.
    """
    prefixes = ['', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
    magnitude = num
    index = 0
    # step through the prefixes while the value is still too large for the current one
    while index < len(prefixes) and not abs(magnitude) < 1024.0:
        magnitude = magnitude / 1024.0
        index += 1

    if index == len(prefixes):
        # ran out of prefixes: report in the largest unit
        return "%.1f %s%s" % (magnitude, 'Y', suffix)
    return "%3.1f %s%s" % (magnitude, prefixes[index], suffix)

def memusage():
    """Return this process's peak resident set size as a formatted string.

    The value is returned, not printed. ``ru_maxrss`` is reported in bytes on
    macOS but in kilobytes on Linux, so it is normalised to bytes before
    formatting; without that the figure is 1024 times too small on Linux.
    """
    import sys  # local import: only needed for the platform check

    peak = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    if sys.platform != 'darwin':
        # kilobytes -> bytes
        peak *= 1024
    return sizeof_fmt(peak)

# Baseline reading taken before any model is built; the returned string is
# displayed as this cell's output.
memusage()


'224.8 MB'

In [4]:
class Agent:
    """Abstract base class for agents that act in the module-level gym ``env``.

    Subclasses provide ``build_model``, ``remember``, ``train``, ``act``,
    ``save`` and ``load``. They must also create ``self.results`` (a list with
    one timestep count per finished episode) and ``self.save_interval``
    themselves, because this class's ``__init__`` always raises.
    """

    def __init__(self, state_size, action_size, filename="model",
                 *args, **kwargs):
        """Record the common attributes, then refuse instantiation.

        Raises:
            NotImplementedError: always; concrete agents define their own
                ``__init__``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.filename = filename
        self.timestep = 0
        self.save_interval = 10

        raise NotImplementedError

    def build_model(self, *args, **kwargs):
        """build the relevant model"""
        raise NotImplementedError

    def reset(self):
        """reset agent for start of episode"""
        self.timestep = 0

    def increment_time(self):
        """increment timestep counter"""
        self.timestep += 1

    def remember(self, *args, **kwargs):
        """store the states and rewards needed to fit the model"""
        raise NotImplementedError

    def train(self, *args, **kwargs):
        """train the model on experience stored by remember"""
        raise NotImplementedError

    def act(self, *args, **kwargs):
        """pick an action using model"""
        raise NotImplementedError

    def save_score(self):
        """Append the current episode's timestep count to ``self.results``."""
        self.results.append(self.timestep)

    def score_episode(self, episode_num, n_episodes):
        """Record the finished episode and print a one-line progress report.

        Args:
            episode_num: zero-based index of the episode within the current
                run (printed one-based).
            n_episodes: total number of episodes in the current run.

        The line ends with a carriage return so that the next report
        overwrites it in place.
        """
        self.save_score()
        # average over the last save_interval episodes, or fewer early on
        avglen = min(len(self.results), self.save_interval)
        formatstr = "{} episode {}: {}/{}, score: {}, {}-episode avg: {:.1f} Memory: {}        "
        print(formatstr.format(time.strftime("%H:%M:%S"), len(self.results),
                               episode_num+1, n_episodes, self.timestep, avglen,
                               sum(self.results[-avglen:])/avglen, memusage()),
              end="\r", flush=False)

    def run_episode(self, render=RENDER):
        """Play one full episode in ``env``, storing each step, then train.

        Args:
            render: when true, draw the environment before every step.

        The current transition is exposed to ``remember`` through
        ``self.state``, ``self.action``, ``self.reward``, ``self.next_state``
        and ``self.done``.
        """
        global env

        self.reset()
        self.state = env.reset()
        self.done = False

        while not self.done:
            if render:
                env.render()
            self.action = self.act(self.state.reshape([1, self.state_size]))
            self.next_state, self.reward, self.done, _ = env.step(self.action)
            # should get extra reward for max + not done vs. max + done
            # (timestep is incremented after this check, so the last step of a
            # MAX_TIMESTEPS-long episode is seen here as MAX_TIMESTEPS - 1)
            if self.done and self.timestep == (MAX_TIMESTEPS -1):
                self.reward += WIN_REWARD

            self.remember()
            self.state = self.next_state
            self.increment_time()

        # train
        self.train()

    def save(self, *args, **kwargs):
        """save agent to disk"""
        raise NotImplementedError

    @staticmethod
    def load(*args, **kwargs):
        """load agent from disk

        Declared static because it takes no ``self``: it is meant to be
        called on the class to produce an agent.
        """
        raise NotImplementedError

    def view(self):
        """Run an episode without training, with rendering.

        Prints the episode's total reward and returns the number of timesteps
        played. The environment's render window is closed even if the episode
        is interrupted or raises.
        """
        state = env.reset()
        state = np.reshape(state, [1, self.state_size])
        done = False

        # run an episode
        self.timestep = 0
        r = 0
        try:
            while not done:
                env.render()
                action = self.act(state)
                state, reward, done, _ = env.step(action)
                r += reward
                state = np.reshape(state, [1, self.state_size])
                self.timestep += 1
            print(r)
        finally:
            # always release the viewer, e.g. after a KeyboardInterrupt
            env.close()
        return self.timestep

    def rlplot(self, title='Cartpole Agent Training Progress'):
        """Plot per-episode timesteps and their 10-episode rolling mean.

        Args:
            title: figure title.

        Returns:
            Whatever ``fig.show()`` returns; the figure is displayed as a
            side effect.
        """
        df = pd.DataFrame({'timesteps': self.results})
        df['avg'] = df['timesteps'].rolling(10).mean()

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['timesteps'],
                                 mode='markers',
                                 name='timesteps',
                                 marker=dict(
                                     color='mediumblue',
                                     size=4,
                                 ),
                                ))

        fig.add_trace(go.Scatter(x=df.index,
                                 y=df['avg'],
                                 mode='lines',
                                 line_width=3,
                                 name='moving average'))

        fig.update_layout(
            title=dict(text=title,
                       x=0.5,
                       xanchor='center'),
            xaxis=dict(
                title="Episodes",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            yaxis=dict(
                title="Completed Timesteps",
                linecolor='black',
                linewidth=1,
                mirror=True
            ),
            legend=go.layout.Legend(
                x=0.01,
                y=0.99,
                traceorder="normal",
                font=dict(
                    family="sans-serif",
                    size=12,
                    color="black"
                ),
                #bgcolor="LightSteelBlue",
                bordercolor="Black",
                borderwidth=1,
            ),
        )

        return fig.show()


In [5]:
class REINFORCE_Agent(Agent):
    """REINFORCE policy gradient agent using a small dense Keras network.

    Two Keras models are built over the same layers: ``predict_model`` maps a
    state to action probabilities, and ``train_model`` additionally takes the
    per-step discounted returns as an input so that the loss can weight each
    step's log-likelihood by its return.
    """

    # attributes holding Keras models; persisted via HDF5 rather than pickle
    _MODEL_ATTRS = ("train_model", "predict_model")

    def __init__(self, state_size=4, action_size=2, learning_rate=0.0005,
                 discount_rate=0.98, n_hidden_layers=2, hidden_layer_size=16,
                 activation='relu', reg_penalty=0, dropout=0, filename="kreinforce",
                 verbose=True):
        """Store hyperparameters, build both models and clear episode memory.

        Args:
            state_size: length of the observation vector.
            action_size: number of discrete actions.
            learning_rate: Adam learning rate.
            discount_rate: factor applied per step when accumulating returns.
            n_hidden_layers: number of hidden Dense layers.
            hidden_layer_size: units in each hidden layer.
            activation: activation of the hidden layers.
            reg_penalty: L2 penalty on the hidden layers' kernels.
            dropout: dropout rate between hidden layers (0 disables it).
            filename: base name used by ``save``.
            verbose: print the layer configuration and model summary.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.action_space = list(range(action_size))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.n_hidden_layers = n_hidden_layers
        self.hidden_layer_size = hidden_layer_size
        self.activation = activation
        self.reg_penalty = reg_penalty
        self.dropout = dropout
        self.verbose = verbose
        self.filename = filename

        self.train_model, self.predict_model = self.build_model()
        self.results = []
        self.save_interval = 10
        self.reset()

    def reset(self):
        """Zero the timestep counter and drop the previous episode's memory."""
        self.timestep = 0
        # truncate memory
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def build_model(self):
        """Build the training and prediction models over shared layers.

        Returns:
            ``(train_model, predict_model)``. ``train_model`` takes
            ``[state, discounted_reward]`` and is compiled with Adam and the
            policy-gradient loss; ``predict_model`` takes ``[state]`` and is
            left uncompiled.
        """

        def custom_loss(y_true, y_pred):
            # y_true is the one-hot action taken; clip to keep log() finite
            y_pred_clip = K.clip(y_pred, 1e-8, 1-1e-8)
            log_likelihood = y_true*K.log(y_pred_clip)
            # discounted_rewards is the extra Input tensor defined below
            return K.sum(-log_likelihood*discounted_rewards)

        inputs = Input(shape=(self.state_size,), name="Input")
        discounted_rewards = Input(shape=(1,), name="Discounted_rewards")
        last_layer = inputs

        for i in range(self.n_hidden_layers):
            if self.verbose:
                formatstr = "layer %d size %d, %s, reg_penalty %.8f, dropout %.3f"
                print(formatstr % (i + 1,
                                   self.hidden_layer_size,
                                   self.activation,
                                   self.reg_penalty,
                                   self.dropout,
                                   ))
            # add dropout, but not on inputs, only between hidden layers
            if i and self.dropout:
                last_layer = Dropout(self.dropout, name="Dropout%02d" % i)(last_layer)

            last_layer = Dense(units=self.hidden_layer_size,
                               activation=self.activation,
                               kernel_initializer=glorot_uniform(),
                               kernel_regularizer=keras.regularizers.l2(self.reg_penalty),
                               name="Dense%02d" % i)(last_layer)

        outputs = Dense(self.action_size, activation='softmax', name="Output")(last_layer)

        train_model = Model(inputs=[inputs, discounted_rewards], outputs=[outputs])
        train_model.compile(optimizer=Adam(lr=self.learning_rate), loss=custom_loss)

        predict_model = Model(inputs=[inputs], outputs=[outputs])

        if self.verbose:
            print(predict_model.summary())

        return train_model, predict_model

    def act(self, state):
        """Sample an action from the policy's probabilities for `state`.

        Args:
            state: array passed straight to ``predict_model.predict``; only
                the first row of the prediction is used.
        """
        probabilities = self.predict_model.predict(state)
        action = np.random.choice(self.action_space, p=probabilities[0])
        return action

    def remember(self):
        """Append the current state, action and reward to episode memory."""
        self.state_memory.append(self.state)
        self.action_memory.append(self.action)
        self.reward_memory.append(self.reward)

    def train(self):
        """Fit the policy on the remembered episode with one batch update.

        Returns:
            The value returned by ``train_on_batch``, or ``None`` when no
            steps have been remembered.
        """
        if not self.reward_memory:
            # nothing to learn from
            return None

        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        # force float so integer rewards are not truncated (or rejected by the
        # in-place standardisation) below
        reward_memory = np.array(self.reward_memory, dtype=float)

        # one-hot encode the actions taken
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        # discounted return from each step to the end of the episode
        discounted_rewards = np.zeros_like(reward_memory)
        cumulative_rewards = 0.0
        for i in reversed(range(len(reward_memory))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + reward_memory[i]
            discounted_rewards[i] = cumulative_rewards

        # standardize
        discounted_rewards -= np.mean(discounted_rewards)
        spread = np.std(discounted_rewards)
        if spread > 0:
            discounted_rewards /= spread

        # train
        cost = self.train_model.train_on_batch([state_memory, discounted_rewards], actions)

        return cost

    def save(self):
        """Save the agent to ``OUTPUT_DIR``.

        Writes ``<name>_predict.h5`` (the prediction model in Keras's native
        format) and ``<name>.p`` (a pickle of every other attribute), where
        ``<name>`` is ``OUTPUT_DIR + filename + <episode count, 4 digits>``.
        """
        fullname = "%s%s%04d" % (OUTPUT_DIR, self.filename, len(self.results))
        self.predict_model.save("%s_predict.h5" % fullname)
        # can't save / load train model due to custom loss, so the Keras
        # models are left out of the pickle and rebuilt by load()
        snapshot = self.__class__.__new__(self.__class__)
        snapshot.__dict__.update({key: value
                                  for key, value in self.__dict__.items()
                                  if key not in self._MODEL_ATTRS})
        with open("%s.p" % fullname, "wb") as handle:
            pickle.dump(snapshot, handle)

    @staticmethod
    def load(filename, memory=True):
        """Load an agent written by ``save``.

        Args:
            filename: path prefix, i.e. without the ``.p`` /
                ``_predict.h5`` endings.
            memory: accepted for interface compatibility; currently unused.

        Returns:
            The restored agent. Its models are rebuilt from the pickled
            hyperparameters and given the saved weights, so ``train_model``
            and ``predict_model`` keep sharing layers.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files
        with open("%s.p" % filename, "rb") as handle:
            self = pickle.load(handle)

        # inference-only copy used purely as a source of weights
        saved_policy = load_model("%s_predict.h5" % filename, compile=False)
        self.train_model, self.predict_model = self.build_model()
        self.predict_model.set_weights(saved_policy.get_weights())

        print("loaded %d results, %d rows of memory" % (len(self.results),
                                                        len(self.state_memory)))
        return self


In [6]:
env = gym.make('CartPole-v1')
# Give both bounds explicitly: a single positional argument is taken as `low`
# with the default high=1.0, and numpy documents low > high as undefined.
env.seed(int(np.random.uniform(0, 10000)))

agent = REINFORCE_Agent(state_size=env.observation_space.shape[0],
                        action_size=env.action_space.n,
                        learning_rate=0.0005,
                        discount_rate=0.99,)
# Swap in a previously trained policy. It is only used for prediction, so skip
# compilation, which also avoids the "model was *not* compiled" warning.
agent.predict_model = load_model("reinforce.h5", compile=False)



[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
layer 1 size 16, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 16, relu, reg_penalty 0.00000000, dropout 0.000
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 4)                 0         
_________________________________________________________________
Dense00 (Dense)              (None, 16)                80        
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
Output (Dense)               (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________
None



Parameters to load are deprecated.  Call .resolve and .require separately.


No training configuration found in save file: the model was *not* compiled. Compile it manually.



In [7]:
# Render one episode with the loaded policy, without training. view() prints
# the episode's total reward and returns the number of timesteps played.
agent.view()

500.0


500