In [1]:
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN

import os
import random
import time
import resource
import pickle
import math

import pdb

import numpy as np
import pandas as pd

import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras.models import Model, Sequential, load_model
# from tensorflow.keras.layers import Input, Dense, Dropout
# from tensorflow.keras.optimizers import Adam
# import tensorflow.keras.backend as K
import keras
from keras.models import Model, Sequential, load_model
from keras.layers import Input, Dense, Dropout
from keras.optimizers import Adam
import keras.backend as K

import plotly
import plotly.express as px
import plotly.graph_objects as go

from IPython.display import clear_output, display, HTML

# requires python 3.6
# conda install -c akode gym
import gym

# seed every random number generator the notebook uses, for reproducibility
# np.random.uniform(0,10000) 4465
GLOBAL_SEED = 4465
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(GLOBAL_SEED)

# report the version of each library, one per line
for _fmt, _lib in (("TensorFlow %s", tf),
                   ("Keras %s", keras),
                   ("gym %s", gym),
                   ("plotly %s", plotly),
                   ("pandas %s", pd),
                   ("numpy %s", np)):
    print(_fmt % _lib.__version__)


Using TensorFlow backend.


TensorFlow 2.0.0
Keras 2.3.1
gym 0.10.5
plotly 4.1.1
pandas 0.25.2
numpy 1.17.2


In [2]:
class DQN_Agent:
    """Deep Q-Network agent: epsilon-greedy policy over a small Keras MLP with experience replay.

    Every environment step is stored as one row of a pandas DataFrame
    (``self.memory``).  ``train`` refits the network on a random sample of
    those rows, using the observed reward plus the discounted predicted value
    of the observed next state as the target for the action actually taken.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        state_size: length of one observation vector (network input width)
        action_size: number of discrete actions (network output width)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.98            # discount applied to the predicted next-state value
        self.epsilon = 1.0           # current probability of taking a random action
        self.epsilon_decay = 0.995   # multiplicative decay applied once per train() call
        self.epsilon_min = 0.01      # decay stops once epsilon is at or below this
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.memory = pd.DataFrame(columns=["state", "action", "next_state", "reward", "done"])
        self.memory_size=200000      # max transitions kept; older rows are dropped in train()
        self.results = []            # timesteps recorded for each scored episode
        self.train_batch_size=8
        self.timestep=0
        self.save_interval=10        # also the window of the running average in score_episode

    def build_model(self,
                    n_hidden_layers=2,
                    hidden_layer_size=16,
                    activation='relu',
                    reg_penalty=0.001,
                    dropout=0.0675,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size
        """

        inputs = Input(shape=(self.state_size,), name="Input")
        last_layer = inputs

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1,
                                                                                hidden_layer_size,
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                last_layer = Dropout(dropout, name = "Dropout%02d" % i)(last_layer)

            last_layer = Dense(units = hidden_layer_size,
                               activation = activation,
                               kernel_initializer = keras.initializers.glorot_uniform(),
                               kernel_regularizer=keras.regularizers.l2(reg_penalty),
                               name = "Dense%02d" % i)(last_layer)

        # linear output: one unbounded action-value estimate per action
        outputs = Dense(self.action_size, activation='linear', name = "Output")(last_layer)

        model = Model(inputs=inputs, outputs=outputs)

        if verbose:
            # summary() prints the table itself and returns None, so wrapping it
            # in print() only added a stray "None" line to the output
            model.summary()

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to replay memory.

        state and next_state are indexed with [0] before storing, i.e. the
        caller passes them as a batch of one and only that single row is kept.
        """
        # Label the new row one past the current last label rather than with the
        # row count: once train() has dropped old rows from the front, the labels
        # no longer start at 0, so the row count names an existing row and the
        # assignment would overwrite it instead of appending.
        next_label = self.memory.index[-1] + 1 if len(self.memory) else 0
        self.memory.loc[next_label]=[state[0], action, next_state[0], reward, done]

    def train(self, sample_size, start_epoch=0):
        """Fit the network once on a random sample of remembered transitions.

        sample_size: number of rows drawn (without replacement) from memory
        start_epoch: epoch counter handed to Keras as ``initial_epoch``

        Also decays epsilon by one step while it is above ``epsilon_min``.
        """
        # truncate memory to the most recent memory_size transitions
        if len(self.memory) > self.memory_size:
            self.memory = self.memory.iloc[-self.memory_size:].copy()
        # sample sample_size observations from memory
        minibatch = self.memory.sample(n=sample_size)

        # start from the model's own estimate of every action's value
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((sample_size, self.state_size))
        Y_pred = self.model.predict(X_fit)

        # we don't just fit model against model's own prediction, that would get us nowhere
        # we improve the target by what we learned about the action we actually took:
        # value is reward obtained + predicted value of the observed next state
        # if done, target is just the stored reward
        # (the training loop adds a win bonus to the reward of a final step that
        # reaches the timestep limit)
        targets = minibatch['reward'].to_numpy(dtype=float).copy()
        not_done = ~minibatch['done'].to_numpy(dtype=bool)
        # if not done, add gamma discount rate * Q-value prediction for the observed next state
        # guard: a sample made only of terminal rows has no next states to evaluate,
        # and np.concatenate raises on an empty sequence
        if not_done.any():
            X_observed = np.concatenate(minibatch['next_state'].values[not_done])
            X_observed = X_observed.reshape((int(not_done.sum()), self.state_size))
            # run all predictions at once
            # iterates faster but does not train after each prediction
            y_observed_pred = np.amax(self.model.predict(X_observed), axis=1)
            targets[not_done] += self.gamma * y_observed_pred
        # vectorized vlookup - update y_pred column specified by action using the target
        np.put_along_axis(Y_pred,
                          minibatch['action'].to_numpy(dtype=int).reshape(sample_size,1),
                          targets.reshape(sample_size,1),
                          axis=1)
        # fit model against improved target
        # arbitrary 8 batch size to reduce variance a little and speed up fit
        # Keras treats `epochs` as the index of the final epoch, so one pass
        # starting at start_epoch needs epochs=start_epoch+1 (epochs=1 would
        # train nothing whenever start_epoch >= 1)
        self.model.fit(X_fit, Y_pred,
                       epochs=start_epoch + 1, initial_epoch=start_epoch,
                       batch_size=self.train_batch_size,
                       verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Pick an action: random with probability epsilon, otherwise the highest predicted value."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def reset(self):
        """Zero the per-episode timestep counter."""
        self.timestep = 0

    def increment_time(self):
        """Count one more timestep in the current episode."""
        self.timestep +=1

    def score_episode(self, e, n_episodes):
        """Record the finished episode's timestep count and print a one-line progress report.

        The line ends with a carriage return so successive reports overwrite each other.
        """
        self.save_score()
        avglen=min(len(self.results), self.save_interval)
        print("{} episode {}: {}/{}, score: {}, {}-episode avg: {:.1f} epsilon: {:.02} Memory: {}        "
              .format(time.strftime("%H:%M:%S"), len(self.results), e+1, n_episodes, self.timestep,
                      avglen, sum(self.results[-avglen:])/avglen, self.epsilon, memusage()),
              end="\r", flush=False)

    def save_score(self):
        """Append the current episode's timestep count to results."""
        self.results.append(self.timestep)

    def load(self, filename, memory=True):
        """Restore model, results and epsilon from ``<filename>.h5`` / ``<filename>.p``.

        memory: when True (default) also restore replay memory if the checkpoint has it
        """
        self.model = load_model("%s.h5" % filename)
        # NOTE: unpickling runs arbitrary code - only load checkpoints you wrote yourself
        with open("%s.p" % filename, "rb") as checkpoint:
            pickledict = pickle.load(checkpoint)
        if memory and 'memory' in pickledict:
            self.memory = pickledict['memory']
        self.results = pickledict['results']
        self.epsilon = pickledict['epsilon']
        print("loaded %d results, %d rows of memory, epsilon %.4f" % (len(self.results),
                                                                      len(self.memory),
                                                                      self.epsilon))

    def save(self, pathname, memory=True):
        """Write a checkpoint named ``<pathname><number of results, zero-padded to 4>``.

        The Keras model goes to ``.h5``; results, epsilon and (when ``memory``
        is True, the default) replay memory go to ``.p``.
        """
        fullname = "%s%04d" % (pathname, len(self.results))
        self.model.save("%s.h5" % fullname)
        pickledict = {
            'results': self.results,
            'epsilon': self.epsilon,
        }
        if memory:
            pickledict['memory'] = self.memory
        with open("%s.p" % fullname, "wb") as checkpoint:
            pickle.dump(pickledict, checkpoint)
        #print("saved model to %s" % fullname)

        

In [3]:
#https://gym.openai.com/envs/CartPole-v1/
# environment, seeded with the notebook-wide seed
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)

# network input/output widths are read off the environment's spaces
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# training-run settings
n_episodes = 400
max_timesteps = 500
win_reward = 10
sample_size = 128

# checkpoint directory, created if it is not there yet
output_dir = 'model_output/cartpole/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m



Parameters to load are deprecated.  Call .resolve and .require separately.



In [4]:
# Build the DQN agent for this environment; the constructor also builds the Keras
# model and, because build_model defaults to verbose=True, prints its layer summary.
agent = DQN_Agent(state_size, action_size)

layer 1 size 16, relu, reg_penalty 0.00100000, dropout 0.068
layer 2 size 16, relu, reg_penalty 0.00100000, dropout 0.068
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           (None, 4)                 0         
_________________________________________________________________
Dense00 (Dense)              (None, 16)                80        
_________________________________________________________________
Dropout01 (Dropout)          (None, 16)                0         
_________________________________________________________________
Dense01 (Dense)              (None, 16)                272       
_________________________________________________________________
Output (Dense)               (None, 2)                 34        
Total params: 386
Trainable params: 386
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
#load earlier model
# start_epoch=400
# loadmodel = '%04d' % start_epoch
# agent.load(output_dir + 'model_' + loadmodel)
# n_episodes = 400


In [6]:
def sizeof_fmt(num, suffix='B'):
    """Format a quantity with a 1024-based unit prefix, e.g. 1536 -> '1.5 KB'.

    num is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Z' run out, in which case the result is reported in 'Y'.
    """
    prefixes = ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z')
    step = 0
    while step < len(prefixes) and not abs(num) < 1024.0:
        num = num / 1024.0
        step += 1
    if step == len(prefixes):
        return "%.1f %s%s" % (num, 'Y', suffix)
    return "%3.1f %s%s" % (num, prefixes[step], suffix)

def memusage():
    """Return this process's peak resident set size as a human-readable string.

    getrusage reports ru_maxrss in bytes on macOS but in kilobytes on Linux,
    so the raw value is scaled to bytes before formatting; otherwise the
    figure is understated by a factor of 1024 off macOS.
    """
    import sys  # local import: sys is not among the notebook's top-level imports
    peak = int(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    if sys.platform != 'darwin':
        peak *= 1024
    return sizeof_fmt(peak)

memusage()

'262.0 MB'

In [None]:
# DQN training run: play n_episodes episodes, storing every step in replay memory,
# fitting the network once after each episode and checkpointing periodically.
# run faster without rendering
RENDER=False

for e in range(n_episodes):
    # fresh episode; the observation is reshaped to a batch of one for model.predict
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    agent.reset()
    done = False
    
    # run an episode
    while not done:
        if RENDER:
            env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # should get extra reward for max + not done vs. max + done
        # agent.timestep has not been incremented for this step yet, hence the -1
        if done and agent.timestep == (max_timesteps -1):
            reward += win_reward 
            
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.increment_time()
    
    # after episode: record the score and print the progress line
    agent.score_episode(e, n_episodes)
    
    # train once memory holds more than two samples' worth of steps,
    # on the larger of sample_size and 5% of memory
    if len(agent.memory) > sample_size*2:
        agent.train(max(sample_size, int(agent.memory.shape[0] *0.05)))

    # save every so often (every agent.save_interval episodes)
    if e and (e+1) % agent.save_interval == 0:
        agent.save(output_dir + "model_")


14:13:01 episode 6: 6/400, score: 27, 6-episode avg: 18.7 epsilon: 1.0 Memory: 262.7 MB        

In [None]:
# per-episode scores with a 10-episode rolling mean (NaN for the first 9 rows)
df = pd.DataFrame({'timesteps': agent.results})
df['avg'] = df['timesteps'].rolling(10).mean() 
df


In [None]:
# chart timesteps vs. episodes
def rlplot(agent, title='Cartpole DQN Agent Training Progress', window=10):
    """Chart an agent's per-episode results with a rolling average.

    agent: any object with a ``results`` list (one number per episode)
    title: chart title; defaults to the original DQN title so existing calls
           are unchanged, but other agents can now be labelled correctly
    window: number of episodes in the moving average (default 10, as before)

    Returns whatever ``fig.show()`` returns.
    """
    df = pd.DataFrame({'timesteps': agent.results})
    df['avg'] = df['timesteps'].rolling(window).mean()

    fig = go.Figure()
    # raw per-episode results as small markers
    fig.add_trace(go.Scatter(x=df.index,
                             y=df['timesteps'],
                             mode='markers',
                             name='timesteps',
                             marker=dict(
                                 color='mediumblue',
                                 size=4,
                             ),
                            ))

    # smoothed trend on top
    fig.add_trace(go.Scatter(x=df.index,
                             y=df['avg'],
                             mode='lines',
                             line_width=3,
                             name='moving average'))

    fig.update_layout(
        title= dict(text=title,
                    x=0.5,
                    xanchor='center'),
        xaxis=dict(
            title="Episodes",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        yaxis=dict(
            title="Completed Timesteps",
            linecolor='black',
            linewidth=1,
            mirror=True
        ),
        legend=go.layout.Legend(
            x=0.01,
            y=0.99,
            traceorder="normal",
            font=dict(
                family="sans-serif",
                size=12,
                color="black"
            ),
            #bgcolor="LightSteelBlue",
            bordercolor="Black",
            borderwidth=1,
        ),
    )

    return fig.show()

# reload the checkpoint whose file name carries the 4-digit suffix 0400
# (save() names checkpoints by the number of recorded results) and chart its history
start_epoch=400
loadmodel = '%04d' % start_epoch
agent.load(output_dir + 'model_' + loadmodel)
rlplot(agent)



In [None]:
# same as the previous cell, for the checkpoint with suffix 1000
start_epoch=1000
loadmodel = '%04d' % start_epoch
agent.load(output_dir + 'model_' + loadmodel)
rlplot(agent)

In [None]:
# training as above does well up to a point but not very stable
# sometimes performance goes off a cliff esp with more complex NNs like 2x32
# continuing to train sometimes results in forgetting what it learned
# also on my machine tensorflow leaks memory, can't train long without restarting
# trained repeatedly, when it fell off a cliff restarted using best previous model
# early stopping after achieving a model that wins many times in a row,
# saved best model, run it here without epsilon random exploration, or training

agent.load('good_new')
# greedy policy only: act() never takes the random branch in practice with epsilon 0
agent.epsilon = 0.0
# summary() prints the table itself and returns None, so it is not wrapped in print()
agent.model.summary()
RENDER=True
N_EVAL_EPISODES = 10

# NOTE: score_episode appends to agent.results, which load() just restored, so
# the running average it prints mixes the saved results with these episodes
for e in range(N_EVAL_EPISODES):

    state = env.reset()
    state = np.reshape(state, [1, state_size])
    agent.reset()
    done = False

    while not done:
        if RENDER:
            env.render()
        action = agent.act(state)
        state, reward, done, _ = env.step(action)
        state = state.reshape([1, state_size])
        agent.increment_time()

    # report progress against the number of evaluation episodes
    # (previously printed "x/400" using the training-run n_episodes)
    agent.score_episode(e, N_EVAL_EPISODES)

    # don't train or save after each episode

    # don't train or save after each episode
    


In [None]:
# REINFORCE is a policy gradient method
# 2 changes vs. DQN
# 1) Monte Carlo instead of temporal difference learning:
#    after each episode, compute rewards for trajectory runout
#    train on the full last episode
#    then throw it away (no resampling history)
# 2) Use logistic regression first instead of deep NN
#    DQN trains against estimate of Q value
#    REINFORCE trains against reward
#    To update logistic regression theta
#      compute gradients of all logistic function (sigmoid) outputs w.r.t. thetas
#      compute discounted reward for each observation
#      for each action compute average gradient weighted by reward (gradient of average reward wrt theta)
#      update each theta by that amount times the learning rate

# only 4 params, runs fast and solves consistently after about 500 episodes
# logistic regression only
# could add hidden layers to make NN but would need to backprop the gradient
# tried to use keras NN but the loss function is tricky, not quite working

# https://mcneela.github.io/math/2018/04/18/A-Tutorial-on-the-REINFORCE-Algorithm.html
# https://karpathy.github.io/2016/05/31/rl/
# code mostly from
# https://github.com/jklaise/personal_website/blob/master/notebooks/rl_policy_gradients.ipynb

In [None]:
class LogisticAgent:
    """REINFORCE policy-gradient agent whose policy is a logistic regression over two actions.

    ``theta`` is the weight vector; the probability of action 0 for an
    observation ``x`` is ``logistic(x @ theta)`` and action 1 gets the rest.
    """

    def __init__(self, theta, learning_rate, discount_rate):
        """Initialize parameter vector theta, learning rate and discount_rate"""
        # keep a private float copy: train() updates theta in place and must not
        # silently modify the array the caller passed in
        self.theta = np.array(theta, dtype=float)
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.save_interval=10   # window of the running average in score_episode
        self.results=[]

    def logistic(self, y):
        """logistic function, squash -infinity to +infinity to prob between 0 and 1"""
        try:
            return 1/(1 + math.exp(-y))
        except OverflowError:
            # exp(-y) exceeds the float range only for very negative y,
            # where the logistic function is 0 to machine precision
            return 0.0

    def act(self, x):
        """predict probas using theta, sample an action from probabilities

        Returns (action, probability that was assigned to that action).
        """
        y = x @ self.theta
        prob0 = self.logistic(y)
        probs = np.array([prob0, 1-prob0])
        action = np.random.choice([0, 1], p=probs)
        return action, probs[action]

    def grad_log_p(self, x):
        """calculate gradient vector of log-probas

        Returns (gradient of log P(action 0), gradient of log P(action 1)) w.r.t. theta.
        """
        y = x @ self.theta
        grad_log_p0 = x - x*self.logistic(y)
        grad_log_p1 = - x*self.logistic(y)
        return grad_log_p0, grad_log_p1

    def discount_rewards(self, rewards):
        """calculate discounted rewards

        Entry i is rewards[i] plus discount_rate times entry i+1, accumulated
        from the end of the episode backwards.
        """
        discounted_rewards = np.zeros(len(rewards))
        cumulative_rewards = 0
        for i in reversed(range(len(rewards))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + rewards[i]
            discounted_rewards[i] = cumulative_rewards
        return discounted_rewards

    def train(self, rewards, obs, actions):
        """update thetas based on gradients, discounted rewards, learning rate"""
        # nothing to learn from an empty episode
        if len(rewards) == 0:
            return

        # calculate gradients for each action you actually took
        # how much to adjust theta to increase prob of that action
        grad_log_p = np.array([self.grad_log_p(ob)[action] for ob,action in zip(obs,actions)])

        # calculate discounted rewards
        discounted_rewards = self.discount_rewards(rewards)
        # standardize; a constant-return episode (e.g. a single step) has zero
        # spread, and dividing by it would turn theta into NaN
        spread = np.std(discounted_rewards)
        discounted_rewards = (discounted_rewards - np.mean(discounted_rewards)) / (spread if spread > 0 else 1)

        # gradients times discounted rewards
        # average gradient over all obs weighted by reward
        # how much to update theta to increase reward
        update_target = grad_log_p.T @ discounted_rewards

        # update theta
        self.theta += self.learning_rate*update_target

    def save_score(self, reward):
        """Append one episode's score to results."""
        self.results.append(reward)

    def score_episode(self, e, n_episodes):
        """Print a one-line progress report for the most recently saved score.

        The line ends with a carriage return so successive reports overwrite each other.
        """
        avglen=min(len(self.results), self.save_interval)
        print("{} episode {}/{}:, score: {}, {}-episode avg: {:.1f}        "
              .format(time.strftime("%H:%M:%S"), e+1, n_episodes, self.results[-1],
                      avglen, sum(self.results[-avglen:])/avglen),
              end="\r", flush=False)
        

In [None]:
MAX_TIMESTEPS = 500
WIN_REWARD=10

def run_episode(env, agent, render=False):
    """Play one episode with `agent` in `env` and hand back everything that happened.

    The agent's act(observation) must return (action, probability).  The
    episode's total reward is passed to agent.save_score before returning.

    Returns (total reward, rewards, observations, actions, probs), the last
    four as numpy arrays with one entry per step.
    """
    obs = env.reset()
    episode_return = 0
    obs_log, action_log, reward_log, prob_log = [], [], [], []

    finished = False
    while not finished:
        if render:
            env.render()

        # log the observation the decision is based on, before stepping
        obs_log.append(obs)
        chosen, chosen_prob = agent.act(obs)
        obs, step_reward, finished, _ = env.step(chosen)

        # bonus when the episode ends with the running total at MAX_TIMESTEPS - 1
        if finished and episode_return == (MAX_TIMESTEPS - 1):
            step_reward += WIN_REWARD

        episode_return += step_reward
        reward_log.append(step_reward)
        action_log.append(chosen)
        prob_log.append(chosen_prob)

    agent.save_score(episode_return)

    return (episode_return,
            np.array(reward_log),
            np.array(obs_log),
            np.array(action_log),
            np.array(prob_log))

In [None]:
MAX_EPISODES=1000
RENDER=False

def run_experiment(theta, learning_rate, discount_rate, AgentClass, MAX_EPISODES=1000, seed=None):
    """Train a freshly built agent on CartPole-v1 for MAX_EPISODES episodes.

    AgentClass is called as AgentClass(theta, learning_rate, discount_rate).
    When seed is not None it is used to seed the environment.  Rendering
    follows the notebook-level RENDER flag.

    Returns (list of total rewards, one per episode; the trained agent).
    """
    # set up the environment and the policy
    env = gym.make('CartPole-v1')
    if seed is not None:
        env.seed(seed)
    agent = AgentClass(theta, learning_rate, discount_rate)
    reward_history = []

    for episode_idx in range(MAX_EPISODES):
        # play one episode; the sampled-action probabilities are not needed here
        outcome = run_episode(env, agent, render=RENDER)
        total_reward, rewards, observations, actions = outcome[:4]
        reward_history.append(total_reward)

        # one policy-gradient update from this episode, then the progress line
        agent.train(rewards, observations, actions)
        agent.score_episode(episode_idx, MAX_EPISODES)

    return reward_history, agent


# train a LogisticAgent starting from a random 4-element theta;
# note this rebinds the notebook-level `agent` to the returned agent
episode_rewards, agent = run_experiment(theta=np.random.rand(4),
                                        learning_rate=0.1,
                                        discount_rate=0.975,
                                        AgentClass=LogisticAgent,
                                        MAX_EPISODES=MAX_EPISODES,
                                        seed=GLOBAL_SEED,
                                       )

In [None]:
# view trained agent: one rendered episode in a fresh, unseeded environment
# (run_episode also appends this episode's total reward to agent.results)
env = gym.make('CartPole-v1')
total_reward, rewards, observations, actions, probs = run_episode(env, agent, render=True)

In [None]:
# chart the agent's recorded results; for run_episode-driven agents these are total
# episode rewards, and rlplot's default title still reads "Cartpole DQN Agent Training Progress"
rlplot(agent)

In [None]:
# Keras REINFORCE policy gradient method

class REINFORCE_Agent:
    """REINFORCE policy-gradient agent with a Keras softmax policy network.

    Two Keras models share the same layers: ``self.policy`` takes
    (states, discounted rewards) and is the one that gets trained, while
    ``self.predict`` takes states only and is used to choose actions.  One
    episode is buffered in the ``*_memory`` lists, consumed by ``train`` and
    then discarded.
    """

    def __init__(self, state_size=4, action_size=2, learning_rate=0.0005, discount_rate=0.98,
                 n_hidden_layers=2, hidden_layer_size=16, activation='relu', reg_penalty=0, dropout=0,
                 verbose=True):
        """Store hyperparameters, create empty episode buffers and build both models."""
        self.state_size = state_size
        self.action_size = action_size
        self.action_space = list(range(action_size))
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.n_hidden_layers=n_hidden_layers
        self.hidden_layer_size=hidden_layer_size
        self.activation=activation
        self.reg_penalty=reg_penalty
        self.dropout=dropout
        self.verbose=verbose

        # per-episode buffers, filled by remember() and emptied by train()
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.policy, self.predict = self.build_models()
        self.results = []
        self.timestep=0
        self.save_interval=10   # window of the running average in score_episode

    def build_models(self):
        """Build and return (train_model, predict_model), which share all layers."""

        def custom_loss(y_true, y_pred):
            # y_true is the one-hot action taken, so the product keeps only the
            # log-probability of that action; it is then weighted by the
            # discounted reward fed in through the second model input
            y_pred_clip = K.clip(y_pred, 1e-8, 1-1e-8)
            log_likelihood = y_true*K.log(y_pred_clip)

            return K.sum(-log_likelihood*discounted_rewards)

        inputs = Input(shape=(self.state_size,), name="Input")
        discounted_rewards = Input(shape=(1,), name="Discounted_rewards")
        last_layer = inputs

        for i in range(self.n_hidden_layers):
            if self.verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1,
                                                                                self.hidden_layer_size,
                                                                                self.activation,
                                                                                self.reg_penalty,
                                                                                self.dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and self.dropout:
                last_layer = Dropout(self.dropout, name = "Dropout%02d" % i)(last_layer)

            last_layer = Dense(units = self.hidden_layer_size,
                               activation = self.activation,
                               kernel_initializer = keras.initializers.glorot_uniform(),
                               kernel_regularizer=keras.regularizers.l2(self.reg_penalty),
                               name = "Dense%02d" % i)(last_layer)

        # softmax output: one probability per action
        outputs = Dense(self.action_size, activation='softmax', name = "Output")(last_layer)

        train_model = Model(inputs=[inputs, discounted_rewards], outputs=[outputs])
        # learning_rate= matches DQN_Agent in this notebook; lr= is the deprecated spelling
        train_model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss=custom_loss)

        predict_model = Model(inputs=[inputs], outputs=[outputs])

        if self.verbose:
            # summary() prints the table itself and returns None
            predict_model.summary()

        return train_model, predict_model

    def act(self, state):
        """Sample an action from the policy network's output probabilities for `state`."""
        probabilities = self.predict.predict(state)[0]
        action = np.random.choice(self.action_space, p=probabilities)
        return action

    def remember(self, state, action, reward):
        """Buffer one step of the current episode; only state[0] is stored."""
        self.state_memory.append(state[0])
        self.action_memory.append(action)
        self.reward_memory.append(reward)

    def train(self):
        """Run one policy-gradient update on the buffered episode, then clear the buffers.

        Returns the value returned by the policy model's train_on_batch.
        """
        state_memory = np.array(self.state_memory)
        action_memory = np.array(self.action_memory)
        # force float: with integer rewards a same-dtype buffer would truncate the
        # discounted sums and make the in-place standardisation below fail
        reward_memory = np.array(self.reward_memory, dtype=float)

        # one-hot encode the actions actually taken
        actions = np.zeros([len(action_memory), self.action_size])
        actions[np.arange(len(action_memory)), action_memory] = 1

        # discounted return from each step to the end of the episode
        discounted_rewards = np.zeros_like(reward_memory)
        cumulative_rewards = 0
        for i in reversed(range(len(reward_memory))):
            cumulative_rewards = cumulative_rewards * self.discount_rate + reward_memory[i]
            discounted_rewards[i] = cumulative_rewards

        # standardize (skip the division when every return is identical)
        discounted_rewards -= np.mean(discounted_rewards)
        discounted_rewards /= np.std(discounted_rewards) if np.std(discounted_rewards) > 0 else 1

        # train
        cost = self.policy.train_on_batch([state_memory, discounted_rewards], actions)

        # truncate memory
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

        return cost

    def reset(self):
        """Zero the per-episode timestep counter."""
        self.timestep = 0

    def increment_time(self):
        """Count one more timestep in the current episode."""
        self.timestep +=1

    def score_episode(self, e, n_episodes):
        """Record the finished episode's timestep count and print a one-line progress report.

        The line ends with a carriage return so successive reports overwrite each other.
        """
        self.save_score()
        avglen=min(len(self.results), self.save_interval)
        print("{} episode {}/{}:, score: {}, {}-episode avg: {:.1f} Memory: {}        "
              .format(time.strftime("%H:%M:%S"), e+1, n_episodes, self.timestep,
                      avglen, sum(self.results[-avglen:])/avglen, memusage()),
              end="\r", flush=False)

    def save_score(self):
        """Append the current episode's timestep count to results."""
        self.results.append(self.timestep)

    def load(self, filename, memory=True):
        """Restore a checkpoint written by save(); `filename` is the name without extension.

        The previous version was copied from DQN_Agent: it read 'memory' and
        'epsilon' keys that save() never writes and assigned to self.model,
        which nothing else in this class uses.

        memory: when True (default) also restore the buffered episode, if present
        """
        # policy and predict share their layers, so loading weights into predict
        # updates both.  Assumes the agent was constructed with the same layer
        # sizes as the checkpoint - TODO confirm for checkpoints from other configs
        self.predict.load_weights("%s_predict.h5" % filename)
        # NOTE: unpickling runs arbitrary code - only load checkpoints you wrote yourself
        with open("%s.p" % filename, "rb") as checkpoint:
            pickledict = pickle.load(checkpoint)
        if memory:
            self.state_memory = pickledict.get('state_memory', [])
            self.action_memory = pickledict.get('action_memory', [])
            self.reward_memory = pickledict.get('reward_memory', [])
        self.results = pickledict['results']
        print("loaded %d results, %d buffered steps" % (len(self.results),
                                                        len(self.state_memory)))

    def save(self, pathname, memory=True):
        """Write a checkpoint named ``<pathname><number of results, zero-padded to 4>``.

        Both Keras models are saved (``_train.h5`` and ``_predict.h5``); the
        results and, when ``memory`` is True (default), the episode buffers go
        to ``.p``.
        """
        fullname = "%s%04d" % (pathname, len(self.results))
        self.policy.save("%s_train.h5" % fullname)
        self.predict.save("%s_predict.h5" % fullname)
        pickledict = {
            'results': self.results,
        }
        if memory:
            pickledict['state_memory'] = self.state_memory
            pickledict['action_memory'] = self.action_memory
            pickledict['reward_memory'] = self.reward_memory
        with open("%s.p" % fullname, "wb") as checkpoint:
            pickle.dump(pickledict, checkpoint)
        #print("saved model to %s" % fullname)


In [None]:
# REINFORCE training run: play N_EPISODES episodes, updating the policy once
# after every episode from that episode alone, and checkpointing periodically.
MAX_TIMESTEPS = 500
N_EPISODES = 1000
WIN_REWARD = 10
# run faster without rendering
RENDER=False

#https://gym.openai.com/envs/CartPole-v1/
env = gym.make('CartPole-v1')
env.seed(GLOBAL_SEED)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# note: rebinds the notebook-level `agent`
agent = REINFORCE_Agent(state_size=state_size, action_size=action_size, learning_rate=0.0005, discount_rate=0.98,)

output_dir = 'model_output/cartpole/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
    
# train

for e in range(N_EPISODES):
    # fresh episode; the observation is reshaped to a batch of one for predict
    agent.reset()
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    
    # run an episode
    while not done:
        if RENDER:
            env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        # should get extra reward for max + not done vs. max + done
        # agent.timestep has not been incremented for this step yet, hence the -1
        if done and agent.timestep == (MAX_TIMESTEPS -1):
            reward += WIN_REWARD 
            
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward)
        state = next_state
        agent.increment_time()
    
    # after episode: record the score and print the progress line
    agent.score_episode(e, N_EPISODES)
    
    # train on the episode just played (train() also clears the episode buffers)
    agent.train()

    # save every so often (every agent.save_interval episodes)
    if e and (e+1) % agent.save_interval == 0:
        agent.save(output_dir + "reinforce_")


In [None]:
# chart the REINFORCE agent's per-episode timesteps
# (rlplot's default title still reads "Cartpole DQN Agent Training Progress")
rlplot(agent)

In [None]:
# view it in action: one rendered episode in a fresh, unseeded environment
# (nothing is remembered, scored or trained here)

agent.reset()
env = gym.make('CartPole-v1')
state = env.reset()
state = np.reshape(state, [1, state_size])
done = False

# run an episode
while not done:
    env.render()
    action = agent.act(state)
    state, reward, done, _ = env.step(action)
    # keep the observation shaped as a batch of one for the next act() call
    state = np.reshape(state, [1, state_size])


