In [1]:
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN

import os
from collections import deque
import random
import time
import resource

import pdb

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# requires python 3.6
# conda install -c akode gym
import gym


In [2]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.memory = pd.DataFrame(columns=["state", "action", "next_state", "reward", "done"])
        
    def build_model(self,
                    n_hidden_layers=2, 
                    hidden_layer_size=32, 
                    activation='relu',
                    reg_penalty=0.0,
                    dropout=False,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size
        """

        model = Sequential()

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1, 
                                                                                hidden_layer_size, 
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                model.add(Dropout(dropout))

            if i==0: # first layer, specify input shape
                model.add(Dense(input_shape=(state_size,),
                                units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))
            else: #use implicit input shape
                model.add(Dense(units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))

        model.add(Dense(self.action_size, activation='linear'))

        if verbose:
            print(model.summary())

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model
        
    def remember(self, state, action, reward, next_state, done):
        # append in place
        self.memory.loc[self.memory.shape[0]]=[state[0], action, next_state[0], reward, done]
        # TODO: eventually truncate last 2000
        
    def mem_predict_state(row, action):
        state = np.array([[row['state_00'],
                           row['state_01'],
                           row['state_02'],
                           row['state_03'],
                          ]])
        target_f = self.model.predict(state)
        target_f[0][action] = target

        return self.model.predict(state)

    def mem_predict_next_state(row):
        next_state = np.array([[row['next_state_00'],
                                row['next_state_01'],
                                row['next_state_02'],
                                row['next_state_03'],
                               ]])
        return np.amax(self.model.predict(next_state)[0])
    
    def train(self, batch_size):
        # get batch_size observations from memory
        minibatch = self.memory[-2000:].sample(n=batch_size)
        
        # target is best prediction of value of each action
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((batch_size, self.state_size))
        Y_pred = self.model.predict(X_fit)

        # we don't just fit against our own prediction, that gets us nowhere
        # we improve the target by what we learned about the action we actually took
        # value is reward + predicted value of the observed next state
        minibatch['target_observed'] = minibatch['reward']
        not_done = minibatch.loc[minibatch['done'] == False]
        X_observed = np.concatenate(not_done['next_state'].values)
        X_observed = X_observed.reshape((not_done.shape[0], self.state_size))
        y_observed_pred = np.amax(self.model.predict(X_observed), axis=1)
        minibatch.loc[minibatch['done'] == False, 'target_observed'] += self.gamma * y_observed_pred
        np.put_along_axis(Y_pred, 
                          minibatch['action'].astype(int).values.reshape(batch_size,1), 
                          minibatch['target_observed'].values.reshape(batch_size,1),
                          axis=1)
        # fit model against improved target
        self.model.fit(X_fit, Y_pred, epochs=1, batch_size=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def load(self, filename):
        with open('%s.json' % filename, 'r') as json_file:
            self.model = model_from_json(json_file.read())
        self.model.load_weights("%s.h5" % filename)

    def save(self, filename):
        # serialize model to JSON
        with open("%s.json" % filename, "w") as json_file:
            json_file.write(self.model.to_json())
        # serialize weights to HDF5
        self.model.save_weights("%s.h5" % filename)


In [3]:
#https://gym.openai.com/envs/CartPole-v0/
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episodes=1000
output_dir = 'model_output/cartpole/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [None]:
agent = DQNAgent(state_size, action_size)

for e in range(n_episodes):
    print ('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    timesteps = 0
    
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("{} episode: {}/{}, score: {}, epsilon: {:.02}"
                  .format(time.strftime("%H:%M:%S"), e, n_episodes, timesteps, agent.epsilon))
        timesteps +=1
    if len(agent.memory) > batch_size:
        #pdb.set_trace()
        agent.train(batch_size)
    if e % 10 == 0:
        agent.save(output_dir + "model_%.04d" % e)

layer 1 size 32, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 32, relu, reg_penalty 0.00000000, dropout 0.000
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 32)                160       
_________________________________________________________________
Dense01 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense (Dense)                (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________
None
Memory usage: 222085120 (kb)
11:18:24 episode: 0/1000, score: 16, epsilon: 1.0
Memory usage: 266989568 (kb)
11:18:25 episode: 1/1000, score: 10, epsilon: 1.0
Memory usage: 267022336 (kb)
11:18:25 episode: 2/1000, score: 10, epsilon: 1.0
Memory usage: 288722

Memory usage: 688353280 (kb)
11:19:30 episode: 91/1000, score: 23, epsilon: 0.64
Memory usage: 695607296 (kb)
11:19:30 episode: 92/1000, score: 29, epsilon: 0.64
Memory usage: 702361600 (kb)
11:19:31 episode: 93/1000, score: 29, epsilon: 0.63
Memory usage: 709660672 (kb)
11:19:32 episode: 94/1000, score: 10, epsilon: 0.63
Memory usage: 713105408 (kb)
11:19:33 episode: 95/1000, score: 13, epsilon: 0.63
Memory usage: 717443072 (kb)
11:19:34 episode: 96/1000, score: 25, epsilon: 0.62
Memory usage: 725909504 (kb)
11:19:35 episode: 97/1000, score: 40, epsilon: 0.62
Memory usage: 737583104 (kb)
11:19:38 episode: 98/1000, score: 61, epsilon: 0.62
Memory usage: 756133888 (kb)
11:19:39 episode: 99/1000, score: 31, epsilon: 0.61
Memory usage: 763990016 (kb)
11:19:40 episode: 100/1000, score: 13, epsilon: 0.61
Memory usage: 767545344 (kb)
11:19:40 episode: 101/1000, score: 12, epsilon: 0.61
Memory usage: 771518464 (kb)
11:19:42 episode: 102/1000, score: 31, epsilon: 0.61
Memory usage: 779866112 (

Memory usage: 1686396928 (kb)
11:22:29 episode: 191/1000, score: 26, epsilon: 0.39
Memory usage: 1694425088 (kb)
11:22:30 episode: 192/1000, score: 18, epsilon: 0.39
Memory usage: 1703333888 (kb)
11:22:36 episode: 193/1000, score: 105, epsilon: 0.38
Memory usage: 1744826368 (kb)
11:22:39 episode: 194/1000, score: 32, epsilon: 0.38
Memory usage: 1744826368 (kb)
11:22:43 episode: 195/1000, score: 83, epsilon: 0.38
Memory usage: 1744826368 (kb)
11:22:45 episode: 196/1000, score: 36, epsilon: 0.38
Memory usage: 1744826368 (kb)
11:22:49 episode: 197/1000, score: 41, epsilon: 0.38
Memory usage: 1744826368 (kb)
11:22:52 episode: 198/1000, score: 54, epsilon: 0.37
Memory usage: 1744826368 (kb)
11:22:54 episode: 199/1000, score: 28, epsilon: 0.37
Memory usage: 1744826368 (kb)
11:22:55 episode: 200/1000, score: 24, epsilon: 0.37
Memory usage: 1744826368 (kb)
11:22:59 episode: 201/1000, score: 61, epsilon: 0.37
Memory usage: 1744826368 (kb)
11:23:01 episode: 202/1000, score: 36, epsilon: 0.37
Mem

Memory usage: 3199062016 (kb)
11:29:36 episode: 290/1000, score: 144, epsilon: 0.24
Memory usage: 3199062016 (kb)
11:29:41 episode: 291/1000, score: 87, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:29:50 episode: 292/1000, score: 117, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:30:05 episode: 293/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:30:19 episode: 294/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:30:32 episode: 295/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:30:45 episode: 296/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:30:58 episode: 297/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:31:07 episode: 298/1000, score: 136, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:31:20 episode: 299/1000, score: 199, epsilon: 0.23
Memory usage: 3199062016 (kb)
11:31:33 episode: 300/1000, score: 199, epsilon: 0.22
Memory usage: 3199062016 (kb)
11:31:46 episode: 301/1000, score: 199, epsilon