In [1]:
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN

import os
from collections import deque
import random
import time
import resource

import pdb

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# requires python 3.6
# conda install -c akode gym
import gym


In [2]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self.build_model()
        self.memory = pd.DataFrame(columns=["state", "action", "next_state", "reward", "done"])
        
    def build_model(self,
                    n_hidden_layers=2, 
                    hidden_layer_size=24, 
                    activation='relu',
                    reg_penalty=0.0,
                    dropout=False,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size
        """

        model = Sequential()

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1, 
                                                                                hidden_layer_size, 
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                model.add(Dropout(dropout))

            if i==0: # first layer, specify input shape
                model.add(Dense(input_shape=(state_size,),
                                units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))
            else: #use implicit input shape
                model.add(Dense(units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))

        model.add(Dense(self.action_size, activation='linear'))

        if verbose:
            print(model.summary())

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model
        
    def remember(self, state, action, reward, next_state, done):
        # append in place
        self.memory.loc[self.memory.shape[0]]=[state[0], action, next_state[0], reward, done]
        # TODO: eventually truncate last 2000
        
    def mem_predict_state(row, action):
        state = np.array([[row['state_00'],
                           row['state_01'],
                           row['state_02'],
                           row['state_03'],
                          ]])
        target_f = self.model.predict(state)
        target_f[0][action] = target

        return self.model.predict(state)

    def mem_predict_next_state(row):
        next_state = np.array([[row['next_state_00'],
                                row['next_state_01'],
                                row['next_state_02'],
                                row['next_state_03'],
                               ]])
        return np.amax(self.model.predict(next_state)[0])
    
    def train(self, batch_size):
        # get batch_size observations from memory
        minibatch = self.memory[-2000:].sample(n=batch_size)
        
        # target is our best estimate of value of each action
        X_fit = np.concatenate(minibatch['state'].values)
        X_fit = X_fit.reshape((batch_size, self.state_size))
        Y_pred = self.model.predict(X_fit)

        # we don't just fit model against model's own prediction, that gets us nowhere
        # we improve the target by what we learned about the action we actually took
        # value is reward obtained + predicted value of the observed next state
        minibatch['target_observed'] = minibatch['reward']
        # if done, target is the reward 
        # reward by gym env is only 1 for each timestep of survival
        # but we also added a reward of -10 on failure
        # if not done, add gamma discount rate * Q-value prediction for the observed next state
        not_done = minibatch.loc[minibatch['done'] == False]
        X_observed = np.concatenate(not_done['next_state'].values)
        X_observed = X_observed.reshape((not_done.shape[0], self.state_size))
        y_observed_pred = np.amax(self.model.predict(X_observed), axis=1)
        minibatch.loc[minibatch['done'] == False, 'target_observed'] += self.gamma * y_observed_pred
        # vectorized vlookup - update y_pred column specified by action using target_observed
        np.put_along_axis(Y_pred, 
                          minibatch['action'].astype(int).values.reshape(batch_size,1), 
                          minibatch['target_observed'].values.reshape(batch_size,1),
                          axis=1)
        # fit model against improved target
        self.model.fit(X_fit, Y_pred, epochs=1, batch_size=1, verbose=0)
        
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def load(self, filename):
        with open('%s.json' % filename, 'r') as json_file:
            self.model = model_from_json(json_file.read())
        self.model.load_weights("%s.h5" % filename)

    def save(self, filename):
        # serialize model to JSON
        with open("%s.json" % filename, "w") as json_file:
            json_file.write(self.model.to_json())
        # serialize weights to HDF5
        self.model.save_weights("%s.h5" % filename)


In [3]:
#https://gym.openai.com/envs/CartPole-v0/
env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episodes=1000
output_dir = 'model_output/cartpole/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [4]:
agent = DQNAgent(state_size, action_size)

for e in range(n_episodes):
    print ('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    timesteps = 0
    
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("{} episode: {}/{}, score: {}, epsilon: {:.02}"
                  .format(time.strftime("%H:%M:%S"), e, n_episodes, timesteps, agent.epsilon))
        timesteps +=1
    if len(agent.memory) > batch_size:
        #pdb.set_trace()
        agent.train(batch_size)
    if e % 10 == 0:
        agent.save(output_dir + "model_%.04d" % e)

layer 1 size 24, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 24, relu, reg_penalty 0.00000000, dropout 0.000
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 24)                120       
_________________________________________________________________
Dense01 (Dense)              (None, 24)                600       
_________________________________________________________________
dense (Dense)                (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
None
Memory usage: 221306880 (kb)
18:33:49 episode: 0/1000, score: 23, epsilon: 1.0
Memory usage: 266977280 (kb)
18:33:49 episode: 1/1000, score: 31, epsilon: 1.0
Memory usage: 287985664 (kb)
18:33:51 episode: 2/1000, score: 25, epsilon: 0.99
Memory usage: 289882112

Memory usage: 650272768 (kb)
18:34:44 episode: 91/1000, score: 12, epsilon: 0.64
Memory usage: 653352960 (kb)
18:34:45 episode: 92/1000, score: 9, epsilon: 0.63
Memory usage: 657371136 (kb)
18:34:46 episode: 93/1000, score: 17, epsilon: 0.63
Memory usage: 662507520 (kb)
18:34:48 episode: 94/1000, score: 33, epsilon: 0.63
Memory usage: 670875648 (kb)
18:34:48 episode: 95/1000, score: 15, epsilon: 0.62
Memory usage: 676151296 (kb)
18:34:50 episode: 96/1000, score: 44, epsilon: 0.62
Memory usage: 687869952 (kb)
18:34:51 episode: 97/1000, score: 15, epsilon: 0.62
Memory usage: 693768192 (kb)
18:34:52 episode: 98/1000, score: 49, epsilon: 0.61
Memory usage: 707694592 (kb)
18:34:53 episode: 99/1000, score: 16, epsilon: 0.61
Memory usage: 713842688 (kb)
18:34:54 episode: 100/1000, score: 16, epsilon: 0.61
Memory usage: 720568320 (kb)
18:34:55 episode: 101/1000, score: 21, epsilon: 0.61
Memory usage: 730198016 (kb)
18:34:56 episode: 102/1000, score: 12, epsilon: 0.6
Memory usage: 734982144 (kb

Memory usage: 1463787520 (kb)
18:37:07 episode: 191/1000, score: 59, epsilon: 0.39
Memory usage: 1488232448 (kb)
18:37:10 episode: 192/1000, score: 54, epsilon: 0.38
Memory usage: 1488232448 (kb)
18:37:12 episode: 193/1000, score: 36, epsilon: 0.38
Memory usage: 1488232448 (kb)
18:37:13 episode: 194/1000, score: 22, epsilon: 0.38
Memory usage: 1488363520 (kb)
18:37:15 episode: 195/1000, score: 52, epsilon: 0.38
Memory usage: 1506381824 (kb)
18:37:18 episode: 196/1000, score: 25, epsilon: 0.38
Memory usage: 1514168320 (kb)
18:37:19 episode: 197/1000, score: 33, epsilon: 0.37
Memory usage: 1525174272 (kb)
18:37:20 episode: 198/1000, score: 10, epsilon: 0.37
Memory usage: 1530023936 (kb)
18:37:21 episode: 199/1000, score: 20, epsilon: 0.37
Memory usage: 1539575808 (kb)
18:37:23 episode: 200/1000, score: 35, epsilon: 0.37
Memory usage: 1552343040 (kb)
18:37:25 episode: 201/1000, score: 34, epsilon: 0.37
Memory usage: 1566257152 (kb)
18:37:26 episode: 202/1000, score: 21, epsilon: 0.37
Memo

Memory usage: 2899283968 (kb)
18:42:34 episode: 290/1000, score: 38, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:42:38 episode: 291/1000, score: 70, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:42:45 episode: 292/1000, score: 109, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:42:49 episode: 293/1000, score: 57, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:42:52 episode: 294/1000, score: 50, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:42:59 episode: 295/1000, score: 111, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:43:02 episode: 296/1000, score: 48, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:43:09 episode: 297/1000, score: 117, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:43:16 episode: 298/1000, score: 102, epsilon: 0.23
Memory usage: 2899283968 (kb)
18:43:20 episode: 299/1000, score: 72, epsilon: 0.22
Memory usage: 2899283968 (kb)
18:43:24 episode: 300/1000, score: 58, epsilon: 0.22
Memory usage: 2899283968 (kb)
18:43:28 episode: 301/1000, score: 80, epsilon: 0.22


Memory usage: 3909304320 (kb)
19:05:26 episode: 388/1000, score: 32, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:05:28 episode: 389/1000, score: 10, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:05:29 episode: 390/1000, score: 15, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:05:31 episode: 391/1000, score: 17, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:05:34 episode: 392/1000, score: 33, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:05:40 episode: 393/1000, score: 72, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:06:01 episode: 394/1000, score: 242, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:06:11 episode: 395/1000, score: 121, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:06:38 episode: 396/1000, score: 324, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:06:47 episode: 397/1000, score: 86, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:07:06 episode: 398/1000, score: 122, epsilon: 0.14
Memory usage: 3909304320 (kb)
19:07:22 episode: 399/1000, score: 178, epsilon: 0.14

KeyboardInterrupt: 