In [1]:
# based on code from Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN

import os
from collections import deque
import random
import time

# to track down memory leak
import resource
from pympler import tracker

import pdb

import numpy as np

from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# requires python 3.6
# conda install -c akode gym
import gym


In [6]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # double-ended queue
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self.build_model()
        
        self.memory_tracker = tracker.SummaryTracker()
        
    def build_model(self,
                    n_hidden_layers=2, 
                    hidden_layer_size=32, 
                    activation='relu',
                    reg_penalty=0.0,
                    dropout=False,
                    verbose=True
                   ):
        """return keras NN model per inputs
        input is a state - array of size state_size
        output is an array of action values - array of size action_size
        """

        model = Sequential()

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1, 
                                                                                hidden_layer_size, 
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                model.add(Dropout(dropout))

            if i==0: # first layer, specify input shape
                model.add(Dense(input_shape=(state_size,),
                                units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))
            else: #use implicit input shape
                model.add(Dense(units = hidden_layer_size, 
                                activation = activation,
                                kernel_initializer = keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name = "Dense%02d" % i))

        model.add(Dense(self.action_size, activation='linear', name="Output"))

        if verbose:
            print(model.summary())

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

        return model
        
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
    
    def train(self, batch_size):
        # get batch_size observations from memory
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # target is reward plus current Q prediction of value of action
            target_fit = self.model.predict(state)
            # but we don't want to fit against our own prediction
            # we improve the target by what we observed about the action we took
            target_actual = reward
            if not done:
                # add discount factor * value of predicted next state
                # self.model.predict(next_state).max()
                target_actual += self.gamma * np.amax(self.model.predict(next_state)[0])
            target_fit[0][action] = target_actual
            self.model.fit(state, target_fit, epochs=1, verbose=0)
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            
        # self.memory_tracker.print_diff()

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])
    
    def load(self, filename):
        with open('%s.json' % filename, 'r') as json_file:
            self.model = model_from_json(json_file.read())
        self.model.load_weights("%s.h5" % filename)

    def save(self, filename):
        # serialize model to JSON
        with open("%s.json" % filename, "w") as json_file:
            json_file.write(self.model.to_json())
        # serialize weights to HDF5
        self.model.save_weights("%s.h5" % filename)


In [7]:
#https://gym.openai.com/envs/CartPole-v0/
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
batch_size = 32
n_episodes=1000
output_dir = 'model_output/cartpole/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [8]:
agent = DQNAgent(state_size, action_size)

for e in range(n_episodes):
    print ('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    timesteps = 0
    
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = next_state.reshape([1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("{} episode: {}/{}, score: {}, epsilon: {:.02}"
                  .format(time.strftime("%H:%M:%S"), e, n_episodes, timesteps, agent.epsilon))
        timesteps +=1
            
    if len(agent.memory) > batch_size:
        #pdb.set_trace()
        agent.train(batch_size)
    if e % 10 == 0:
        agent.save(output_dir + "model_%.04d" % e)

layer 1 size 32, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 32, relu, reg_penalty 0.00000000, dropout 0.000
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 32)                160       
_________________________________________________________________
Dense01 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________
None
18:50:46 episode: 0/1000, score: 37, epsilon: 1.0
Memory usage: 696397824 (kb)
                                                           types |   # objects |   total size
                                                   <class 'tuple |      177197 |  

                                                           types |   # objects |   total size
                                                   <class 'tuple |      148053 |     12.41 MB
                                                     <class 'int |      216599 |      5.78 MB
                                                    <class 'dict |       13317 |      2.95 MB
                                                    <class 'list |       19217 |      2.60 MB
                                                     <class 'set |        1395 |    305.16 KB
                                 <class 'collections.OrderedDict |         837 |    257.20 KB
                                       <class 'collections.deque |         279 |    172.20 KB
  <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2671 |    146.07 KB
               <class 'tensorflow.python.framework.ops.Operation |        2671 |    146.07 KB
                  <class 'tensorflow.python.framework.ops.Te

                                                           types |   # objects |   total size
                                                   <class 'tuple |      152544 |     12.78 MB
                                                     <class 'int |      223232 |      5.96 MB
                                                    <class 'dict |       13728 |      3.04 MB
                                                    <class 'list |       19808 |      2.68 MB
                                                     <class 'set |        1440 |    315.00 KB
                                 <class 'collections.OrderedDict |         864 |    265.50 KB
                                       <class 'collections.deque |         288 |    178.27 KB
  <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2752 |    150.50 KB
               <class 'tensorflow.python.framework.ops.Operation |        2752 |    150.50 KB
                  <class 'tensorflow.python.framework.ops.Te

                                                           types |   # objects |   total size
                                                   <class 'tuple |      149550 |     12.53 MB
                                                     <class 'int |      218810 |      5.84 MB
                                                    <class 'dict |       13454 |      3.12 MB
                                                    <class 'list |       19414 |      2.63 MB
                                                     <class 'set |        1410 |    308.44 KB
                                 <class 'collections.OrderedDict |         846 |    259.97 KB
                                       <class 'collections.deque |         282 |    174.05 KB
  <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2698 |    147.55 KB
               <class 'tensorflow.python.framework.ops.Operation |        2698 |    147.55 KB
                  <class 'tensorflow.python.framework.ops.Te

                                                               types |   # objects |   total size
                                                       <class 'tuple |      155538 |     13.04 MB
                                                         <class 'int |      227654 |      6.08 MB
                                                        <class 'dict |       14002 |      3.10 MB
                                                        <class 'list |       20202 |      2.74 MB
                                                         <class 'set |        1470 |    321.56 KB
                                     <class 'collections.OrderedDict |         882 |    271.03 KB
                                           <class 'collections.deque |         294 |    181.45 KB
      <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2806 |    153.45 KB
                   <class 'tensorflow.python.framework.ops.Operation |        2806 |    153.45 KB
                    

                                                           types |   # objects |   total size
                                                   <class 'tuple |      151047 |     12.66 MB
                                                     <class 'int |      221021 |      5.90 MB
                                                    <class 'dict |       13591 |      3.01 MB
                                                    <class 'list |       19611 |      2.66 MB
                                                     <class 'set |        1425 |    311.72 KB
                                 <class 'collections.OrderedDict |         855 |    262.73 KB
                                       <class 'collections.deque |         285 |    176.41 KB
  <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2725 |    149.02 KB
               <class 'tensorflow.python.framework.ops.Operation |        2725 |    149.02 KB
                  <class 'tensorflow.python.framework.ops.Te

19:23:34 episode: 30/1000, score: 10, epsilon: 0.86
Memory usage: 2923421696 (kb)
                                                               types |   # objects |   total size
                                                       <class 'tuple |      154041 |     12.91 MB
                                                         <class 'int |      225443 |      6.02 MB
                                                        <class 'dict |       13865 |      3.07 MB
                                                        <class 'list |       20005 |      2.71 MB
                                                         <class 'set |        1455 |    318.28 KB
                                     <class 'collections.OrderedDict |         873 |    268.27 KB
                                           <class 'collections.deque |         291 |    179.60 KB
      <class 'tensorflow.python.pywrap_tensorflow_internal.TF_Output |        2779 |    151.98 KB
                   <class 'tensorflo

KeyboardInterrupt: 