In [1]:
# based on Deep Learning Illustrated by Jon Krohn
# https://www.amazon.com/Deep-Learning-Illustrated-Intelligence-Addison-Wesley/dp/0135116694
# in turn based on bit.ly/keonDQN
import os
from collections import deque
import random
import time

import pdb

import numpy as np
import pandas as pd

from tensorflow import keras
from tensorflow.keras.models import Sequential, model_from_json
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# requires python 3.6
# conda install -c akode gym
import gym


In [2]:
class DQNAgent:
    """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

    The Q-function is a small fully connected Keras network that maps a state
    vector of length ``state_size`` to one estimated value per action.
    Transitions are kept in a bounded replay memory and sampled at random for
    training.

    Attributes:
        state_size: length of the state vector fed to the network.
        action_size: number of discrete actions (network outputs).
        gamma: discount factor applied to the predicted next-state value.
        epsilon: current probability of taking a random action.
        epsilon_decay: factor ``epsilon`` is multiplied by after each train().
        epsilon_min: lower bound for ``epsilon``.
        learning_rate: Adam learning rate.
        model: compiled Keras model.
        memory: ``collections.deque`` of
            ``(state, action, reward, next_state, done)`` tuples; once it
            holds ``memory_size`` items the oldest transition is dropped.
    """

    def __init__(self, state_size, action_size, memory_size=2000):
        """Create the agent and build its Q-network.

        Args:
            state_size: length of the state vector.
            action_size: number of discrete actions.
            memory_size: maximum number of transitions kept for replay
                (``None`` means unbounded).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = 0.95
        self.epsilon = 1.0
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        self.learning_rate = 0.0003
        self.model = self.build_model()
        # A bounded deque gives O(1) appends and automatic truncation, unlike
        # growing a DataFrame one row at a time.
        self.memory = deque(maxlen=memory_size)

    def _compile(self, model):
        """Compile ``model`` with MSE loss and Adam at ``self.learning_rate``."""
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def build_model(self,
                    n_hidden_layers=2,
                    hidden_layer_size=32,
                    activation='relu',
                    reg_penalty=0.0,
                    dropout=False,
                    verbose=True
                   ):
        """Return a compiled Keras model mapping a state to action values.

        Args:
            n_hidden_layers: number of hidden Dense layers.
            hidden_layer_size: units in each hidden layer.
            activation: activation used by the hidden layers.
            reg_penalty: L2 kernel regularisation factor for hidden layers.
            dropout: dropout rate inserted between hidden layers (never on the
                inputs); a falsy value disables dropout.
            verbose: print per-layer settings and the model summary.

        Returns:
            A model whose input is an array of size ``state_size`` and whose
            output is an array of ``action_size`` linear action values.
        """
        model = Sequential()

        for i in range(n_hidden_layers):
            if verbose:
                print("layer %d size %d, %s, reg_penalty %.8f, dropout %.3f" % (i + 1,
                                                                                hidden_layer_size,
                                                                                activation,
                                                                                reg_penalty,
                                                                                dropout,
                                                                               ))
            # add dropout, but not on inputs, only between hidden layers
            if i and dropout:
                model.add(keras.layers.Dropout(dropout))

            layer_kwargs = dict(units=hidden_layer_size,
                                activation=activation,
                                kernel_initializer=keras.initializers.glorot_uniform(),
                                kernel_regularizer=keras.regularizers.l2(reg_penalty),
                                name="Dense%02d" % i)
            if i == 0:
                # Only the first layer needs an explicit input shape. Use the
                # agent's own state size rather than a notebook-level global.
                layer_kwargs['input_shape'] = (self.state_size,)
            model.add(Dense(**layer_kwargs))

        output_kwargs = {}
        if n_hidden_layers < 1:
            # With no hidden layers the output layer is the first layer.
            output_kwargs['input_shape'] = (self.state_size,)
        model.add(Dense(self.action_size, activation='linear', **output_kwargs))

        if verbose:
            # summary() prints by itself and returns None, so do not print() it.
            model.summary()

        return self._compile(model)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in replay memory.

        ``state`` and ``next_state`` are copied and flattened to 1-D, so both
        ``(1, state_size)`` and ``(state_size,)`` inputs are accepted.
        """
        self.memory.append((np.array(state, dtype=np.float32).reshape(-1),
                            action,
                            reward,
                            np.array(next_state, dtype=np.float32).reshape(-1),
                            bool(done)))

    def mem_predict_state(self, row, action, target=None):
        """Predict action values for a state stored in flattened columns.

        Args:
            row: mapping with keys ``'state_00'``, ``'state_01'``, ... (one per
                state component). NOTE(review): the agent's own memory does not
                use this flattened layout; presumably ``row`` comes from an
                externally flattened DataFrame - confirm against callers.
            action: index of the action whose value ``target`` replaces.
            target: if given, overwrites the predicted value of ``action``.

        Returns:
            Array of shape ``(1, action_size)``.
        """
        state = np.array([[row['state_%02d' % i] for i in range(self.state_size)]])
        target_f = np.array(self.model.predict(state, verbose=0))
        if target is not None:
            target_f[0][action] = target
        return target_f

    def mem_predict_next_state(self, row):
        """Return the highest predicted action value for the row's next state.

        ``row`` is a mapping with keys ``'next_state_00'``, ``'next_state_01'``,
        ... (one per state component).
        """
        next_state = np.array([[row['next_state_%02d' % i]
                                for i in range(self.state_size)]])
        return np.amax(self.model.predict(next_state, verbose=0)[0])

    def train(self, batch_size):
        """Fit the network on one random minibatch of remembered transitions.

        The fit target is the network's own prediction for each sampled state,
        except for the action actually taken: that entry is replaced by
        ``reward`` for terminal transitions, or by
        ``reward + gamma * max(predicted next-state values)`` otherwise.
        Afterwards ``epsilon`` is decayed, never dropping below ``epsilon_min``.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        X_fit = np.array(states, dtype=np.float32).reshape((batch_size, self.state_size))
        X_next = np.array(next_states, dtype=np.float32).reshape((batch_size, self.state_size))
        action_idx = np.array(actions, dtype=np.int64).reshape((batch_size, 1))
        target_observed = np.array(rewards, dtype=np.float32)
        not_done = ~np.array(dones, dtype=bool)

        # target starts as the current prediction of the value of each action
        Y_pred = np.array(self.model.predict(X_fit, verbose=0))

        # we don't just fit against our own prediction, that gets us nowhere:
        # improve the target with what was observed for the action taken.
        # Skip the next-state prediction when every sample is terminal, since
        # there is nothing to predict on.
        if not_done.any():
            next_values = self.model.predict(X_next[not_done], verbose=0)
            target_observed[not_done] += self.gamma * np.amax(next_values, axis=1)

        np.put_along_axis(Y_pred,
                          action_idx,
                          target_observed.reshape((batch_size, 1)),
                          axis=1)
        # fit model against improved target
        self.model.fit(X_fit, Y_pred, epochs=1, verbose=0)

        # decay exploration, clamped so it never falls below the floor
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def act(self, state):
        """Choose an action for ``state`` (shape ``(1, state_size)``).

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest predicted value. The result is
        always a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))

    def load(self, filename):
        """Load architecture from ``<filename>.json`` and weights from ``<filename>.h5``.

        The restored model is re-compiled so train() can be called right away;
        optimizer state is not part of the saved files and starts fresh.
        """
        with open('%s.json' % filename, 'r') as json_file:
            self.model = model_from_json(json_file.read())
        self.model.load_weights("%s.h5" % filename)
        self._compile(self.model)

    def save(self, filename):
        """Write architecture to ``<filename>.json`` and weights to ``<filename>.h5``."""
        # serialize model to JSON
        with open("%s.json" % filename, "w") as json_file:
            json_file.write(self.model.to_json())
        # serialize weights to HDF5
        self.model.save_weights("%s.h5" % filename)


In [3]:
# https://gym.openai.com/envs/CartPole-v0/
env = gym.make('CartPole-v0')

# Network dimensions are read from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Training configuration.
batch_size = 32
n_episodes = 1000

# Directory that receives the periodic model checkpoints.
# exist_ok=True replaces the check-then-create pattern and is safe on re-run.
output_dir = 'model_output/cartpole/'
os.makedirs(output_dir, exist_ok=True)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


  result = entry_point.load(False)


In [4]:
agent = DQNAgent(state_size, action_size)

# NOTE(review): train() runs once per episode, i.e. one minibatch fit per
# episode. The recorded run shows scores settling around 8-10 as epsilon
# decays - consider training more often if the agent fails to improve.
try:
    for e in range(n_episodes):
        # The agent expects states as a batch of one: shape (1, state_size).
        state = np.reshape(env.reset(), [1, state_size])
        done = False
        timesteps = 0

        while not done:
            env.render()
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # Penalise the step that ends the episode.
            reward = reward if not done else -10
            next_state = next_state.reshape([1, state_size])
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            timesteps += 1

        # Report after counting the final step, so score equals the number of
        # steps taken in the episode.
        print("{} episode: {}/{}, score: {}, epsilon: {:.02}"
              .format(time.strftime("%H:%M:%S"), e, n_episodes, timesteps, agent.epsilon))

        if len(agent.memory) > batch_size:
            agent.train(batch_size)
        # Checkpoint every 10th episode.
        if e % 10 == 0:
            agent.save(output_dir + "model_%.04d" % e)
finally:
    # Release the render window even when the run is interrupted.
    env.close()

layer 1 size 32, relu, reg_penalty 0.00000000, dropout 0.000
layer 2 size 32, relu, reg_penalty 0.00000000, dropout 0.000
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Dense00 (Dense)              (None, 32)                160       
_________________________________________________________________
Dense01 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense (Dense)                (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________
None
16:04:24 episode: 0/1000, score: 12, epsilon: 1.0
16:04:24 episode: 1/1000, score: 27, epsilon: 1.0
16:04:26 episode: 2/1000, score: 29, epsilon: 0.99
16:04:26 episode: 3/1000, score: 14, epsilon: 0.99
16:04:27 episode: 4/1000, score: 12, epsilon: 0.99
16:0

16:07:08 episode: 141/1000, score: 9, epsilon: 0.5
16:07:09 episode: 142/1000, score: 13, epsilon: 0.49
16:07:10 episode: 143/1000, score: 16, epsilon: 0.49
16:07:10 episode: 144/1000, score: 8, epsilon: 0.49
16:07:11 episode: 145/1000, score: 12, epsilon: 0.49
16:07:12 episode: 146/1000, score: 26, epsilon: 0.48
16:07:13 episode: 147/1000, score: 18, epsilon: 0.48
16:07:14 episode: 148/1000, score: 15, epsilon: 0.48
16:07:15 episode: 149/1000, score: 9, epsilon: 0.48
16:07:15 episode: 150/1000, score: 11, epsilon: 0.47
16:07:16 episode: 151/1000, score: 8, epsilon: 0.47
16:07:17 episode: 152/1000, score: 10, epsilon: 0.47
16:07:17 episode: 153/1000, score: 9, epsilon: 0.47
16:07:18 episode: 154/1000, score: 8, epsilon: 0.46
16:07:19 episode: 155/1000, score: 10, epsilon: 0.46
16:07:20 episode: 156/1000, score: 14, epsilon: 0.46
16:07:20 episode: 157/1000, score: 9, epsilon: 0.46
16:07:21 episode: 158/1000, score: 13, epsilon: 0.46
16:07:22 episode: 159/1000, score: 15, epsilon: 0.45
1

16:09:57 episode: 297/1000, score: 14, epsilon: 0.23
16:09:59 episode: 298/1000, score: 10, epsilon: 0.23
16:10:00 episode: 299/1000, score: 13, epsilon: 0.22
16:10:01 episode: 300/1000, score: 14, epsilon: 0.22
16:10:02 episode: 301/1000, score: 8, epsilon: 0.22
16:10:03 episode: 302/1000, score: 12, epsilon: 0.22
16:10:04 episode: 303/1000, score: 12, epsilon: 0.22
16:10:06 episode: 304/1000, score: 11, epsilon: 0.22
16:10:06 episode: 305/1000, score: 11, epsilon: 0.22
16:10:07 episode: 306/1000, score: 9, epsilon: 0.22
16:10:08 episode: 307/1000, score: 11, epsilon: 0.22
16:10:09 episode: 308/1000, score: 13, epsilon: 0.21
16:10:10 episode: 309/1000, score: 9, epsilon: 0.21
16:10:11 episode: 310/1000, score: 14, epsilon: 0.21
16:10:12 episode: 311/1000, score: 11, epsilon: 0.21
16:10:13 episode: 312/1000, score: 11, epsilon: 0.21
16:10:18 episode: 313/1000, score: 11, epsilon: 0.21
16:10:19 episode: 314/1000, score: 10, epsilon: 0.21
16:10:21 episode: 315/1000, score: 13, epsilon: 0

16:12:35 episode: 454/1000, score: 8, epsilon: 0.1
16:12:36 episode: 455/1000, score: 9, epsilon: 0.1
16:12:36 episode: 456/1000, score: 9, epsilon: 0.1
16:12:37 episode: 457/1000, score: 9, epsilon: 0.1
16:12:38 episode: 458/1000, score: 11, epsilon: 0.1
16:12:39 episode: 459/1000, score: 9, epsilon: 0.1
16:12:40 episode: 460/1000, score: 8, epsilon: 0.1
16:12:41 episode: 461/1000, score: 9, epsilon: 0.1
16:12:42 episode: 462/1000, score: 11, epsilon: 0.099
16:12:42 episode: 463/1000, score: 7, epsilon: 0.099
16:12:43 episode: 464/1000, score: 9, epsilon: 0.098
16:12:44 episode: 465/1000, score: 9, epsilon: 0.098
16:12:45 episode: 466/1000, score: 8, epsilon: 0.097
16:12:46 episode: 467/1000, score: 9, epsilon: 0.097
16:12:47 episode: 468/1000, score: 9, epsilon: 0.096
16:12:48 episode: 469/1000, score: 10, epsilon: 0.096
16:12:48 episode: 470/1000, score: 8, epsilon: 0.095
16:12:49 episode: 471/1000, score: 8, epsilon: 0.095
16:12:50 episode: 472/1000, score: 9, epsilon: 0.094
16:12:

16:15:00 episode: 609/1000, score: 9, epsilon: 0.047
16:15:00 episode: 610/1000, score: 8, epsilon: 0.047
16:15:01 episode: 611/1000, score: 8, epsilon: 0.047
16:15:02 episode: 612/1000, score: 8, epsilon: 0.047
16:15:03 episode: 613/1000, score: 10, epsilon: 0.047
16:15:04 episode: 614/1000, score: 9, epsilon: 0.046
16:15:05 episode: 615/1000, score: 8, epsilon: 0.046
16:15:06 episode: 616/1000, score: 8, epsilon: 0.046
16:15:07 episode: 617/1000, score: 11, epsilon: 0.046
16:15:08 episode: 618/1000, score: 8, epsilon: 0.045
16:15:09 episode: 619/1000, score: 10, epsilon: 0.045
16:15:10 episode: 620/1000, score: 8, epsilon: 0.045
16:15:10 episode: 621/1000, score: 8, epsilon: 0.045
16:15:12 episode: 622/1000, score: 10, epsilon: 0.044
16:15:12 episode: 623/1000, score: 9, epsilon: 0.044
16:15:13 episode: 624/1000, score: 8, epsilon: 0.044
16:15:14 episode: 625/1000, score: 10, epsilon: 0.044
16:15:15 episode: 626/1000, score: 10, epsilon: 0.044
16:15:16 episode: 627/1000, score: 9, ep

KeyboardInterrupt: 

In [None]:
# Sanity check: one observation wrapped in an outer list is a 2-D array, so
# this displays (1, 4) - a batch containing a single 4-component state.
np.array([[-0.03049697,  0.01529634, -0.02727538,  0.02191796]]).shape


In [None]:
# Scratch experiment: build a two-row frame, then pack each row's scalar
# columns a, b, c into one array stored in a new 'array' column.
z = pd.DataFrame({'a': [1, 11], 'b': [2,22], 'c': [3, 33]})
# NOTE(review): the lambda returns a 2-D ndarray per row; how DataFrame.apply
# packs such results may depend on the pandas version - confirm the column
# really holds one (1, 3) array per row.
z['array'] = z.apply(lambda row: np.array([[row['a'],row['b'],row['c']]]), axis=1)
z

In [None]:
# Lookup table: column 0 holds the key, the remaining columns hold its values.
lookup = np.array([
    [1.0, 3.14, 4.14],
    [2.0, 2.71818, 3.7],
    [3.0, 42.0, 43.0],
])

# Rows whose first column is a key into the lookup table.
a = np.array([
    [1, 11],
    [1, 12],
    [2, 21],
    [3, 31],
])

# Map each key to the index of its row in `lookup`.
mapping = {key: row_idx for row_idx, key in enumerate(lookup[:, 0])}
mapping

In [None]:
# Values stored for key 1: find its row through `mapping`, then drop the key column.
lookup[mapping[1],1:]


In [None]:
# Append to every row of `a` the lookup values that belong to the key in its
# first column (one fancy-indexing pass instead of stacking row by row).
np.concatenate(
    (a, lookup[[mapping[key] for key in a[:, 0]], 1:]),
    axis=1,
)

In [None]:
# Full slice over both axes: displays every row and column of `lookup`.
lookup[:,:]

In [None]:
# 3x3 integer matrix used by the argmax / put_along_axis experiments.
# NOTE: this rebinds `a`, replacing the array used in the lookup cells above.
a = np.array([
    [10, 30, 20],
    [60, 40, 50],
    [80, 90, 100],
])
a

In [None]:
# Column index of each row's maximum, kept as an (n_rows, 1) column so it can
# serve as the index array for the along-axis functions on axis 1.
ai = np.argmax(a, axis=1)[:, np.newaxis]
ai

In [None]:
# Overwrite, in place, the element of each row selected by `ai` with -1, -2
# and -3 respectively. np.put_along_axis returns None, so `a` is displayed
# on the next line to show the result.
np.put_along_axis(a, ai, np.array([[-1], [-2], [-3]]), axis=1)
a

In [None]:
# Display `a` again: np.put_along_axis modified it in place, so the change persists.
a
