In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import base64, io, os, time, gym
from tensorflow.keras.layers import Dense

In [2]:
import os
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam, RMSprop
def OurModel(input_shape, action_space):
    """Build and compile the fully connected Q-network.

    Args:
        input_shape: Shape tuple of a single observation, handed to ``Input``.
        action_space: Number of output units (one Q-value per action).

    Returns:
        The compiled Keras ``Model``. The model summary is printed as a
        side effect before returning.
    """
    X_input = Input(input_shape)

    # 'Dense' is the basic form of a neural network layer
    # Hidden Layer with 512 nodes fed by the input layer. The shape is already
    # fixed by ``Input`` above, so no ``input_shape`` is passed to the layer.
    X = Dense(512, activation="relu", kernel_initializer='he_uniform')(X_input)

    # Hidden layer with 256 nodes
    X = Dense(256, activation="relu", kernel_initializer='he_uniform')(X)

    # Hidden layer with 64 nodes
    X = Dense(64, activation="relu", kernel_initializer='he_uniform')(X)

    # Output Layer with one linear unit per action
    X = Dense(action_space, activation="linear", kernel_initializer='he_uniform')(X)

    model = Model(inputs=X_input, outputs=X, name='CartPole_DQN_model')
    # ``learning_rate`` replaces the deprecated ``lr`` keyword.
    # NOTE(review): "accuracy" is kept for backward compatibility, but the
    # network is trained with an MSE loss on Q-values — confirm it is wanted.
    model.compile(
        loss="mse",
        optimizer=RMSprop(learning_rate=0.00025, rho=0.95, epsilon=0.01),
        metrics=["accuracy"],
    )

    model.summary()
    return model

class DQNAgent:
    """DQN agent for the CartPole-v1 environment.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily from a single Q-network, and fits that network on
    random minibatches drawn from the memory.
    """

    def __init__(self):
        """Create the environment, the hyper-parameters and the Q-network."""
        self.env = gym.make('CartPole-v1')
        # by default, CartPole-v1 has max episode steps = 500
        self.state_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.EPISODES = 1000
        self.memory = deque(maxlen=2000)

        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.001
        self.epsilon_decay = 0.999
        self.batch_size = 64
        self.train_start = 1000

        # create main model
        self.model = OurModel(input_shape=(self.state_size,), action_space = self.action_size)

    def remember(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once enough data is collected.

        Epsilon is multiplied by ``epsilon_decay`` on every call made while the
        memory holds more than ``train_start`` transitions, until it is no
        longer above ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))
        if len(self.memory) > self.train_start:
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay

    def act(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        if np.random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state))

    def replay(self):
        """Fit the Q-network on one random minibatch from the replay memory.

        Does nothing while the memory holds fewer than ``train_start``
        transitions. All arrays are sized by the number of transitions that
        were actually sampled, so a memory smaller than ``batch_size`` no
        longer causes an out-of-range index.
        """
        if len(self.memory) < self.train_start:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        n_samples = len(minibatch)
        if n_samples == 0:
            return

        state = np.zeros((n_samples, self.state_size))
        next_state = np.zeros((n_samples, self.state_size))
        action, reward, done = [], [], []

        # Unpack the sampled transitions into per-field containers
        for i, (s, a, r, s_next, d) in enumerate(minibatch):
            state[i] = s
            action.append(a)
            reward.append(r)
            next_state[i] = s_next
            done.append(d)

        action = np.asarray(action, dtype=int)
        reward = np.asarray(reward, dtype=float)
        done = np.asarray(done, dtype=bool)

        # do batch prediction to save speed
        target = self.model.predict(state)
        target_next = self.model.predict(next_state)

        # Standard DQN target for the action that was taken:
        #   reward                                  if the transition ended the episode
        #   reward + gamma * max_a' Q(s', a')       otherwise
        best_next = np.amax(target_next, axis=1)
        target[np.arange(n_samples), action] = np.where(
            done, reward, reward + self.gamma * best_next
        )

        # Train the Neural Network with batches
        self.model.fit(state, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        """Replace the current model with the one stored at ``name``."""
        self.model = load_model(name)

    def save(self, name):
        """Save the current model to ``name``."""
        self.model.save(name)

    def run(self):
        """Train for up to ``EPISODES`` episodes.

        Training stops early, after saving the model to ``cartpole-dqn.h5``,
        as soon as an episode lasts for the environment's full step limit.
        """
        # One source of truth for the step limit: used both for the reward
        # shaping and for the "solved" check below (previously hard-coded 500).
        max_steps = self.env._max_episode_steps
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.state_size])
                # Episodes that end before the final allowed step are
                # penalised; ending on the last allowed step keeps the
                # environment's own reward.
                if done and i != max_steps - 1:
                    reward = -100
                self.remember(state, action, reward, next_state, done)
                state = next_state
                i += 1
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.EPISODES, i, self.epsilon))
                    if i == max_steps:
                        print("Saving trained model as cartpole-dqn.h5")
                        self.save("cartpole-dqn.h5")
                        return
                self.replay()

    def test(self):
        """Load ``cartpole-dqn.h5`` and play ``EPISODES`` greedy episodes with rendering."""
        self.load("cartpole-dqn.h5")
        for e in range(self.EPISODES):
            state = self.env.reset()
            state = np.reshape(state, [1, self.state_size])
            done = False
            i = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.state_size])
                i += 1
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, i))
                    break

# Entry point: constructing the agent also builds the model and prints its
# summary; run() then starts the training loop.
if __name__ == "__main__":
    agent = DQNAgent()
    agent.run()


Model: "CartPole_DQN_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 512)               2560      
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dense_3 (Dense)             (None, 2)                 130       
                                                                 
Total params: 150,466
Trainable params: 150,466
Non-trainable params: 0
_________________________________________________________________


  super(RMSprop, self).__init__(name, **kwargs)


episode: 0/1000, score: 13, e: 1.0
episode: 1/1000, score: 14, e: 1.0
episode: 2/1000, score: 23, e: 1.0
episode: 3/1000, score: 52, e: 1.0
episode: 4/1000, score: 8, e: 1.0
episode: 5/1000, score: 22, e: 1.0
episode: 6/1000, score: 16, e: 1.0
episode: 7/1000, score: 16, e: 1.0
episode: 8/1000, score: 19, e: 1.0
episode: 9/1000, score: 21, e: 1.0
episode: 10/1000, score: 15, e: 1.0
episode: 11/1000, score: 11, e: 1.0
episode: 12/1000, score: 43, e: 1.0
episode: 13/1000, score: 12, e: 1.0
episode: 14/1000, score: 17, e: 1.0
episode: 15/1000, score: 13, e: 1.0
episode: 16/1000, score: 13, e: 1.0
episode: 17/1000, score: 29, e: 1.0
episode: 18/1000, score: 23, e: 1.0
episode: 19/1000, score: 24, e: 1.0
episode: 20/1000, score: 12, e: 1.0
episode: 21/1000, score: 17, e: 1.0
episode: 22/1000, score: 11, e: 1.0
episode: 23/1000, score: 61, e: 1.0
episode: 24/1000, score: 20, e: 1.0
episode: 25/1000, score: 12, e: 1.0
episode: 26/1000, score: 22, e: 1.0
episode: 27/1000, score: 29, e: 1.0
epi

KeyboardInterrupt: ignored

In [11]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2

Collecting tensorflow==2.3.0
  Downloading tensorflow-2.3.0-cp37-cp37m-manylinux2010_x86_64.whl (320.4 MB)
[K     |████████████████████████████████| 320.4 MB 50 kB/s 
Collecting numpy<1.19.0,>=1.16.0
  Downloading numpy-1.18.5-cp37-cp37m-manylinux1_x86_64.whl (20.1 MB)
[K     |████████████████████████████████| 20.1 MB 1.3 MB/s 
[?25hCollecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
[K     |████████████████████████████████| 459 kB 53.5 MB/s 
Collecting gast==0.3.3
  Downloading gast-0.3.3-py2.py3-none-any.whl (9.7 kB)
Collecting h5py<2.11.0,>=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 40.0 MB/s 
Installing collected packages: numpy, tensorflow-estimator, h5py, gast, tensorflow
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:
      Successfully uninstalled numpy-1.21.5
  Attemptin

Collecting keras-rl2
  Downloading keras_rl2-1.0.5-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 626 kB/s 
Installing collected packages: keras-rl2
Successfully installed keras-rl2-1.0.5


In [47]:
# Environment for the keras-rl agent (note: CartPole-v0 here, whereas the
# hand-written agent above uses CartPole-v1).
env = gym.make('CartPole-v0')
# First dimension of the observation shape and number of discrete actions;
# both are passed to build_model / build_agent below.
states = env.observation_space.shape[0]
actions = env.action_space.n
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam

In [52]:
def build_model(states, actions):
    """Assemble the small feed-forward network used by the keras-rl agent.

    A ``(1, states)`` input is flattened, passed through three ReLU layers of
    16, 12 and 8 units, and mapped to ``actions`` linear outputs.

    Args:
        states: Length of one observation vector.
        actions: Number of output units.

    Returns:
        The (uncompiled) ``Sequential`` model.
    """
    hidden_units = (16, 12, 8)

    net = Sequential()
    net.add(Flatten(input_shape=(1, states)))
    for width in hidden_units:
        net.add(Dense(width, activation='relu'))
    net.add(Dense(actions, activation='linear'))
    return net

In [53]:
# Instantiate the network for this environment's observation/action sizes
# and print its layer summary.
model = build_model(states, actions)
model.summary()
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_8 (Flatten)         (None, 4)                 0         
                                                                 
 dense_33 (Dense)            (None, 16)                80        
                                                                 
 dense_34 (Dense)            (None, 12)                204       
                                                                 
 dense_35 (Dense)            (None, 8)                 104       
                                                                 
 dense_36 (Dense)            (None, 2)                 18        
                                                                 
Total params: 406
Trainable params: 406
Non-trainable params: 0
_________________________________________________________________


In [54]:
def build_agent(model, actions):
    """Wrap ``model`` in a keras-rl DQN agent.

    Args:
        model: Keras model producing one output per action.
        actions: Number of actions, forwarded as ``nb_actions``.

    Returns:
        The (uncompiled) keras-rl ``DQNAgent`` using a Boltzmann Q policy and
        a sequential memory of 50000 entries with window length 1.
    """
    # Imported locally under an alias: this notebook also defines its own
    # ``DQNAgent`` class, so the global name refers to whichever cell ran
    # last. The alias makes this function independent of execution order.
    from rl.agents import DQNAgent as KerasRLDQNAgent

    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = KerasRLDQNAgent(model=model, memory=memory, policy=policy,
                          nb_actions=actions, nb_steps_warmup=100,
                          target_model_update=1e-2)
    return dqn

In [56]:
# Build the keras-rl agent around the model, compile it with Adam
# (learning rate 1e-3, tracking MAE) and train for 50000 steps with
# visualisation switched off.
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)


  updates=self.state_updates,


60 episodes - episode_reward: 164.733 [14.000, 200.000] - loss: 10.490 - mae: 58.230 - mean_q: 116.532

Interval 2 (10000 steps performed)
50 episodes - episode_reward: 198.640 [169.000, 200.000] - loss: 11.490 - mae: 41.929 - mean_q: 83.833

Interval 3 (20000 steps performed)
52 episodes - episode_reward: 194.577 [55.000, 200.000] - loss: 14.928 - mae: 46.821 - mean_q: 93.093

Interval 4 (30000 steps performed)
56 episodes - episode_reward: 179.464 [47.000, 200.000] - loss: 15.428 - mae: 46.001 - mean_q: 91.667

Interval 5 (40000 steps performed)
done, took 727.973 seconds


<keras.callbacks.History at 0x7fb9b20e9b90>

In [57]:
# Evaluate the trained agent for 15 episodes with visualisation off. The
# return value is bound to `_`, presumably to keep it out of the cell output.
_ = dqn.test(env, nb_episodes=15, visualize=False)


Testing for 15 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 200.000, steps: 200
Episode 10: reward: 200.000, steps: 200
Episode 11: reward: 200.000, steps: 200
Episode 12: reward: 200.000, steps: 200
Episode 13: reward: 200.000, steps: 200
Episode 14: reward: 191.000, steps: 191
Episode 15: reward: 200.000, steps: 200


In [58]:
# Persist the trained agent's weights, replacing any existing file of that name.
dqn.save_weights('dqn_weights.h5f', overwrite=True)
