## Import libraries

In [1]:
import gym
from gym import wrappers
from IPython.display import HTML
import glob
import os
import random
import numpy as np
from collections import deque
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


## Set directory

In [2]:
# Make the project folder the working directory. The Monitor wrappers created
# in later cells are given relative directories ('random', 'training',
# 'optimal'), so their recordings are written relative to this location.
os.chdir("/home/devin/notebooks/cartpole-ddqn")

## Initialize gym environment and monitor recording

In [3]:
# Build CartPole-v1 and wrap it in one step with a Monitor whose output
# directory is 'random'.
env = wrappers.Monitor(
    gym.make('CartPole-v1'),
    directory = 'random',
    force = True,
)

In [4]:
# Show the observation space first, then the action space.
for space in (env.observation_space, env.action_space):
    print(space)

Box(4,)
Discrete(2)


## Record episode of random actions

In [5]:
# Play one episode (capped at 500 steps) with uniformly sampled actions,
# rendering every frame, and report how many steps it lasted.
state = env.reset()
for steps_taken in range(1, 501):
    env.render()
    state, reward, done, info = env.step(env.action_space.sample())
    if not done:
        continue
    print("Episode finished after {} timesteps".format(steps_taken))
    break

Episode finished after 15 timesteps


## Close environment

In [6]:
# Close the monitored environment before looking for its recording in the next
# cell (presumably this is what finalizes the video file -- TODO confirm).
env.close()

## Get episode recording path

In [7]:
# Locate the recording of the random-action episode.
#
# The folder is searched by path instead of chdir-ing into it and back, so the
# working directory is never touched and a failure cannot strand the kernel in
# the recordings folder.
recording_dir = "/home/devin/notebooks/cartpole-ddqn/random"
# glob returns matches in arbitrary order; sort so the choice is deterministic.
recordings = sorted(glob.glob(os.path.join(recording_dir, "*.mp4")))
if not recordings:
    # Fail here with a clear message instead of a NameError on `vid_dir` later.
    raise FileNotFoundError("no .mp4 recording found in " + recording_dir)
# Keep only the file name: the viewer cell prepends its own folder prefix.
vid_dir = os.path.basename(recordings[-1])

In [8]:
# Bare expression: the notebook displays the recording's file name as the cell output.
vid_dir

'openaigym.video.0.6102.video000000.mp4'

## View episode recording of random actions

In [9]:
# HTML snippet for a centered video player; the placeholder receives the
# recording's path.
video_markup = '''<div align="middle"> 
<video width="80%" controls> 
    <source src="{}" type="video/mp4">
</video></div>'''
# Last expression of the cell, so the player is rendered as the cell output.
HTML(video_markup.format('cartpole-ddqn/random/' + vid_dir))

# DDQN agent

In [13]:
class DDQNAgent:
    """Double deep Q-network (DDQN) agent with experience replay.

    Two identically shaped Keras networks are kept: the online network
    (``model``) chooses actions and is trained, while the target network
    (``target_model``) supplies the value of the chosen next action when
    bootstrapped targets are built. The target network tracks the online one
    through soft updates (see ``target_train``).

    States passed to ``act``, ``act_optimal`` and ``store`` are handed straight
    to ``model.predict``; the notebook reshapes them to ``[1, state_size]``.
    """

    def __init__(self, state_size, action_size, memory_size = 2000, gamma = 0.95,
                 epsilon = 1.0, epsilon_min = 0.001, epsilon_decay = 0.999,
                 learning_rate = 0.0001, tau = .125):
        """Create the agent and its two networks.

        Args:
            state_size: number of inputs of the Q-network.
            action_size: number of discrete actions (outputs of the Q-network).
            memory_size: maximum number of transitions kept in replay memory.
            gamma: discount rate applied to the bootstrapped next-state value.
            epsilon: initial exploration rate.
            epsilon_min: once epsilon is at or below this value it stops decaying.
            epsilon_decay: factor applied to epsilon after every ``replay`` call.
            learning_rate: learning rate of the Adam optimizer.
            tau: weight of the online network in each soft target update.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen = memory_size)
        self.gamma = gamma                  # discount rate
        self.epsilon = epsilon              # exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        self.tau = tau
        self.model = self.create_model()
        self.target_model = self.create_model()
        # Both networks are created with independent random weights; start the
        # target network as an exact copy so early targets are not computed
        # from an unrelated network.
        self.target_model.set_weights(self.model.get_weights())

    def create_model(self):
        """Build and compile the Q-network: 100-50-25 ReLU layers and a linear output per action."""
        model = Sequential()
        model.add(Dense(100, input_dim = self.state_size, activation = 'relu'))
        model.add(Dense(50, activation = 'relu'))
        model.add(Dense(25, activation = 'relu'))
        model.add(Dense(self.action_size, activation = 'linear'))
        model.compile(loss = 'mse', optimizer = Adam(lr = self.learning_rate))
        return model

    def store(self, state, action, reward, next_state, done):
        """Append one transition to replay memory (the oldest is dropped when full)."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Epsilon-greedy action: random with probability epsilon, otherwise greedy."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        return self.act_optimal(state)

    def act_optimal(self, state):
        """Return the action with the highest Q-value predicted by the online network."""
        act_values = self.model.predict(state)
        # Plain int so both action-selection paths return the same type.
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train the online network on a random minibatch of stored transitions.

        Each sampled transition is fitted individually (one ``fit`` call per
        transition). Epsilon is decayed once per call. If fewer than
        ``batch_size`` transitions are stored, nothing is trained and epsilon
        is left unchanged.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError; there is simply not enough
            # experience to train on yet.
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = self.model.predict(state)
            if done:
                # Terminal transition: no future value to bootstrap from.
                target[0][action] = reward
            else:
                # Double DQN: the online network picks the next action and the
                # target network evaluates it.
                a = self.model.predict(next_state)[0]
                t = self.target_model.predict(next_state)[0]
                target[0][action] = reward + self.gamma * t[np.argmax(a)]
            self.model.fit(state, target, epochs = 1, verbose = 0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def target_train(self):
        """Soft-update the target network: tau * online + (1 - tau) * target, per weight array."""
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def load(self, name):
        """Load online-network weights from ``name`` and copy them into the target network."""
        self.model.load_weights(name)
        # Only the online network is stored by save(); without this copy the
        # target network would keep whatever weights it had before loading.
        self.target_model.set_weights(self.model.get_weights())

    def save(self, name):
        """Save the online network's weights to ``name``."""
        self.model.save_weights(name)

## Initialize agent and environment

In [14]:
# Training budget: number of episodes and the step cap applied to each one.
EPISODES, episode_len = 3000, 500
# Number of transitions sampled from memory by each replay call.
batch_size = 32
done = False

# Fresh CartPole-v1 environment wrapped in a Monitor whose output directory is 'training'.
env = wrappers.Monitor(gym.make('CartPole-v1'), directory = 'training', force = True)

# The network's input and output sizes come straight from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DDQNAgent(state_size, action_size)
# To resume from saved weights instead of starting fresh:
# agent.load("/home/devin/notebooks/cartpole-ddqn/agent-dqn.h5")

## Train agent

In [15]:
# Training loop: play one episode, soft-update the target network when the
# episode ends, then train on a single replay minibatch; weights are
# checkpointed every 100 episodes.
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    cumulative_reward = 0
    for step in range(episode_len):
        # env.render()  # uncomment to watch training
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # An episode cut off by the time limit did not fail: storing it as
        # terminal would teach the agent that surviving to the limit has no
        # future value. gym's TimeLimit wrapper reports such cut-offs in
        # info['TimeLimit.truncated']; when the key is absent the transition
        # is stored exactly as before.
        truncated = info.get('TimeLimit.truncated', False)
        agent.store(state, action, reward, next_state, done and not truncated)
        state = next_state
        cumulative_reward += reward
        if done:
            agent.target_train()
            print("episode: {}/{}, score: {}, eps: {:.2}".format(e + 1, EPISODES, cumulative_reward, agent.epsilon))
            break
    # replay() samples batch_size transitions, so wait until memory holds more than that.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)
    if e % 100 == 0:
        agent.save("/home/devin/notebooks/cartpole-ddqn/agent-dqn.h5")

episode: 1/3000, score: 32.0, eps: 1.0
episode: 2/3000, score: 23.0, eps: 1.0
episode: 3/3000, score: 25.0, eps: 1.0
episode: 4/3000, score: 24.0, eps: 1.0
episode: 5/3000, score: 27.0, eps: 1.0
episode: 6/3000, score: 10.0, eps: 1.0
episode: 7/3000, score: 28.0, eps: 1.0
episode: 8/3000, score: 24.0, eps: 0.99
episode: 9/3000, score: 21.0, eps: 0.99
episode: 10/3000, score: 19.0, eps: 0.99
episode: 11/3000, score: 24.0, eps: 0.99
episode: 12/3000, score: 15.0, eps: 0.99
episode: 13/3000, score: 29.0, eps: 0.99
episode: 14/3000, score: 43.0, eps: 0.99
episode: 15/3000, score: 23.0, eps: 0.99
episode: 16/3000, score: 37.0, eps: 0.99
episode: 17/3000, score: 10.0, eps: 0.99
episode: 18/3000, score: 15.0, eps: 0.98
episode: 19/3000, score: 16.0, eps: 0.98
episode: 20/3000, score: 32.0, eps: 0.98
episode: 21/3000, score: 14.0, eps: 0.98
episode: 22/3000, score: 18.0, eps: 0.98
episode: 23/3000, score: 9.0, eps: 0.98
episode: 24/3000, score: 32.0, eps: 0.98
episode: 25/3000, score: 53.0, ep

episode: 202/3000, score: 42.0, eps: 0.82
episode: 203/3000, score: 22.0, eps: 0.82
episode: 204/3000, score: 28.0, eps: 0.82
episode: 205/3000, score: 24.0, eps: 0.82
episode: 206/3000, score: 13.0, eps: 0.82
episode: 207/3000, score: 14.0, eps: 0.81
episode: 208/3000, score: 9.0, eps: 0.81
episode: 209/3000, score: 30.0, eps: 0.81
episode: 210/3000, score: 28.0, eps: 0.81
episode: 211/3000, score: 11.0, eps: 0.81
episode: 212/3000, score: 47.0, eps: 0.81
episode: 213/3000, score: 22.0, eps: 0.81
episode: 214/3000, score: 21.0, eps: 0.81
episode: 215/3000, score: 12.0, eps: 0.81
episode: 216/3000, score: 42.0, eps: 0.81
episode: 217/3000, score: 36.0, eps: 0.81
episode: 218/3000, score: 12.0, eps: 0.81
episode: 219/3000, score: 41.0, eps: 0.8
episode: 220/3000, score: 12.0, eps: 0.8
episode: 221/3000, score: 21.0, eps: 0.8
episode: 222/3000, score: 18.0, eps: 0.8
episode: 223/3000, score: 37.0, eps: 0.8
episode: 224/3000, score: 22.0, eps: 0.8
episode: 225/3000, score: 17.0, eps: 0.8


episode: 400/3000, score: 30.0, eps: 0.67
episode: 401/3000, score: 106.0, eps: 0.67
episode: 402/3000, score: 49.0, eps: 0.67
episode: 403/3000, score: 17.0, eps: 0.67
episode: 404/3000, score: 64.0, eps: 0.67
episode: 405/3000, score: 18.0, eps: 0.67
episode: 406/3000, score: 89.0, eps: 0.67
episode: 407/3000, score: 58.0, eps: 0.67
episode: 408/3000, score: 17.0, eps: 0.67
episode: 409/3000, score: 50.0, eps: 0.67
episode: 410/3000, score: 43.0, eps: 0.66
episode: 411/3000, score: 33.0, eps: 0.66
episode: 412/3000, score: 100.0, eps: 0.66
episode: 413/3000, score: 28.0, eps: 0.66
episode: 414/3000, score: 17.0, eps: 0.66
episode: 415/3000, score: 39.0, eps: 0.66
episode: 416/3000, score: 49.0, eps: 0.66
episode: 417/3000, score: 34.0, eps: 0.66
episode: 418/3000, score: 59.0, eps: 0.66
episode: 419/3000, score: 52.0, eps: 0.66
episode: 420/3000, score: 117.0, eps: 0.66
episode: 421/3000, score: 13.0, eps: 0.66
episode: 422/3000, score: 28.0, eps: 0.66
episode: 423/3000, score: 19.0,

episode: 599/3000, score: 61.0, eps: 0.55
episode: 600/3000, score: 51.0, eps: 0.55
episode: 601/3000, score: 81.0, eps: 0.55
episode: 602/3000, score: 34.0, eps: 0.55
episode: 603/3000, score: 78.0, eps: 0.55
episode: 604/3000, score: 62.0, eps: 0.55
episode: 605/3000, score: 20.0, eps: 0.55
episode: 606/3000, score: 30.0, eps: 0.55
episode: 607/3000, score: 23.0, eps: 0.55
episode: 608/3000, score: 49.0, eps: 0.55
episode: 609/3000, score: 36.0, eps: 0.54
episode: 610/3000, score: 25.0, eps: 0.54
episode: 611/3000, score: 19.0, eps: 0.54
episode: 612/3000, score: 82.0, eps: 0.54
episode: 613/3000, score: 102.0, eps: 0.54
episode: 614/3000, score: 17.0, eps: 0.54
episode: 615/3000, score: 112.0, eps: 0.54
episode: 616/3000, score: 28.0, eps: 0.54
episode: 617/3000, score: 76.0, eps: 0.54
episode: 618/3000, score: 71.0, eps: 0.54
episode: 619/3000, score: 59.0, eps: 0.54
episode: 620/3000, score: 57.0, eps: 0.54
episode: 621/3000, score: 55.0, eps: 0.54
episode: 622/3000, score: 22.0, 

episode: 796/3000, score: 116.0, eps: 0.45
episode: 797/3000, score: 40.0, eps: 0.45
episode: 798/3000, score: 145.0, eps: 0.45
episode: 799/3000, score: 96.0, eps: 0.45
episode: 800/3000, score: 44.0, eps: 0.45
episode: 801/3000, score: 115.0, eps: 0.45
episode: 802/3000, score: 283.0, eps: 0.45
episode: 803/3000, score: 79.0, eps: 0.45
episode: 804/3000, score: 126.0, eps: 0.45
episode: 805/3000, score: 31.0, eps: 0.45
episode: 806/3000, score: 139.0, eps: 0.45
episode: 807/3000, score: 36.0, eps: 0.45
episode: 808/3000, score: 125.0, eps: 0.45
episode: 809/3000, score: 92.0, eps: 0.45
episode: 810/3000, score: 124.0, eps: 0.45
episode: 811/3000, score: 105.0, eps: 0.45
episode: 812/3000, score: 210.0, eps: 0.44
episode: 813/3000, score: 151.0, eps: 0.44
episode: 814/3000, score: 150.0, eps: 0.44
episode: 815/3000, score: 52.0, eps: 0.44
episode: 816/3000, score: 98.0, eps: 0.44
episode: 817/3000, score: 30.0, eps: 0.44
episode: 818/3000, score: 253.0, eps: 0.44
episode: 819/3000, sc

episode: 990/3000, score: 235.0, eps: 0.37
episode: 991/3000, score: 44.0, eps: 0.37
episode: 992/3000, score: 279.0, eps: 0.37
episode: 993/3000, score: 303.0, eps: 0.37
episode: 994/3000, score: 69.0, eps: 0.37
episode: 995/3000, score: 500.0, eps: 0.37
episode: 996/3000, score: 86.0, eps: 0.37
episode: 997/3000, score: 141.0, eps: 0.37
episode: 998/3000, score: 234.0, eps: 0.37
episode: 999/3000, score: 500.0, eps: 0.37
episode: 1000/3000, score: 202.0, eps: 0.37
episode: 1001/3000, score: 255.0, eps: 0.37
episode: 1002/3000, score: 402.0, eps: 0.37
episode: 1003/3000, score: 235.0, eps: 0.37
episode: 1004/3000, score: 332.0, eps: 0.37
episode: 1005/3000, score: 273.0, eps: 0.37
episode: 1006/3000, score: 92.0, eps: 0.37
episode: 1007/3000, score: 89.0, eps: 0.37
episode: 1008/3000, score: 287.0, eps: 0.37
episode: 1009/3000, score: 384.0, eps: 0.37
episode: 1010/3000, score: 249.0, eps: 0.36
episode: 1011/3000, score: 361.0, eps: 0.36
episode: 1012/3000, score: 61.0, eps: 0.36
epis

episode: 1177/3000, score: 138.0, eps: 0.31
episode: 1178/3000, score: 368.0, eps: 0.31
episode: 1179/3000, score: 397.0, eps: 0.31
episode: 1180/3000, score: 301.0, eps: 0.31
episode: 1181/3000, score: 190.0, eps: 0.31
episode: 1182/3000, score: 142.0, eps: 0.31
episode: 1183/3000, score: 178.0, eps: 0.31
episode: 1184/3000, score: 289.0, eps: 0.31
episode: 1185/3000, score: 333.0, eps: 0.31
episode: 1186/3000, score: 442.0, eps: 0.31
episode: 1187/3000, score: 241.0, eps: 0.31
episode: 1188/3000, score: 315.0, eps: 0.31
episode: 1189/3000, score: 266.0, eps: 0.3
episode: 1190/3000, score: 209.0, eps: 0.3
episode: 1191/3000, score: 157.0, eps: 0.3
episode: 1192/3000, score: 170.0, eps: 0.3
episode: 1193/3000, score: 261.0, eps: 0.3
episode: 1194/3000, score: 240.0, eps: 0.3
episode: 1195/3000, score: 374.0, eps: 0.3
episode: 1196/3000, score: 291.0, eps: 0.3
episode: 1197/3000, score: 500.0, eps: 0.3
episode: 1198/3000, score: 287.0, eps: 0.3
episode: 1199/3000, score: 324.0, eps: 0.3

episode: 1366/3000, score: 310.0, eps: 0.26
episode: 1367/3000, score: 455.0, eps: 0.26
episode: 1368/3000, score: 192.0, eps: 0.25
episode: 1369/3000, score: 180.0, eps: 0.25
episode: 1370/3000, score: 219.0, eps: 0.25
episode: 1371/3000, score: 313.0, eps: 0.25
episode: 1372/3000, score: 207.0, eps: 0.25
episode: 1373/3000, score: 201.0, eps: 0.25
episode: 1374/3000, score: 134.0, eps: 0.25
episode: 1375/3000, score: 266.0, eps: 0.25
episode: 1376/3000, score: 361.0, eps: 0.25
episode: 1377/3000, score: 291.0, eps: 0.25
episode: 1378/3000, score: 313.0, eps: 0.25
episode: 1379/3000, score: 274.0, eps: 0.25
episode: 1380/3000, score: 349.0, eps: 0.25
episode: 1381/3000, score: 374.0, eps: 0.25
episode: 1382/3000, score: 500.0, eps: 0.25
episode: 1383/3000, score: 280.0, eps: 0.25
episode: 1384/3000, score: 320.0, eps: 0.25
episode: 1385/3000, score: 362.0, eps: 0.25
episode: 1386/3000, score: 184.0, eps: 0.25
episode: 1387/3000, score: 258.0, eps: 0.25
episode: 1388/3000, score: 355.0

episode: 1553/3000, score: 274.0, eps: 0.21
episode: 1554/3000, score: 239.0, eps: 0.21
episode: 1555/3000, score: 235.0, eps: 0.21
episode: 1556/3000, score: 217.0, eps: 0.21
episode: 1557/3000, score: 210.0, eps: 0.21
episode: 1558/3000, score: 244.0, eps: 0.21
episode: 1559/3000, score: 254.0, eps: 0.21
episode: 1560/3000, score: 351.0, eps: 0.21
episode: 1561/3000, score: 441.0, eps: 0.21
episode: 1562/3000, score: 274.0, eps: 0.21
episode: 1563/3000, score: 359.0, eps: 0.21
episode: 1564/3000, score: 289.0, eps: 0.21
episode: 1565/3000, score: 288.0, eps: 0.21
episode: 1566/3000, score: 330.0, eps: 0.21
episode: 1567/3000, score: 318.0, eps: 0.21
episode: 1568/3000, score: 267.0, eps: 0.21
episode: 1569/3000, score: 315.0, eps: 0.21
episode: 1570/3000, score: 325.0, eps: 0.21
episode: 1571/3000, score: 246.0, eps: 0.21
episode: 1572/3000, score: 341.0, eps: 0.21
episode: 1573/3000, score: 239.0, eps: 0.21
episode: 1574/3000, score: 234.0, eps: 0.21
episode: 1575/3000, score: 247.0

episode: 1741/3000, score: 493.0, eps: 0.18
episode: 1742/3000, score: 493.0, eps: 0.18
episode: 1743/3000, score: 268.0, eps: 0.18
episode: 1744/3000, score: 314.0, eps: 0.18
episode: 1745/3000, score: 355.0, eps: 0.17
episode: 1746/3000, score: 273.0, eps: 0.17
episode: 1747/3000, score: 389.0, eps: 0.17
episode: 1748/3000, score: 327.0, eps: 0.17
episode: 1749/3000, score: 417.0, eps: 0.17
episode: 1750/3000, score: 374.0, eps: 0.17
episode: 1751/3000, score: 500.0, eps: 0.17
episode: 1752/3000, score: 350.0, eps: 0.17
episode: 1753/3000, score: 347.0, eps: 0.17
episode: 1754/3000, score: 222.0, eps: 0.17
episode: 1755/3000, score: 190.0, eps: 0.17
episode: 1756/3000, score: 222.0, eps: 0.17
episode: 1757/3000, score: 283.0, eps: 0.17
episode: 1758/3000, score: 205.0, eps: 0.17
episode: 1759/3000, score: 249.0, eps: 0.17
episode: 1760/3000, score: 256.0, eps: 0.17
episode: 1761/3000, score: 154.0, eps: 0.17
episode: 1762/3000, score: 137.0, eps: 0.17
episode: 1763/3000, score: 169.0

episode: 1928/3000, score: 500.0, eps: 0.15
episode: 1929/3000, score: 333.0, eps: 0.15
episode: 1930/3000, score: 309.0, eps: 0.15
episode: 1931/3000, score: 237.0, eps: 0.15
episode: 1932/3000, score: 211.0, eps: 0.15
episode: 1933/3000, score: 238.0, eps: 0.14
episode: 1934/3000, score: 224.0, eps: 0.14
episode: 1935/3000, score: 233.0, eps: 0.14
episode: 1936/3000, score: 194.0, eps: 0.14
episode: 1937/3000, score: 236.0, eps: 0.14
episode: 1938/3000, score: 379.0, eps: 0.14
episode: 1939/3000, score: 500.0, eps: 0.14
episode: 1940/3000, score: 500.0, eps: 0.14
episode: 1941/3000, score: 295.0, eps: 0.14
episode: 1942/3000, score: 350.0, eps: 0.14
episode: 1943/3000, score: 341.0, eps: 0.14
episode: 1944/3000, score: 233.0, eps: 0.14
episode: 1945/3000, score: 327.0, eps: 0.14
episode: 1946/3000, score: 339.0, eps: 0.14
episode: 1947/3000, score: 350.0, eps: 0.14
episode: 1948/3000, score: 394.0, eps: 0.14
episode: 1949/3000, score: 294.0, eps: 0.14
episode: 1950/3000, score: 314.0

episode: 2115/3000, score: 237.0, eps: 0.12
episode: 2116/3000, score: 221.0, eps: 0.12
episode: 2117/3000, score: 229.0, eps: 0.12
episode: 2118/3000, score: 240.0, eps: 0.12
episode: 2119/3000, score: 500.0, eps: 0.12
episode: 2120/3000, score: 500.0, eps: 0.12
episode: 2121/3000, score: 380.0, eps: 0.12
episode: 2122/3000, score: 357.0, eps: 0.12
episode: 2123/3000, score: 327.0, eps: 0.12
episode: 2124/3000, score: 311.0, eps: 0.12
episode: 2125/3000, score: 328.0, eps: 0.12
episode: 2126/3000, score: 290.0, eps: 0.12
episode: 2127/3000, score: 286.0, eps: 0.12
episode: 2128/3000, score: 213.0, eps: 0.12
episode: 2129/3000, score: 241.0, eps: 0.12
episode: 2130/3000, score: 361.0, eps: 0.12
episode: 2131/3000, score: 428.0, eps: 0.12
episode: 2132/3000, score: 500.0, eps: 0.12
episode: 2133/3000, score: 275.0, eps: 0.12
episode: 2134/3000, score: 301.0, eps: 0.12
episode: 2135/3000, score: 328.0, eps: 0.12
episode: 2136/3000, score: 271.0, eps: 0.12
episode: 2137/3000, score: 256.0

episode: 2303/3000, score: 414.0, eps: 0.1
episode: 2304/3000, score: 352.0, eps: 0.1
episode: 2305/3000, score: 397.0, eps: 0.1
episode: 2306/3000, score: 357.0, eps: 0.1
episode: 2307/3000, score: 462.0, eps: 0.1
episode: 2308/3000, score: 381.0, eps: 0.1
episode: 2309/3000, score: 240.0, eps: 0.099
episode: 2310/3000, score: 484.0, eps: 0.099
episode: 2311/3000, score: 426.0, eps: 0.099
episode: 2312/3000, score: 376.0, eps: 0.099
episode: 2313/3000, score: 422.0, eps: 0.099
episode: 2314/3000, score: 406.0, eps: 0.099
episode: 2315/3000, score: 390.0, eps: 0.099
episode: 2316/3000, score: 500.0, eps: 0.099
episode: 2317/3000, score: 428.0, eps: 0.099
episode: 2318/3000, score: 500.0, eps: 0.099
episode: 2319/3000, score: 379.0, eps: 0.098
episode: 2320/3000, score: 500.0, eps: 0.098
episode: 2321/3000, score: 500.0, eps: 0.098
episode: 2322/3000, score: 457.0, eps: 0.098
episode: 2323/3000, score: 500.0, eps: 0.098
episode: 2324/3000, score: 500.0, eps: 0.098
episode: 2325/3000, sc

KeyboardInterrupt: 

## Record new episode using optimal policy

In [16]:
# New CartPole-v1 instance wrapped in a Monitor whose output directory is 'optimal'.
env = wrappers.Monitor(
    gym.make('CartPole-v1'),
    directory = 'optimal',
    force = True,
)
# To evaluate saved weights instead of the in-memory agent:
# agent.load("/home/devin/notebooks/cartpole-ddqn/agent-dqn.h5")

In [17]:
# Play one rendered episode using only greedy actions (no exploration) and
# print the total reward once the episode ends.
state = np.reshape(env.reset(), [1, state_size])
cumulative_reward = 0
for time in range(episode_len):
    env.render()
    observation, reward, done, info = env.step(agent.act_optimal(state))
    state = np.reshape(observation, [1, state_size])
    cumulative_reward += reward
    if not done:
        continue
    print("score: {}".format(cumulative_reward))
    break

score: 500.0


## Close environment

In [18]:
# Close the monitored environment before looking for its recording in the next
# cell (presumably this is what finalizes the video file -- TODO confirm).
env.close()

## View episode of trained agent

In [20]:
# Locate the recording of the trained agent's episode.
#
# The folder is searched by path instead of chdir-ing into it and back, so the
# working directory is never touched and a failure cannot strand the kernel in
# the recordings folder.
recording_dir = "/home/devin/notebooks/cartpole-ddqn/optimal"
# glob returns matches in arbitrary order; sort so the choice is deterministic.
recordings = sorted(glob.glob(os.path.join(recording_dir, "*.mp4")))
if not recordings:
    # Without this check an empty folder would silently leave `vid_dir`
    # pointing at whatever an earlier cell assigned to it.
    raise FileNotFoundError("no .mp4 recording found in " + recording_dir)
# Keep only the file name: the viewer cell prepends its own folder prefix.
vid_dir = os.path.basename(recordings[-1])

In [21]:
# Bare expression: the notebook displays the recording's file name as the cell output.
vid_dir

'openaigym.video.3.6102.video000000.mp4'

In [22]:
# HTML snippet for a centered video player; the placeholder receives the
# recording's path.
video_markup = '''<div align="middle"> 
<video width="80%" controls> 
    <source src="{}" type="video/mp4">
</video></div>'''
# Last expression of the cell, so the player is rendered as the cell output.
HTML(video_markup.format('cartpole-ddqn/optimal/' + vid_dir))