In [None]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import collections
import os
import time

In [3]:
def get_next_batch(experience, model, num_actions, gamma, state_size, batch_size):
    """Sample a minibatch from replay memory and build Q-learning targets.

    Args:
        experience: sequence of ``(s_t, a_t, r_t, s_tp1, done)`` transitions.
            Each state must be assignable to a row of length ``state_size``
            (the training loop stores them with shape ``(1, state_size)``).
        model: object whose ``predict(states)`` returns one row of
            ``num_actions`` Q-values per input row.
        num_actions: number of discrete actions (columns of ``Y``).
        gamma: discount applied to the best next-state Q-value.
        state_size: number of features per state (columns of ``X``).
        batch_size: number of transitions to sample without replacement.

    Returns:
        Tuple ``(X, Y)``: ``X`` has shape ``(batch_size, state_size)`` and
        holds the sampled states; ``Y`` has shape
        ``(batch_size, num_actions)`` and holds the model's current
        predictions with the entry for the action taken replaced by the
        Q-learning target.

    Raises:
        ValueError: from ``random.sample`` if ``batch_size`` exceeds
            ``len(experience)``.
    """
    batch = random.sample(experience, batch_size)

    X = np.zeros((batch_size, state_size))
    Y = np.zeros((batch_size, num_actions))
    if not batch:
        # Nothing sampled: return the empty arrays without querying the model.
        return X, Y

    next_states = np.zeros((batch_size, state_size))
    for i, (s_t, _, _, s_tp1, _) in enumerate(batch):
        X[i] = s_t
        next_states[i] = s_tp1

    # Query the network once per batch rather than twice per transition.
    Y[:] = model.predict(X)
    best_next_q = np.max(model.predict(next_states), axis=1)

    for i, (_, a_t, r_t, _, done) in enumerate(batch):
        # Terminal transitions have no future return to bootstrap from.
        Y[i, a_t] = r_t if done else r_t + gamma * best_next_q[i]
    return X, Y

# Initialize parameters

In [4]:
# Where the results log and the saved network are written.
DATA_DIR = "data"

# Q-learning settings.
GAMMA = 0.95  # discount applied to the best next-state Q-value
INITIAL_EPSILON = 0.1  # starting probability of taking a random action
FINAL_EPSILON = 0.001  # epsilon is no longer reduced once it reaches this

# Replay memory and minibatch sizes.
MEMORY_SIZE = 2000  # number of previous transitions to remember
BATCH_SIZE = 32

# Episode budget: random-action observation first, then training.
NUM_EPOCHS_OBSERVE = 100
NUM_EPOCHS_TRAIN = 150
NUM_EPOCHS = sum((NUM_EPOCHS_OBSERVE, NUM_EPOCHS_TRAIN))

In [5]:
# Create the CartPole environment, then read the problem dimensions from it:
# the number of discrete actions and the length of the observation vector.
env = gym.make('CartPole-v0')
NUM_ACTIONS = env.action_space.n
STATE_SIZE = env.observation_space.shape[0]

[2017-06-13 12:03:36,231] Making new env: CartPole-v0


# Build the model

In [6]:
# Q-network: two hidden ReLU layers of 24 units each, followed by a linear
# output layer with one unit per action. Trained with MSE loss and Adam.
model = Sequential([
    Dense(24, input_dim=STATE_SIZE, activation='relu'),
    Dense(24, activation='relu'),
    Dense(NUM_ACTIONS, activation='linear'),
])
model.compile(optimizer=Adam(lr=0.001), loss='mse')

# Train Network

In [15]:
# Replay memory: a bounded deque, so the oldest transitions are discarded
# once MEMORY_SIZE entries have been stored.
experience = collections.deque(maxlen=MEMORY_SIZE)
epsilon = INITIAL_EPSILON

# Make sure the output directory exists before anything is written to it.
if not os.path.isdir(DATA_DIR):
    os.makedirs(DATA_DIR)

start_time = time.time()
# The context manager closes the results log even if training raises or is
# interrupted part-way through.
with open(os.path.join(DATA_DIR, "rl-network-results.tsv"), "w") as fout:
    for e in range(NUM_EPOCHS):
        # `e` is 0-based, so exactly the first NUM_EPOCHS_OBSERVE episodes
        # are observation-only and the remaining NUM_EPOCHS_TRAIN train.
        observing = e < NUM_EPOCHS_OBSERVE

        loss = 0.0
        s_t = env.reset()
        s_t = np.reshape(s_t, [1, STATE_SIZE])
        reward = 0
        # Each episode is capped at 195 steps.
        for step in range(195):
            s_tm1 = s_t
            # next action: always random while observing, epsilon-greedy after
            if observing or np.random.rand() <= epsilon:
                a_t = np.random.randint(low=0, high=NUM_ACTIONS, size=1)[0]
            else:
                q = model.predict(s_t)[0]
                a_t = np.argmax(q)

            # apply action, get reward
            s_t, r_t, done, _ = env.step(a_t)
            # The step that ends the episode is given a reward of -10.
            r_t = r_t if not done else -10
            s_t = np.reshape(s_t, [1, STATE_SIZE])
            reward += r_t
            # store experience
            experience.append((s_tm1, a_t, r_t, s_t, done))

            if not observing:
                # finished observing, now train on one sampled minibatch
                X, Y = get_next_batch(experience, model, NUM_ACTIONS,
                                      GAMMA, STATE_SIZE, BATCH_SIZE)
                loss += model.train_on_batch(X, Y)

            if done:
                break

        # "score" is the 0-based index of the last step taken in the episode.
        print("Epoch {:04d}/{:d} | Loss {:.5f} | score {:.5f} | reward {:5f}"
              .format(e + 1, NUM_EPOCHS, loss, step, reward))
        # reduce epsilon gradually
        if epsilon > FINAL_EPSILON:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / NUM_EPOCHS

        fout.write("{:04d}\t{:.5f}\n".format(e + 1, loss))

        # Periodic checkpoint every 100 episodes (including episode 0).
        if e % 100 == 0:
            model.save(os.path.join(DATA_DIR, "rl-network.h5"), overwrite=True)

model.save(os.path.join(DATA_DIR, "rl-network.h5"), overwrite=True)
print("--- Tempo total: %d seconds ---" % (time.time() - start_time))

Epoch 0001/250 | Loss 0.00000 | score 12.00000 | reward 2.000000
Epoch 0002/250 | Loss 0.00000 | score 23.00000 | reward 13.000000
Epoch 0003/250 | Loss 0.00000 | score 19.00000 | reward 9.000000
Epoch 0004/250 | Loss 0.00000 | score 17.00000 | reward 7.000000
Epoch 0005/250 | Loss 0.00000 | score 64.00000 | reward 54.000000
Epoch 0006/250 | Loss 0.00000 | score 13.00000 | reward 3.000000
Epoch 0007/250 | Loss 0.00000 | score 41.00000 | reward 31.000000
Epoch 0008/250 | Loss 0.00000 | score 16.00000 | reward 6.000000
Epoch 0009/250 | Loss 0.00000 | score 36.00000 | reward 26.000000
Epoch 0010/250 | Loss 0.00000 | score 23.00000 | reward 13.000000
Epoch 0011/250 | Loss 0.00000 | score 18.00000 | reward 8.000000
Epoch 0012/250 | Loss 0.00000 | score 8.00000 | reward -2.000000
Epoch 0013/250 | Loss 0.00000 | score 33.00000 | reward 23.000000
Epoch 0014/250 | Loss 0.00000 | score 10.00000 | reward 0.000000
Epoch 0015/250 | Loss 0.00000 | score 13.00000 | reward 3.000000
Epoch 0016/250 | Lo

In [9]:
# Shell escape: add the saved network to a zip archive. Both paths are
# hard-coded under the "data" directory.
!zip data/nn.zip data/rl-network.h5

updating: data/rl-network.h5 (deflated 70%)


Download: <a class="reference external" href="data/nn.zip" download="nn.zip">nn.zip (zipped rl-network.h5)</a>