In [1]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
def create_model(n_states, n_actions, dnn_learning_rate):
    """Build and compile the feed-forward network that approximates Q-values.

    The network maps a state vector of length ``n_states`` to ``n_actions``
    linear outputs (one estimated value per action) through two ReLU hidden
    layers of 64 and 128 units. It is compiled with mean-squared-error loss
    and plain SGD.

    Args:
        n_states: size of the input (state) vector.
        n_actions: number of output units, one per action.
        dnn_learning_rate: learning rate passed to the SGD optimizer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    sgd = SGD(lr=dnn_learning_rate)

    model = Sequential()
    model.add(Dense(64, input_dim=n_states, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(n_actions, activation='linear'))
    model.compile(loss='mse', optimizer=sgd)
    # summary() prints the table itself and returns None; wrapping it in
    # print() only appended a stray "None" line to the output.
    model.summary()
    return model

In [3]:
class Agent:
    """Epsilon-greedy Q-learning agent with a neural network and replay memory.

    The agent stores transitions ``(s0, a0, r1, s1, done)`` in a bounded deque
    and, on ``learn()``, fits its model on a random batch of them using the
    one-step target ``r1 + gamma * max_a Q(s1, a)`` (or just ``r1`` for
    terminal transitions).

    Args:
        alpha: stored on the instance; not used by any method of this class.
        gamma: discount factor applied to the next state's best Q-value.
        epsilon: probability of picking a uniformly random action.
        n_states: length of a state vector.
        n_actions: number of discrete actions.
        deque_length: maximum number of transitions kept in memory.
        batch_size: number of transitions sampled per ``learn()`` call; also
            passed to ``model.fit`` as its batch size.
        dnn_learning_rate: learning rate forwarded to ``create_model``.
        epochs: number of epochs passed to ``model.fit`` per ``learn()`` call.
    """

    def __init__(self, alpha, gamma, epsilon, n_states, n_actions, deque_length, batch_size, dnn_learning_rate, epochs):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.n_actions = n_actions
        self.n_states = n_states
        self.batch_size = batch_size
        self.epochs = epochs
        # Oldest transitions are dropped automatically once maxlen is reached.
        self.memory = deque(maxlen=deque_length)
        self.model = create_model(n_states, n_actions, dnn_learning_rate)

    def select_action(self, state):
        """Return an action index for ``state`` using an epsilon-greedy policy."""
        # exploration vs exploitation
        if np.random.rand() < self.epsilon:
            return np.random.randint(self.n_actions)
        # Use the instance's own state size rather than a notebook-level
        # global, so the class works regardless of what other cells define.
        state = np.reshape(state, (1, self.n_states))  # single-row batch
        actions = self.model.predict(state)[0]         # predictions for that row
        return np.argmax(actions)

    def remember(self, s0, a0, r1, s1, done):
        """Append one transition to the replay memory."""
        self.memory.append((s0, a0, r1, s1, done))

    def learn(self):
        """Fit the model on a random batch of remembered transitions.

        Does nothing when fewer than ``batch_size`` transitions are stored,
        since a full batch cannot be sampled yet.
        """
        if len(self.memory) < self.batch_size:
            return

        experiences = random.sample(self.memory, k=self.batch_size)
        n_samples = len(experiences)
        states0, actions0, rewards, states1, dones = zip(*experiences)

        X = np.reshape(states0, (n_samples, self.n_states))
        next_states = np.reshape(states1, (n_samples, self.n_states))
        actions0 = np.asarray(actions0, dtype=int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)

        # Two batched predictions replace up to two predict() calls per sample.
        next_best_q = np.max(self.model.predict(next_states), axis=1)
        # Terminal transitions have no future value, so their target is the reward alone.
        targets = np.where(dones, rewards, rewards + self.gamma * next_best_q)

        # Start from the current predictions so that only the taken action's
        # output contributes to the loss; copy to guarantee a writable array.
        Y = np.array(self.model.predict(X))
        Y[np.arange(n_samples), actions0] = targets

        self.model.fit(X, Y, epochs=self.epochs, verbose=0, batch_size=self.batch_size)

In [4]:
# Create the CartPole-v0 environment that the following cells inspect and train on.
env = gym.make('CartPole-v0')

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [5]:
# Draw a random point from the observation space.
# Components, in order: cart position, cart velocity, pole angle, pole velocity at tip.
env.observation_space.sample()

array([4.6860966e-01, 1.4645028e+38, 8.6090848e-02, 3.0545910e+37],
      dtype=float32)

In [6]:
# Draw a random action: 0 pushes the cart to the left, 1 pushes it to the right.
env.action_space.sample()

1

In [7]:
# Observation: 
#     Type: Box(4)
#     Num	Observation                 Min         Max
#     0	Cart Position             -4.8            4.8
#     1	Cart Velocity             -Inf            Inf
#     2	Pole Angle                 -24°           24°
#     3	Pole Velocity At Tip      -Inf            Inf

# Actions:
#     Type: Discrete(2)
#     Num	Action
#     0	Push cart to the left
#     1	Push cart to the right

In [8]:
# hyperparameters and agent construction

# problem dimensions, read from the environment
n_actions = env.action_space.n
n_states = env.observation_space.shape[0]

# training schedule
n_episodes = 1_000
n_steps = 10_000

# Q-learning settings
alpha = 0.10
gamma = 0.99
epsilon = 0.10

# replay memory and network fitting
deque_length = 5000
batch_size = 64
epochs = 64
dnn_learning_rate = 0.001

agent = Agent(
    alpha=alpha,
    gamma=gamma,
    epsilon=epsilon,
    n_states=n_states,
    n_actions=n_actions,
    deque_length=deque_length,
    batch_size=batch_size,
    dnn_learning_rate=dnn_learning_rate,
    epochs=epochs,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                320       
_________________________________________________________________
dense_2 (Dense)              (None, 128)               8320      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total params: 8,898
Trainable params: 8,898
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# The environment ends an episode either because the pole fell / the cart left
# the track, or because its time limit was reached. Only the former is a
# failure; treating a time-limit stop as a -10 terminal transition punishes
# the agent for balancing successfully.
max_episode_steps = getattr(getattr(env, 'spec', None), 'max_episode_steps', None)

for e in range(n_episodes):
    s0 = env.reset()
    # *** training ***
    for step in range(n_steps):
        a0 = agent.select_action(s0)         # get action
        s1, r1, done, _ = env.step(a0)       # move game one step
        # If the limit is unknown, fall back to treating every `done` as a failure.
        timed_out = done and max_episode_steps is not None and (step + 1) >= max_episode_steps
        failed = done and not timed_out
        if failed:
            r1 = -10                         # the pole fell down: penalise the last move
        # Store `failed` as the terminal flag so time-limit stops still
        # bootstrap from the next state's value during learning.
        agent.remember(s0, a0, r1, s1, failed)
        s0 = s1                              # set state to next state
        if done:
            print('episode:', e, 'score:', step)
            break
    # *** learning ***
    if len(agent.memory) > batch_size:
        agent.learn()

episode: 0 score: 9
episode: 1 score: 9
episode: 2 score: 8
episode: 3 score: 8
episode: 4 score: 23
episode: 5 score: 12
episode: 6 score: 10
episode: 7 score: 23
episode: 8 score: 11
episode: 9 score: 34
episode: 10 score: 33
episode: 11 score: 15
episode: 12 score: 29
episode: 13 score: 13
episode: 14 score: 12
episode: 15 score: 16
episode: 16 score: 17
episode: 17 score: 19
episode: 18 score: 27
episode: 19 score: 19
episode: 20 score: 12
episode: 21 score: 11
episode: 22 score: 18
episode: 23 score: 15
episode: 24 score: 14
episode: 25 score: 19
episode: 26 score: 13
episode: 27 score: 13
episode: 28 score: 18
episode: 29 score: 14
episode: 30 score: 14
episode: 31 score: 12
episode: 32 score: 8
episode: 33 score: 12
episode: 34 score: 11
episode: 35 score: 11
episode: 36 score: 11
episode: 37 score: 23
episode: 38 score: 16
episode: 39 score: 12
episode: 40 score: 11
episode: 41 score: 16
episode: 42 score: 14
episode: 43 score: 19
episode: 44 score: 12
episode: 45 score: 11
epi

episode: 353 score: 199
episode: 354 score: 199
episode: 355 score: 199
episode: 356 score: 199
episode: 357 score: 199
episode: 358 score: 199
episode: 359 score: 199
episode: 360 score: 199
episode: 361 score: 199
episode: 362 score: 199
episode: 363 score: 170
episode: 364 score: 197
episode: 365 score: 145
episode: 366 score: 190
episode: 367 score: 199
episode: 368 score: 199
episode: 369 score: 191
episode: 370 score: 199
episode: 371 score: 199
episode: 372 score: 199
episode: 373 score: 178
episode: 374 score: 199
episode: 375 score: 199
episode: 376 score: 199
episode: 377 score: 199
episode: 378 score: 199
episode: 379 score: 154
episode: 380 score: 199
episode: 381 score: 199
episode: 382 score: 199
episode: 383 score: 199
episode: 384 score: 199
episode: 385 score: 199
episode: 386 score: 115
episode: 387 score: 183
episode: 388 score: 72
episode: 389 score: 30
episode: 390 score: 96
episode: 391 score: 178
episode: 392 score: 115
episode: 393 score: 167
episode: 394 score:

episode: 703 score: 167
episode: 704 score: 176
episode: 705 score: 127
episode: 706 score: 130
episode: 707 score: 117
episode: 708 score: 19
episode: 709 score: 118
episode: 710 score: 136
episode: 711 score: 117
episode: 712 score: 124
episode: 713 score: 20
episode: 714 score: 19
episode: 715 score: 20
episode: 716 score: 16
episode: 717 score: 192
episode: 718 score: 152
episode: 719 score: 162
episode: 720 score: 147
episode: 721 score: 199
episode: 722 score: 199
episode: 723 score: 131
episode: 724 score: 18
episode: 725 score: 27
episode: 726 score: 166
episode: 727 score: 50
episode: 728 score: 34
episode: 729 score: 31
episode: 730 score: 22
episode: 731 score: 24
episode: 732 score: 142
episode: 733 score: 136
episode: 734 score: 101
episode: 735 score: 139
episode: 736 score: 146
episode: 737 score: 129
episode: 738 score: 147
episode: 739 score: 92
episode: 740 score: 94
episode: 741 score: 106
episode: 742 score: 97
episode: 743 score: 108
episode: 744 score: 129
episode