In [1]:
import gym
import pylab
import random
import numpy as np
from collections import deque
# import tflearn

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras import optimizers
from keras.layers import BatchNormalization

Using TensorFlow backend.


## Task: fill in the blanks in the following agent code

In [8]:
class DeepQAgent:
    """Deep Q-learning agent with experience replay and a target network.

    Two identically shaped Keras networks are kept: ``model`` is trained on
    every step, while ``target_model`` supplies the bootstrap value
    ``max_a' Q(s', a')`` and is only refreshed when
    :meth:`update_target_model` is called.

    Exploration is epsilon-greedy; ``epsilon`` is decreased by a fixed amount
    each time a transition is stored (see :meth:`replay_memory`).
    """

    def __init__(self, state_size, action_size, render=True):
        """Create the agent and build both networks.

        Args:
            state_size: Number of features in one observation.
            action_size: Number of outputs of the Q-network.
            render: Flag stored on the agent; the training loop reads it to
                decide whether to render the environment.
        """
        # Tip: when training on a headless machine (e.g. AWS) turn rendering
        # off and load the serialized model later.
        self.render = render
        self.state_size = state_size
        self.action_size = action_size

        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        # Linear schedule: epsilon goes from 1.0 to epsilon_min over 50000
        # stored transitions.
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 50000
        self.batch_size = 64
        # No training happens until this many transitions have been stored.
        self.train_start = 1000
        # Replay memory; the oldest transitions are dropped once it is full.
        self.memory = deque(maxlen=10000)

        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Build and compile the Q-network.

        The network is a small fully connected Keras model: three hidden ReLU
        layers (64, 64, 32 units) and a linear output layer with one unit per
        action, trained with Adam on a mean-squared-error loss.

        Returns:
            The compiled ``Sequential`` model.
        """
        model = Sequential()

        model.add(Dense(64, activation='relu', input_dim=self.state_size))
        model.add(Dense(64, activation='relu'))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))

        model.compile(optimizers.Adam(lr=self.learning_rate), 'mse')
        # summary() prints the table itself and returns None, so wrapping it
        # in print() only appended a stray "None" to the output.
        model.summary()
        return model

    def update_target_model(self):
        """Copy the weights of the learning model into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state):
        """Choose an action with the epsilon-greedy policy.

        Args:
            state: A batch holding a single observation (the training loop
                passes an array of shape ``(1, state_size)``).

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest Q-value predicted by ``model``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])

    def replay_memory(self, state, action, reward, next_state, done):
        """Save <s, a, r, s'> to replay memory and decay epsilon by one step.

        An ``action`` of 2 is stored as 1, so that the stored value can be
        used directly as an index into the two-output Q-network (the training
        loop passes the environment action here, not the network index).
        """
        if action == 2:
            action = 1
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def train_replay(self):
        """Fit the model on one random mini-batch drawn from replay memory.

        Does nothing until at least ``train_start`` transitions are stored.
        For every sampled transition the target of the action that was taken
        is ``reward`` if the episode ended, otherwise
        ``reward + discount_factor * max_a' Q_target(s', a')``; the targets of
        the other actions are the model's own predictions, so they contribute
        no error.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        states = np.zeros((batch_size, self.state_size))
        next_states = np.zeros((batch_size, self.state_size))
        actions = np.zeros(batch_size, dtype=int)
        rewards = np.zeros(batch_size)
        finished = np.zeros(batch_size, dtype=bool)

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            states[i] = state
            next_states[i] = next_state
            actions[i] = action
            rewards[i] = reward
            finished[i] = done

        # One forward pass per network for the whole batch instead of one or
        # two predict() calls per sample.
        targets = np.array(self.model.predict(states), dtype=float)
        next_q = np.asarray(self.target_model.predict(next_states))
        bootstrap = rewards + self.discount_factor * np.amax(next_q, axis=1)

        # Terminal transitions have no future value to bootstrap from.
        targets[np.arange(batch_size), actions] = np.where(finished, rewards, bootstrap)

        self.model.fit(states, targets, batch_size=batch_size, epochs=1, verbose=0)

    def load_model(self, name):
        """Load network weights from the file ``name`` into both networks.

        Weights are matched by layer order rather than by layer name. Keras
        numbers layer names automatically (this notebook's own summaries show
        ``dense_19`` and ``dense_23`` for the same architecture), and by-name
        loading silently skips layers whose names differ, which would leave
        the network untrained without any error.
        """
        self.model.load_weights(name)
        # Keep the bootstrap network consistent with the loaded weights.
        self.update_target_model()

    def save_model(self, name):
        """Save the learning model to the file ``name``.

        The parent directory is created first if it does not exist yet.
        """
        import os

        directory = os.path.dirname(name)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)
        self.model.save(name)


In [11]:
env = gym.make('MountainCar-v0')
state_size = env.observation_space.shape[0]  # should be equal to 2
# The Q-network has only two outputs; the training loop maps output 0 to
# environment action 0 and output 1 to environment action 2.
ACTION_SIZE = 2
# No `del agent` here: on a fresh kernel the name does not exist yet and the
# statement raised NameError. Rebinding `agent` below already drops any agent
# left over from a previous run of this cell.
agent = DeepQAgent(state_size, ACTION_SIZE)
# agent.load_model("./save_model/<your_saved_model_name>")
scores, episodes = [], []
N_EPISODES = 500

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 64)                192       
_________________________________________________________________
dense_20 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_21 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 66        
Total params: 6,498
Trainable params: 6,498
Non-trainable params: 0
_________________________________________________________________
None
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_23 (Dense)             (None, 64)                192       
_________________________________________________________________
den

In [4]:
# agent.load_model('./save_model/model_new1_1')

In [12]:

# Number of consecutive environment steps between two action choices.
ACTION_REPEAT = 4
# Where the model is checkpointed, and how often (in episodes).
MODEL_PATH = "./save_model/model_new1_1_1"
SAVE_EVERY = 50

for e in range(N_EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    # The agent picks network action 0 or 1; fake_action is the action that
    # is actually sent to the environment: 0 for network action 0 and 2 for
    # network action 1. Environment action 1 is never sent.
    fake_action = 0

    # Counts steps since the last action choice. The first choice is made on
    # the ACTION_REPEAT-th step, so the initial steps of every episode use the
    # default fake_action of 0.
    action_count = 0

    while not done:
        if agent.render:
            env.render()

        # Choose a new action every ACTION_REPEAT steps, otherwise repeat the
        # previous one.
        action_count = action_count + 1

        if action_count == ACTION_REPEAT:
            action = agent.get_action(state)
            action_count = 0

            if action == 0:
                fake_action = 0
            elif action == 1:
                fake_action = 2

        # Take 1 step with the selected action
        next_state, reward, done, info = env.step(fake_action)
        next_state = np.reshape(next_state, [1, state_size])

        # Save <s, a, r, s'> to replay memory (the agent stores action 2 as
        # network index 1)
        agent.replay_memory(state, fake_action, reward, next_state, done)
        # Continue to learn every time step
        agent.train_replay()
        score += reward
        state = next_state

        if done:
            # No env.reset() here: the next episode resets the environment
            # itself, so resetting now only repeated the work.

            # Copy the learning model to the target model once per episode
            agent.update_target_model()

            # Record the total reward of the episode
            scores.append(score)
            episodes.append(e)
            print("episode:", e, "  score:", score, "  memory length:", len(agent.memory),
                  "  epsilon:", agent.epsilon)

    # Save the model every SAVE_EVERY episodes
    if e % SAVE_EVERY == 0:
        agent.save_model(MODEL_PATH)

# The periodic checkpoint above last fires well before the final episode, so
# save once more to keep the fully trained model.
agent.save_model(MODEL_PATH)

('episode:', 0, '  score:', -200.0, '  memory length:', 200, '  epsilon:', 0.9960200000000077)
('episode:', 1, '  score:', -200.0, '  memory length:', 400, '  epsilon:', 0.9920400000000154)
('episode:', 2, '  score:', -200.0, '  memory length:', 600, '  epsilon:', 0.988060000000023)
('episode:', 3, '  score:', -200.0, '  memory length:', 800, '  epsilon:', 0.9840800000000307)
('episode:', 4, '  score:', -200.0, '  memory length:', 1000, '  epsilon:', 0.9801000000000384)
('episode:', 5, '  score:', -200.0, '  memory length:', 1200, '  epsilon:', 0.9761200000000461)
('episode:', 6, '  score:', -200.0, '  memory length:', 1400, '  epsilon:', 0.9721400000000537)
('episode:', 7, '  score:', -200.0, '  memory length:', 1600, '  epsilon:', 0.9681600000000614)
('episode:', 8, '  score:', -200.0, '  memory length:', 1800, '  epsilon:', 0.9641800000000691)
('episode:', 9, '  score:', -200.0, '  memory length:', 2000, '  epsilon:', 0.9602000000000768)
('episode:', 10, '  score:', -200.0, '  memor

('episode:', 85, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6577200000006602)
('episode:', 86, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6537400000006679)
('episode:', 87, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6497600000006756)
('episode:', 88, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6457800000006833)
('episode:', 89, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6418000000006909)
('episode:', 90, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6378200000006986)
('episode:', 91, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6338400000007063)
('episode:', 92, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.629860000000714)
('episode:', 93, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6258800000007216)
('episode:', 94, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.6219000000007293)
('episode:', 95, '  s

('episode:', 168, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.32738000000081585)
('episode:', 169, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.3234000000008124)
('episode:', 170, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.319420000000809)
('episode:', 171, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.3154400000008056)
('episode:', 172, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.31146000000080215)
('episode:', 173, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.3074800000007987)
('episode:', 174, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.3035000000007953)
('episode:', 175, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.2995200000007919)
('episode:', 176, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.29554000000078845)
('episode:', 177, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.291560000000785)
('episode

('episode:', 251, '  score:', -153.0, '  memory length:', 10000, '  epsilon:', 0.031088900000800727)
('episode:', 252, '  score:', -200.0, '  memory length:', 10000, '  epsilon:', 0.02710890000080077)
('episode:', 253, '  score:', -172.0, '  memory length:', 10000, '  epsilon:', 0.02368610000080081)
('episode:', 254, '  score:', -156.0, '  memory length:', 10000, '  epsilon:', 0.020581700000800843)
('episode:', 255, '  score:', -189.0, '  memory length:', 10000, '  epsilon:', 0.016820600000800885)
('episode:', 256, '  score:', -123.0, '  memory length:', 10000, '  epsilon:', 0.014372900000800912)
('episode:', 257, '  score:', -139.0, '  memory length:', 10000, '  epsilon:', 0.011606800000800943)
('episode:', 258, '  score:', -151.0, '  memory length:', 10000, '  epsilon:', 0.008601900000800976)
('episode:', 259, '  score:', -145.0, '  memory length:', 10000, '  epsilon:', 0.0057164000008010085)
('episode:', 260, '  score:', -151.0, '  memory length:', 10000, '  epsilon:', 0.00498010000

('episode:', 333, '  score:', -136.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 334, '  score:', -114.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 335, '  score:', -132.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 336, '  score:', -133.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 337, '  score:', -123.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 338, '  score:', -120.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 339, '  score:', -123.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 340, '  score:', -149.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 341, '  score:', -160.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 342, '  score:', -109.0, '  memory length:', 10000, '  epsilon:', 0.0049801000

('episode:', 415, '  score:', -122.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 416, '  score:', -107.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 417, '  score:', -114.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 418, '  score:', -85.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 419, '  score:', -85.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 420, '  score:', -118.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 421, '  score:', -120.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 422, '  score:', -118.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 423, '  score:', -100.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 424, '  score:', -93.0, '  memory length:', 10000, '  epsilon:', 0.0049801000008

('episode:', 497, '  score:', -116.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 498, '  score:', -115.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
('episode:', 499, '  score:', -115.0, '  memory length:', 10000, '  epsilon:', 0.004980100000801017)
