In [5]:
%matplotlib inline
import matplotlib.pyplot as plt

from keras.models import *
from keras.layers import *
from keras.optimizers import *

import gym
import random
import numpy as np

In [6]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size # 4
        self.action_size = action_size # 2
        self.memory = [] # to save our prev. actions
        self.gamma = 0.95 # discount rate
        self.epsilon = 1.0 # exploration rate
        self.epsilon_lb = 0.05 # lower bound
        self.epsilon_dc = 0.99 # decay
        self.lr = 0.001 # lr for the dnn
        self.model = self.build_model()
       
    # DNN for our Q-Learning Model
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size))
        model.add(Activation("relu"))
        model.add(Dense(12))
        model.add(Activation("relu"))
        model.add(Dense(self.action_size))
        model.compile(loss="mse", optimizer=Adam(lr=self.lr))
        return model
    
    # We save the prev experiences for later re-train
    def remember(self, state, action, reward, next_state, finished):
        self.memory.append([state, action, reward, next_state, finished])
       
    # Get action by DNN or by Random sampling
    def get_action(self, state):
        # Select random action
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        # Compute action based on our DNN
        action = self.model.predict(state)
        return np.argmax(action[0])
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size) 
        for state, action, reward, next_state, finished in minibatch:
            target = reward
            # Predict the future discounted reward
            if not finished:
                target = (reward + self.gamma * np.max(self.model.predict(next_state)[0]))
            # Train the agent to approx the current state to futute rewards
            target_future = self.model.predict(state)
            target_future[0][action] = target
            self.model.fit(state, target_future, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_lb:
            self.epsilon = self.epsilon * self.epsilon_dc

In [7]:
episodes = 1000
env = gym.make("CartPole-v1")
state_size = 4
action_size = 2
agent = Agent(state_size, action_size)
finished = False
batch_size = 16

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [None]:
for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        # Get random/dnn action
        action = agent.get_action(state)
        next_state, reward, finished, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        if finished == True:
            reward = -1
        # Save the episode
        agent.remember(state, action, reward, next_state, finished)
        state = next_state
        if finished == True:
            print("Episode: ", episode, " Time-Score: ", time, " Epsilon: ", agent.epsilon)
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)      

Episode:  0  Time-Score:  12  Epsilon:  1.0
Episode:  1  Time-Score:  10  Epsilon:  0.9320653479069899
Episode:  2  Time-Score:  10  Epsilon:  0.8429431933839266
Episode:  3  Time-Score:  11  Epsilon:  0.7547192872036325
Episode:  4  Time-Score:  14  Epsilon:  0.6556592205741436
Episode:  5  Time-Score:  33  Epsilon:  0.47058664158564995
Episode:  6  Time-Score:  17  Epsilon:  0.39667780642202527
Episode:  7  Time-Score:  18  Epsilon:  0.33103308832101386
Episode:  8  Time-Score:  10  Epsilon:  0.29938039131233124
Episode:  9  Time-Score:  11  Epsilon:  0.26804671691687404
Episode:  10  Time-Score:  13  Epsilon:  0.2352166292404101
Episode:  11  Time-Score:  14  Epsilon:  0.20434346174623952
Episode:  12  Time-Score:  9  Epsilon:  0.18667127671570335
Episode:  13  Time-Score:  11  Epsilon:  0.16713393501488363
Episode:  14  Time-Score:  9  Epsilon:  0.1526797322759062
Episode:  15  Time-Score:  9  Epsilon:  0.13947556877522502
Episode:  16  Time-Score:  9  Epsilon:  0.12741333767875881

Episode:  133  Time-Score:  315  Epsilon:  0.04953625663766238
Episode:  134  Time-Score:  190  Epsilon:  0.04953625663766238
Episode:  135  Time-Score:  286  Epsilon:  0.04953625663766238
Episode:  136  Time-Score:  190  Epsilon:  0.04953625663766238
Episode:  137  Time-Score:  259  Epsilon:  0.04953625663766238
Episode:  138  Time-Score:  187  Epsilon:  0.04953625663766238
Episode:  139  Time-Score:  310  Epsilon:  0.04953625663766238
Episode:  140  Time-Score:  229  Epsilon:  0.04953625663766238
Episode:  141  Time-Score:  470  Epsilon:  0.04953625663766238
Episode:  142  Time-Score:  335  Epsilon:  0.04953625663766238
Episode:  143  Time-Score:  223  Epsilon:  0.04953625663766238
Episode:  144  Time-Score:  265  Epsilon:  0.04953625663766238
Episode:  145  Time-Score:  378  Epsilon:  0.04953625663766238
Episode:  146  Time-Score:  227  Epsilon:  0.04953625663766238
Episode:  147  Time-Score:  177  Epsilon:  0.04953625663766238
Episode:  148  Time-Score:  243  Epsilon:  0.0495362566

Episode:  264  Time-Score:  110  Epsilon:  0.04953625663766238
Episode:  265  Time-Score:  114  Epsilon:  0.04953625663766238
Episode:  266  Time-Score:  297  Epsilon:  0.04953625663766238
Episode:  267  Time-Score:  283  Epsilon:  0.04953625663766238
Episode:  268  Time-Score:  103  Epsilon:  0.04953625663766238
Episode:  269  Time-Score:  118  Epsilon:  0.04953625663766238
Episode:  270  Time-Score:  176  Epsilon:  0.04953625663766238
Episode:  271  Time-Score:  462  Epsilon:  0.04953625663766238
Episode:  272  Time-Score:  368  Epsilon:  0.04953625663766238
Episode:  273  Time-Score:  268  Epsilon:  0.04953625663766238
Episode:  274  Time-Score:  122  Epsilon:  0.04953625663766238
Episode:  275  Time-Score:  244  Epsilon:  0.04953625663766238
Episode:  276  Time-Score:  218  Epsilon:  0.04953625663766238
Episode:  277  Time-Score:  177  Epsilon:  0.04953625663766238
Episode:  278  Time-Score:  165  Epsilon:  0.04953625663766238
Episode:  279  Time-Score:  242  Epsilon:  0.0495362566