In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import gym
import random
import numpy as np

Using TensorFlow backend.


In [2]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size # 4
        self.action_size = action_size # 2
        self.memory = [] # to save our prev. actions
        self.gamma = 0.95 # discount rate
        self.epsilon = 1.0 # exploration rate
        self.epsilon_lb = 0.05 # lower bound
        self.epsilon_dc = 0.99 # decay
        self.lr = 0.001 # lr for the dnn
        self.model = self.build_model()
       
    # DNN for our Q-Learning Model
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size))
        model.add(Activation("relu"))
        model.add(Dense(12))
        model.add(Activation("relu"))
        model.add(Dense(self.action_size))
        model.compile(loss="mse", optimizer=Adam(lr=self.lr))
        return model
    
    # We save the prev experiences for later re-train
    def remember(self, state, action, reward, next_state, finished):
        self.memory.append([state, action, reward, next_state, finished])
       
    # Get action by DNN or by Random sampling
    def get_action(self, state):
        # Select random action
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        # Compute action based on our DNN
        action = self.model.predict(state)
        return np.argmax(action[0])
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size) 
        for state, action, reward, next_state, finished in minibatch:
            target = reward
            # Predict the future discounted reward
            if not finished:
                target = (reward + self.gamma * np.max(self.model.predict(next_state)[0]))
            # Train the agent to approx the current state to futute rewards
            target_future = self.model.predict(state)
            target_future[0][action] = target
            self.model.fit(state, target_future, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_lb:
            self.epsilon = self.epsilon * self.epsilon_dc

In [3]:
episodes = 1000
env = gym.make("CartPole-v1")
state_size = 4
action_size = 2
agent = Agent(state_size, action_size)
finished = False
batch_size = 16

In [None]:
for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        # Get random/dnn action
        action = agent.get_action(state)
        next_state, reward, finished, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        if finished == True:
            reward = -1
        # Save the episode
        agent.remember(state, action, reward, next_state, finished)
        state = next_state
        if finished == True:
            print("Episode: ", episode, " Time-Score: ", time, " Epsilon: ", agent.epsilon)
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)      

Episode:  0  Time-Score:  18  Epsilon:  0.9801
Episode:  1  Time-Score:  19  Epsilon:  0.8097278682212583
Episode:  2  Time-Score:  21  Epsilon:  0.6556592205741436
Episode:  3  Time-Score:  23  Epsilon:  0.5203405226503064
Episode:  4  Time-Score:  48  Epsilon:  0.3212010745647914
Episode:  5  Time-Score:  10  Epsilon:  0.2904884943099637
Episode:  6  Time-Score:  10  Epsilon:  0.2627125872502282
Episode:  7  Time-Score:  11  Epsilon:  0.2352166292404101
Episode:  8  Time-Score:  9  Epsilon:  0.21487444770607952
Episode:  9  Time-Score:  15  Epsilon:  0.18480456394854633
Episode:  10  Time-Score:  9  Epsilon:  0.16882215658069055
Episode:  11  Time-Score:  8  Epsilon:  0.15577974928671176
Episode:  12  Time-Score:  10  Epsilon:  0.1408844129042677
Episode:  13  Time-Score:  9  Epsilon:  0.12870034108965536
Episode:  14  Time-Score:  7  Epsilon:  0.11995712819347792
Episode:  15  Time-Score:  11  Epsilon:  0.1074022057426376
Episode:  16  Time-Score:  8  Epsilon:  0.09910481551887473
E

Episode:  135  Time-Score:  499  Epsilon:  0.04953625663766238
Episode:  136  Time-Score:  499  Epsilon:  0.04953625663766238
Episode:  137  Time-Score:  327  Epsilon:  0.04953625663766238
Episode:  138  Time-Score:  9  Epsilon:  0.04953625663766238
Episode:  139  Time-Score:  146  Epsilon:  0.04953625663766238
Episode:  140  Time-Score:  271  Epsilon:  0.04953625663766238
Episode:  141  Time-Score:  282  Epsilon:  0.04953625663766238
Episode:  142  Time-Score:  358  Epsilon:  0.04953625663766238
Episode:  143  Time-Score:  411  Epsilon:  0.04953625663766238
Episode:  144  Time-Score:  244  Epsilon:  0.04953625663766238
Episode:  145  Time-Score:  208  Epsilon:  0.04953625663766238
Episode:  146  Time-Score:  234  Epsilon:  0.04953625663766238
Episode:  147  Time-Score:  437  Epsilon:  0.04953625663766238
Episode:  148  Time-Score:  50  Epsilon:  0.04953625663766238
Episode:  149  Time-Score:  313  Epsilon:  0.04953625663766238
Episode:  150  Time-Score:  346  Epsilon:  0.0495362566376