In [6]:
%matplotlib inline
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import gym
import random
import numpy as np

In [11]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size # 4
        self.action_size = action_size # 2
        self.memory = []
        self.gamma = 0.97 # discount rate
        self.epsilon = 1.0 # exploration rate
        self.epsilon_lower_bound = 0.05 # lower bound
        self.epsilon_decay = 0.999 # decay rate
        self.lr = 0.0001 # learning rate for the dnn
        self.model = self.build_model() # keras dnn model
        
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size))
        model.add(Activation("relu"))
        model.add(Dense(24))
        model.add(Activation("relu"))
        model.add(Dense(self.action_size))
        model.compile(loss="mse", optimizer=Adam(lr=self.lr))
        #model.summary()
        return model
    
    # We save the prev. experiences for later re-training
    def remember(self, state, action, reward, next_state, is_finished):
        self.memory.append([state, action, reward, next_state, is_finished])
        
    # Get the action from the DNN or by Random Sampling 
    def get_action(self, state):
        # Select a random action
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        # Compute the action from the DNN
        action = self.model.predict(state)[0] # [0.25, 0.7]
        return np.argmax(action)
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, is_finished in minibatch:
            target = reward
            if not is_finished:
                # Q-learning reward formula
                target = (reward + self.gamma * np.max(self.model.predict(next_state)))
            # Train the agent to approx. the current state to future rewards
            target_future = self.model.predict(state) # [0.25, 0.7]
            target_future[0][action] = target
            self.model.fit(state, target_future, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_lower_bound:
            self.epsilon = self.epsilon * self.epsilon_decay

In [12]:
episodes = 1000
env = gym.make("CartPole-v1")
env.reset()

array([-0.01736505, -0.00308276, -0.02204238,  0.03357288])

In [13]:
state_size = 4
action_size = 2
agent = Agent(state_size, action_size)

is_finished = False
episodes = 1000
batch_size = 16

In [None]:
for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for frame in range(500):
        # Get random/dnn action
        action = agent.get_action(state)
        next_state, reward, is_finished, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        if is_finished == True:
            reward = -1
        # Save the epsiode
        agent.remember(state, action, reward, next_state, is_finished)
        state = next_state
        if is_finished == True:
            print("Episode: ", episode, " Time-Score: ", frame, " Epsilon: ", agent.epsilon)
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

Episode:  0  Time-Score:  106  Epsilon:  0.04953625663766238
Episode:  1  Time-Score:  89  Epsilon:  0.04953625663766238
Episode:  2  Time-Score:  73  Epsilon:  0.04953625663766238
Episode:  3  Time-Score:  87  Epsilon:  0.04953625663766238
Episode:  4  Time-Score:  77  Epsilon:  0.04953625663766238
Episode:  5  Time-Score:  83  Epsilon:  0.04953625663766238
Episode:  6  Time-Score:  97  Epsilon:  0.04953625663766238
Episode:  7  Time-Score:  85  Epsilon:  0.04953625663766238
Episode:  8  Time-Score:  90  Epsilon:  0.04953625663766238
Episode:  9  Time-Score:  145  Epsilon:  0.04953625663766238
Episode:  10  Time-Score:  99  Epsilon:  0.04953625663766238
Episode:  11  Time-Score:  179  Epsilon:  0.04953625663766238
Episode:  12  Time-Score:  82  Epsilon:  0.04953625663766238
Episode:  13  Time-Score:  93  Epsilon:  0.04953625663766238
Episode:  14  Time-Score:  283  Epsilon:  0.04953625663766238
Episode:  15  Time-Score:  71  Epsilon:  0.04953625663766238
Episode:  16  Time-Score:  204