In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

import gym
import random
import numpy as np

In [2]:
class Agent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size # 4
        self.action_size = action_size # 2
        self.memory = [] # to save our prev. actions
        self.gamma = 0.95 # discount rate
        self.epsilon = 1.0 # exploration rate
        self.epsilon_lb = 0.05 # lower bound
        self.epsilon_dc = 0.99 # decay
        self.lr = 0.001 # lr for the dnn
        self.model = self.build_model()
       
    # DNN for our Q-Learning Model
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size))
        model.add(Activation("relu"))
        model.add(Dense(12))
        model.add(Activation("relu"))
        model.add(Dense(self.action_size))
        model.compile(loss="mse", optimizer=Adam(lr=self.lr))
        return model
    
    # We save the prev experiences for later re-train
    def remember(self, state, action, reward, next_state, finished):
        self.memory.append([state, action, reward, next_state, finished])
       
    # Get action by DNN or by Random sampling
    def get_action(self, state):
        # Select random action
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        # Compute action based on our DNN
        action = self.model.predict(state)
        return np.argmax(action[0])
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size) 
        for state, action, reward, next_state, finished in minibatch:
            target = reward
            # Predict the future discounted reward
            if not finished:
                target = (reward + self.gamma * np.max(self.model.predict(next_state)[0]))
            # Train the agent to approx the current state to futute rewards
            target_future = self.model.predict(state)
            target_future[0][action] = target
            self.model.fit(state, target_future, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_lb:
            self.epsilon = self.epsilon * self.epsilon_dc

In [3]:
episodes = 1000
env = gym.make("CartPole-v1")
state_size = 4
action_size = 2
agent = Agent(state_size, action_size)
finished = False
batch_size = 16

In [4]:
for episode in range(episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        # Get random/dnn action
        action = agent.get_action(state)
        next_state, reward, finished, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        if finished == True:
            reward = -1
        # Save the episode
        agent.remember(state, action, reward, next_state, finished)
        state = next_state
        if finished == True:
            print("Episode: ", episode, " Time-Score: ", time, " Epsilon: ", agent.epsilon)
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)      

Episode:  0  Time-Score:  13  Epsilon:  1.0


InternalError:  Blas GEMM launch failed : a.shape=(1, 4), b.shape=(4, 24), m=1, n=24, k=4
	 [[node sequential/dense/MatMul (defined at <ipython-input-2-79f52661199c>:43) ]] [Op:__inference_predict_function_195]

Function call stack:
predict_function
