In [6]:
import gym

In [7]:
# Build the CartPole-v0 environment; the resulting `env` object is reused by the cells below.
env = gym.make("CartPole-v0")

In [8]:
# Smoke test: drive the environment with random actions for 500 rendered frames.
env.reset()
for t in range(500):
    random_action = env.action_space.sample()
    # step() returns (observation, reward, done, info); index 2 is the done flag.
    episode_over = env.step(random_action)[2]
    env.render()
    if episode_over:
        # gym requires a reset() once an episode has finished; stepping further
        # gives undefined results, so start a fresh episode and keep going.
        env.reset()
env.close()



In [9]:
# Play a couple of short random-action episodes and report how long each lasted.
n_demo_episodes = 2
for e in range(n_demo_episodes):
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation, reward, done, other_info = env.step(action)

        if done:
            # Report the real episode count rather than a hard-coded total.
            print("Game Episode : {}/{} High Score : {}".format(e,n_demo_episodes,t))
            break
# Close the render window once, after all demo episodes have been played.
env.close()

Game Episode : 0/20 High Score : 20
Game Episode : 1/20 High Score : 21


In [18]:
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

In [19]:
class Agent:
    """Deep Q-learning agent with replay memory and an epsilon-greedy policy.

    A small fully connected Keras network approximates the Q-function: it maps
    one state to one Q-value per action. States are expected as a batch of one
    (shape ``(1, state_size)``) because only row 0 of each prediction is used.
    """

    def __init__(self,state_size,action_size):
        """Store the hyper-parameters and build the Q-network.

        Args:
            state_size: Width of the network input (features per state).
            action_size: Width of the network output (number of discrete actions).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000) # replay buffer; oldest entries are dropped when full
        self.gamma = 0.95 #Discount Factor
        self.epsilon = 1.0 # Exploration Rate: probability of acting randomly
        self.epsilon_decay = 0.995 # multiplicative decay applied after each train() call
        self.epsilon_min = 0.01 # epsilon stops decaying once it is at or below this value
        self.learning_rate = 0.001
        self.model = self._create_model()

    def _create_model(self):
        """Build and compile the network that approximates the Q-value function.

        Returns:
            A compiled ``Sequential`` model with two 24-unit ReLU hidden layers
            and a linear output of ``action_size`` units, trained with MSE loss.
        """
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu')) #1st Hidden Layer
        model.add(Dense(24,activation='relu')) #2nd Hidden Layer
        model.add(Dense(self.action_size,activation='linear'))
        # Use the configured learning rate (previously a hard-coded literal) and
        # the `learning_rate` keyword; `lr` is a deprecated alias.
        model.compile(loss='mse',optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self,state,action,reward,next_state,done):
        """Append one ``(state, action, reward, next_state, done)`` transition to memory."""
        self.memory.append((state,action,reward,next_state,done))

    def act(self,state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Current state as a batch of one, passed straight to the model.

        Returns:
            A random action index with probability ``epsilon``; otherwise the
            index of the largest predicted Q-value for ``state``.
        """
        # Exploration vs Exploitation
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state,verbose=0)
        return np.argmax(act_values[0])

    def train(self,batch_size=32):
        """Fit the Q-network on a random minibatch sampled from replay memory.

        For each sampled transition the target for the taken action is the
        reward, plus the discounted best predicted Q-value of the next state
        when the transition is not terminal. Epsilon is decayed once per call.

        Args:
            batch_size: Number of transitions to sample. If memory holds fewer
                than this, the call does nothing.
        """
        if len(self.memory) < batch_size:
            # random.sample raises ValueError when asked for more items than exist.
            return

        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            if done:
                # Terminal transition: there is no future return to bootstrap from.
                target = reward
            else:
                target = reward + self.gamma*np.amax(self.model.predict(next_state,verbose=0)[0])
            # Start from the current predictions so only the taken action's
            # Q-value contributes to the loss.
            target_f = self.model.predict(state,verbose=0)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self,name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self,name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)
    

In [24]:
# Training configuration.
n_episodes = 1000               # number of training episodes to run
output_dir = "Cartpole_model/"  # directory that receives the periodic weight files
# Nothing else in the visible notebook creates this directory, and saving
# weights into a missing directory fails, so create it up front (no-op if present).
os.makedirs(output_dir, exist_ok=True)

In [25]:
done = False
# Read the problem dimensions from the environment instead of hard-coding them,
# so the same cell works if the environment is swapped.
state_size = env.observation_space.shape[0]  # 4 for CartPole
action_size = env.action_space.n             # 2 for CartPole
batch_size = 32  # transitions sampled from replay memory per train() call
agent = Agent(state_size,action_size)


In [27]:
# Train the agent: play n_episodes episodes, storing every transition in replay
# memory and fitting the network on a sampled minibatch after each step.
agent = Agent(state_size,action_size) #initialize Agent

for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1,state_size]) # the model expects a batch of one

    for time in range(5000):
        env.render()
        action = agent.act(state) #action is 0 or 1
        next_state, reward, done, other_info = env.step(action)
        # gym's TimeLimit wrapper also sets done=True when the step cap is
        # reached and flags that case in the info dict. Hitting the cap means the
        # pole was still up, so it must not be penalised or stored as terminal.
        truncated = other_info.get("TimeLimit.truncated", False)
        failed = done and not truncated
        reward = reward if not failed else -10
        next_state = np.reshape(next_state, [1,state_size])
        agent.remember(state,action,reward,next_state,failed)
        state = next_state

        if done:
            print("Game Episode : {}/{}, High Score : {}, Exploration Rate : {:.2}".format(e, n_episodes, time, agent.epsilon))
            break

        # Only train once memory holds more than one minibatch of transitions.
        if len(agent.memory)>batch_size:
            agent.train(batch_size)

    # Save the weights every 50 episodes.
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")

env.close()

Game Episode : 0/1000, High Score : 43, Exploration Rate : 0.95
Game Episode : 1/1000, High Score : 8, Exploration Rate : 0.91
Game Episode : 2/1000, High Score : 22, Exploration Rate : 0.81
Game Episode : 3/1000, High Score : 12, Exploration Rate : 0.77
Game Episode : 4/1000, High Score : 13, Exploration Rate : 0.72
Game Episode : 5/1000, High Score : 12, Exploration Rate : 0.68
Game Episode : 6/1000, High Score : 26, Exploration Rate : 0.59
Game Episode : 7/1000, High Score : 10, Exploration Rate : 0.56
Game Episode : 8/1000, High Score : 11, Exploration Rate : 0.53
Game Episode : 9/1000, High Score : 11, Exploration Rate : 0.51
Game Episode : 10/1000, High Score : 9, Exploration Rate : 0.48
Game Episode : 11/1000, High Score : 11, Exploration Rate : 0.46
Game Episode : 12/1000, High Score : 14, Exploration Rate : 0.43
Game Episode : 13/1000, High Score : 16, Exploration Rate : 0.39
Game Episode : 14/1000, High Score : 9, Exploration Rate : 0.38
Game Episode : 15/1000, High Score : 9

KeyboardInterrupt: 