In [1]:
import gym
import numpy as np
import math
from sklearn.preprocessing import KBinsDiscretizer
import random

In [2]:
# Set up the environment: create the Gym CartPole-v0 task that the agent is
# initialised against and trained on in the cells below.
env = gym.make('CartPole-v0')

In [3]:
class CartPoleAgent():
    """Tabular Q-learning agent for CartPole.

    The continuous observation is reduced to two features, pole angle and pole
    angular velocity, each mapped to a fixed number of uniform-width bins. The
    Q-table is indexed by ``(angle_bin, velocity_bin, action)``.

    Call :meth:`initQTable` with the environment before using the agent; it
    allocates the table and sets the discretisation bounds.
    """

    def __init__(self, min_lr=0.1, discount=0.95, epsilon=0.1, n_bins = ( 6 , 12 )):
        """Store the hyper-parameters.

        Args:
            min_lr: Floor of the decaying learning rate (alpha).
            discount: Discount factor (gamma) applied to the bootstrapped value.
            epsilon: Floor of the decaying exploration probability.
            n_bins: Number of bins for (pole angle, pole velocity).
        """
        self.min_lr = min_lr # alpha
        self.discount = discount # gamma
        self.epsilon = epsilon # exploration probability
        self.lower_bounds = []
        self.upper_bounds = []
        # Coerced to a tuple so it can be concatenated with the action count
        # in initQTable even when the caller passes a list.
        self.n_bins = tuple(n_bins)
        # Fitted discretiser and the (bounds, bins) it was fitted for; filled
        # in lazily by discretize_state.
        self._discretizer = None
        self._discretizer_key = None

    def initQTable(self, env):
        """Allocate a zero Q-table and set the discretisation bounds from ``env``.

        The angle bounds come from index 2 of the environment's observation
        space; the angular-velocity bounds are fixed manually at +/- radians(50).
        """
        self.qTable = np.zeros(self.n_bins + (env.action_space.n,))
        self.lower_bounds = [ env.observation_space.low[2], -math.radians(50) ]
        self.upper_bounds = [ env.observation_space.high[2], math.radians(50) ]

    def choose_action(self, state, env, n):
        """Pick an action epsilon-greedily for discretised ``state`` at episode ``n``.

        With probability ``exploration_rate(n)`` a random action is sampled from
        ``env.action_space``; otherwise the action with the highest Q-value is
        returned as a plain ``int``.
        """
        if np.random.random() < self.exploration_rate(n):
            return env.action_space.sample()
        return int(np.argmax(self.qTable[state]))

    def updateQ(self, state, action, reward, new_state, lr, terminal=False):
        """Apply one Q-learning update for the transition (state, action) -> new_state.

        Args:
            state: Discretised state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            new_state: Discretised state reached.
            lr: Learning rate (alpha) for this update.
            terminal: Set to True when ``new_state`` ends the episode; the
                target is then just ``reward``, with no bootstrapped future
                value. Defaults to False, which always bootstraps.
        """
        future_optimal_value = 0.0 if terminal else np.max(self.qTable[new_state])
        learned_value = reward + self.discount * future_optimal_value
        old_value = self.qTable[state][action]
        self.qTable[state][action] = (1-lr)*old_value + lr*learned_value

    def learning_rate(self,n):
        """Return the learning rate for episode ``n``.

        Starts at 1.0, decays logarithmically with ``n`` and never drops below
        ``min_lr``.
        """
        return max(self.min_lr, min(1.0, 1.0 - math.log10((n + 1) / 25)))

    def exploration_rate(self, n):
        """Return the exploration probability for episode ``n``.

        Starts at 1, decays logarithmically with ``n`` and never drops below
        ``epsilon``.
        """
        return max(self.epsilon, min(1, 1.0 - math.log10((n  + 1) / 25)))

    def discretize_state(self, cart_position , cart_velocity , pole_angle, pole_velocity ):
        """Map a raw observation to an ``(angle_bin, velocity_bin)`` tuple of ints.

        ``cart_position`` and ``cart_velocity`` are accepted so the observation
        can be unpacked straight into this call, but they are not used.
        """
        # Fit the discretiser once per (bounds, bins) configuration instead of
        # on every step; it is refitted automatically if either changes.
        key = (tuple(self.lower_bounds), tuple(self.upper_bounds), self.n_bins)
        if self._discretizer is None or self._discretizer_key != key:
            est = KBinsDiscretizer(n_bins=self.n_bins, encode='ordinal', strategy='uniform')
            est.fit([self.lower_bounds, self.upper_bounds ])
            self._discretizer = est
            self._discretizer_key = key
        bins = self._discretizer.transform([[pole_angle, pole_velocity]])[0]
        return tuple(map(int, bins))

In [None]:
# Train a tabular Q-learning agent on the CartPole environment created above.
#
# NOTE: this cell relies on the classic Gym API, where env.reset() returns the
# observation alone and env.step() returns (observation, reward, done, info).
N_EPISODES = 1000     # total training episodes
MAX_STEPS = 100       # per-episode step cap
LOG_EVERY = 25        # print the episode index every LOG_EVERY episodes
RENDER_AFTER = 250    # render only episodes with an index above this

agent = CartPoleAgent(min_lr=0.005, discount=1.0, epsilon=0.1)
agent.initQTable(env)

try:
    for i_episode in range(N_EPISODES):
        current_state = agent.discretize_state(*env.reset())

        if i_episode % LOG_EVERY == 0:
            print(i_episode)

        # The learning rate depends only on the episode index, so compute it
        # once per episode rather than once per step.
        lr = agent.learning_rate(i_episode)

        for t in range(MAX_STEPS):
            if i_episode > RENDER_AFTER:
                env.render()

            action = agent.choose_action(current_state, env, i_episode)
            observation, reward, done, info = env.step(action)

            # Convert the continuous observation to its discrete state
            # (*observation unpacks the four components as arguments).
            new_state = agent.discretize_state(*observation)

            # Update the Q-table with the observed transition.
            agent.updateQ(current_state, action, reward, new_state, lr)

            # The new state becomes the current state for the next step.
            current_state = new_state

            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
    print("Finished")
finally:
    # Always release the environment (and any render window), even if the
    # run is interrupted or raises part-way through.
    env.close()

0
Episode finished after 22 timesteps
Episode finished after 17 timesteps
Episode finished after 31 timesteps
Episode finished after 33 timesteps
Episode finished after 35 timesteps
Episode finished after 12 timesteps
Episode finished after 37 timesteps
Episode finished after 17 timesteps
Episode finished after 11 timesteps
Episode finished after 15 timesteps
Episode finished after 22 timesteps
Episode finished after 13 timesteps
Episode finished after 14 timesteps
Episode finished after 16 timesteps
Episode finished after 22 timesteps
Episode finished after 18 timesteps
Episode finished after 45 timesteps
Episode finished after 39 timesteps
Episode finished after 19 timesteps
Episode finished after 32 timesteps
Episode finished after 31 timesteps
Episode finished after 16 timesteps
Episode finished after 27 timesteps
Episode finished after 18 timesteps
Episode finished after 30 timesteps
25
Episode finished after 12 timesteps
