In [None]:
import gym
import numpy as np
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import statistics as st

In [None]:
### e-greedy action selection
def select_action(env,Qtable,state,eps):
    """Choose an action with an epsilon-greedy rule.

    A single uniform draw from [0, 1] is compared against ``eps``.

    Args:
        env: Object exposing ``action_space.sample()``.
        Qtable: Array of action values, indexed by ``state``.
        state: Index (e.g. a tuple of bin indices) selecting one row of
            action values from ``Qtable``.
        eps: Exploration threshold; the greedy action is taken only when
            the draw is strictly greater than this value.

    Returns:
        ``np.argmax(Qtable[state])`` when exploiting, otherwise the result
        of ``env.action_space.sample()``.
    """
    draw = random.uniform(0, 1)
    exploit = draw > eps
    if not exploit:
        # Explore: defer to the environment's own action sampler.
        return env.action_space.sample()
    # Exploit: index of the highest-valued action for this state.
    best_action = np.argmax(Qtable[state])
    return best_action
    
### modify rewards
def get_reward(state):
    """Compute the custom reward from the first component of ``state``.

    Args:
        state: Indexable observation; only ``state[0]`` is read.

    Returns:
        ``10`` when ``state[0] >= 0.5``;
        ``-(1 + state[0]) ** 2`` when ``state[0]`` is above -0.4 or below -0.8;
        ``-1`` otherwise (i.e. for values in ``[-0.8, -0.4]``).
    """
    position = state[0]
    if position >= 0.5:
        return 10
    # Outside the [-0.8, -0.4] band the penalty is a quadratic in the position.
    outside_band = position > -0.4 or position < -0.8
    return -((1 + position) ** 2) if outside_band else -1

def get_discrete_state(env,state):
    """Convert a continuous observation into a tuple of bin indices.

    Every observation dimension is divided into 20 equal-width bins spanning
    ``env.observation_space.low`` to ``env.observation_space.high``.

    Args:
        env: Object exposing ``observation_space.low`` and
            ``observation_space.high`` as arrays of equal length.
        state: Array-like observation with one entry per dimension.

    Returns:
        Tuple of integer bin indices, one per dimension, each clamped to
        ``[0, 19]`` so it is always a valid index into a 20-bin table axis.
    """
    low = env.observation_space.low
    high = env.observation_space.high
    DISCRETE_OS_SIZE = [20] * len(high)
    discrete_os_win_size = (high - low) / DISCRETE_OS_SIZE
    discrete_state = ((state - low) / discrete_os_win_size).astype(int)
    # An observation exactly on the upper bound would otherwise map to bin 20
    # (out of range), and one below the lower bound to a negative index that
    # silently wraps around; clamp both into the valid range.
    discrete_state = np.clip(discrete_state, 0, np.array(DISCRETE_OS_SIZE) - 1)
    return tuple(discrete_state)


In [None]:
def q_learning(env,episodes,gamma=0.95,lr=0.001,eps=1,eps_min=0.01,eps_decay=0.9,max_steps=2000):
    """Train a tabular Q-learning agent on a discretized environment.

    The observation space is discretized into 20 bins per dimension (via
    ``get_discrete_state``) and actions are chosen epsilon-greedily (via
    ``select_action``).

    Args:
        env: Environment whose ``reset()`` returns a sequence with the
            observation first and whose ``step(action)`` returns a 5-tuple
            with the observation, reward and a done flag in the first three
            positions.
        episodes: Number of episodes to run.
        gamma: Discount factor applied to the best next-state value.
        lr: Learning rate of the Q-value update.
        eps: Initial exploration rate.
        eps_min: Lower bound for the exploration rate.
        eps_decay: Factor applied to the exploration rate after each episode
            that ends with the done flag set.
        max_steps: Maximum number of steps per episode.

    Returns:
        Tuple ``(rewards, successes)``: ``rewards`` lists the total score of
        each episode that ended with the done flag set, in order;
        ``successes`` maps the index of each such episode to its score.
    """
    ### init Qtable
    DISCRETE_OS_SIZE = [20] * len(env.observation_space.high)
    Qtable = np.zeros((DISCRETE_OS_SIZE + [env.action_space.n]))

    rewards = []
    successes = {}
    for e in range(episodes):
        state = get_discrete_state(env,env.reset()[0])

        score = 0
        for _step in range(max_steps):
            action = select_action(env,Qtable,state,eps)
            # NOTE(review): the fourth value returned by step() (the truncation
            # flag in the Gym >= 0.26 API) is ignored, so the loop keeps
            # stepping until the done flag or max_steps -- confirm this is
            # intended.
            new_state, reward, done, _truncated, _info = env.step(action)
            #reward = get_reward(new_state)  ### uncomment for custom reward function

            new_state = get_discrete_state(env,new_state)

            score += reward
            if done:
                print(f'Episode {e}/{episodes}, goal reached,score:{score} ')
                # Terminal transition: there is no future value to bootstrap from.
                Qtable[state + (action, )] = reward
                rewards.append(score)
                successes[e] = score
                ### minimize exploration after success
                eps = max(eps * eps_decay, eps_min)
                break

            ## update table
            max_future_q = np.max(Qtable[new_state])
            current_q = Qtable[state + (action, )]
            new_q = (1 - lr) * current_q + lr * (reward + gamma * max_future_q)
            Qtable[state + (action, )] = new_q

            # Advance to the next state; without this every action choice and
            # Q update would keep using the episode's initial state.
            state = new_state
        if e % 100 == 0:
            print(f'Episode {e}/{episodes}')

    return rewards, successes

In [None]:
## create mountain car environment
# NOTE(review): `pickle` is not used in any of the visible cells; presumably a
# later cell saves results with it -- confirm, and consider moving the import
# to the notebook's first import cell.
import pickle
# NOTE(review): render_mode='human' presumably draws every step on screen,
# which would slow training considerably -- confirm it is wanted here.
env = gym.make('MountainCar-v0',render_mode='human')

# Number of training episodes passed to q_learning.
episodes = 1000

# Run tabular Q-learning; q_learning returns a (rewards, successes) pair.
q_rewards,q_succ = q_learning(env,episodes)
