In [None]:
import itertools
import sys

import gym
import numpy as np

def get_pos_vel_from_state(state):
    """
    Discretise a continuous (position, velocity) observation into Q-table indices.
    Args:
    state: sequence whose first element is the position and whose second
           element is the velocity
    Returns:
    (position_index, velocity_index) as plain Python ints, suitable for
    indexing a numpy array
    """
    nps = np.asarray(state)
    # position is scaled by 10 and shifted by 12, velocity is scaled by 100 and
    # shifted by 7, so the most negative expected values land on index 0.
    # NOTE(review): for MountainCar-v0's documented bounds (position in
    # [-1.2, 0.6], velocity in [-0.07, 0.07]) this gives 0..18 and 0..14.
    # Round to the nearest cell and convert to int: float values cannot be
    # used as array indices, and plain truncation would mis-bin values that
    # sit a rounding error below a whole number.
    position = int(np.rint(nps[0] * 10 + 12))
    velocity = int(np.rint(nps[1] * 100 + 7))
    return position, velocity
    

def q_learning(env, num_episodes, alpha=0.85, discount_factor=0.99, Qarg=None):
    """
    Q-learning algorithm, off-policy TD control, on a tabular Q function.

    Actions are chosen epsilon-greedily. Epsilon starts at 1.0 and is reduced
    by 1/num_episodes after each episode for as long as it is above 0.1,
    i.e. high randomness at the beginning and low towards the end.

    Args:
    env: environment to solve; it is used through reset(), step(action)
         returning (next_state, reward, done, info), and action_space.sample()
    num_episodes: number of episodes to learn from
    alpha: learning rate
    discount_factor: weight/importance given to future rewards
    Qarg: Q table to train, indexed as Q[position, velocity, action].
          It is updated in place. Must be supplied.
    Returns:
    The trained Q table (the same object that was passed as Qarg)
    Raises:
    ValueError: if Qarg is not supplied
    """
    if Qarg is None:
        raise ValueError("Qarg (the Q table to train) must be provided")

    Q = Qarg
    epsilon = 1.0

    for _ in range(num_episodes):
        # reset the env and discretise the starting observation
        position, velocity = get_pos_vel_from_state(env.reset())

        while True:
            # with probability epsilon explore, otherwise exploit
            if np.random.random() <= epsilon:
                # select a random action from the set of all actions
                action = env.action_space.sample()
            else:
                # select the action with the highest value for the current state
                action = np.argmax(Q[position, velocity, :])

            # apply the selected action, collect next state and reward
            next_state, reward, done, _info = env.step(action)
            # keep the next state's indices separate: the update below must
            # write to the cell of the state the action was taken FROM
            next_position, next_velocity = get_pos_vel_from_state(next_state)

            # Q-learning target: reward plus discounted best value of next state
            Q_target = reward + discount_factor * np.max(Q[next_position, next_velocity, :])
            # difference/error between the target and the current estimate
            Q_delta = Q_target - Q[position, velocity, action]
            # move the current estimate towards the target by the learning rate
            Q[position, velocity, action] += alpha * Q_delta

            # stop at the end of this episode
            if done:
                break
            # the next state becomes the current state for the next iteration
            position, velocity = next_position, next_velocity

        # gradually decay epsilon
        if epsilon > 0.1:
            epsilon -= 1.0 / num_episodes

    return Q




In [None]:

def test_algorithm(env, Q):
    """
    Play one episode greedily with respect to a Q table and report its return.
    Args:
    env: environment in which the Q table is evaluated
    Q: Q table to evaluate, indexed as Q[position, velocity, action]
    Returns:
    Total reward collected over the single episode
    """
    total_reward = 0
    observation = env.reset()
    done = False

    while not done:
        pos_idx, vel_idx = get_pos_vel_from_state(observation)
        # greedy choice: the action with the largest value in this cell
        best_action = np.argmax(Q[pos_idx, vel_idx, :])
        observation, reward, done, _ = env.step(best_action)
        # draw the environment after the step that was just taken
        env.render()
        # accumulate the episode's reward
        total_reward += reward

    print(total_reward)
    return total_reward

In [None]:
env = gym.make('MountainCar-v0')
# The table is indexed as Q[position, velocity, action], so the action axis
# goes last. The discretisation in get_pos_vel_from_state is expected to give
# position indices 0..18 and velocity indices 0..14 for this environment
# (NOTE(review): based on MountainCar-v0's documented observation bounds).
Q = np.zeros((19, 15, env.action_space.n))
# NOTE(review): num_episodes is not defined in the visible cells; make sure it
# is set in a cell above before this one runs.
# Qarg is passed by keyword: a positional argument may not follow keyword ones.
Q = q_learning(env, num_episodes, alpha=0.85, discount_factor=0.99, Qarg=Q)

