In [23]:
%matplotlib inline
import gym
import itertools
import matplotlib
import numpy as np
import pandas as pd
import sys


if "../" not in sys.path:
    sys.path.append("../")
    
from collections import defaultdict
from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

matplotlib.style.use('ggplot')

In [24]:
env = CliffWalkingEnv()

In [25]:
def make_epsilon_greedy_policy(Q, epsilon, nA):
    """
    create an epsilon greedy policy based on given Q function and function
    
    """
    
    def policy_fn(observation):
        A = np.ones(nA, dtype=float)*epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [26]:
def q_learning(env, num_episodes, discount_factor = 1.0, alpha=0.5, epsilon=0.1):
    """
    Q learning algorithm: Off policy TD control. find the optimal greedy policy
    while following an epsilon greedy policy
    Args: 
        env: OpenAI environment.
        num_episodes = number of episode to run for.
        discount_factor = Gamma is a discount factor.
        alpha = TD learning rate
        epsilon = chance to sample random action (exploration rate) float between 0 and 1.
        
    Return:
        A tuple (Q, episode_length)
        Q is the optimal value function, a dictionary mapping state-> action values.
        state is an Episodestate object with two numpy arrays for episode_length and episode_reward.
    """
    #the final action-value function
    # a nested dictionary that maps state -> action ->action_value.
    
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    
    #keep track of useful statistic
   # stats = plottingEpisodeStats(
    #episode_lengths = np.zeros(num_episodes),
    #episode_rewards = np.zeros(num_episodes))
    
    
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes)) 
    #the policy we are following.
    policy = make_epsilon_greedy_policy(Q, epsilon, env.action_space.n)
    
    for i_episode in range(num_episodes):
        #print out which episode is on, useful for debugging
        if(i_episode + 1) % 100 == 0:
            print("\rEpisode {}/{}.".format(i_episode + 1, num_episodes), end="")
            sys.stdout.flush()
        #reset the environment and pick the action
        state = env.reset()
        
        # one step in the environment
        #total reward = 0.0
        for t in itertools.count():
            #take a step
            action_probs = policy(state)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            next_state, reward, done, _ = env.step(action)
            
            #update statistic
            stats.episode_rewards[i_episode] += reward
            stats.episode_length[i_episode] = t
            
            
            #TD update
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount_factor * Q[next_state][best_next_action]
            td_delta = td_target - Q[state][action]
            Q[state][action] += alpha*td_delta
            
            if done:
                break
                
            state = next_state
            
    return Q, stats

In [27]:
Q, states = q_learning(env, 500)

AttributeError: module 'lib.plotting' has no attribute 'EpisodeStates'