In [8]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
class QLearning():
    """
    Tabular Q-learning agent for openai-gym's FrozenLake environments
    (FrozenLake8x8 by default).

    The map is a grid of tiles. The 4x4 layout below only illustrates the tile
    types; the default environment uses an 8x8 grid.

    SFFF       (S: starting point, safe)
    FHFH       (F: frozen surface, safe)
    FFFH       (H: hole, fall to your doom)
    HFFG       (G: goal, where the frisbee is located)

    The agent starts at S and must traverse this icy field until it lands on
    tile G. If it lands on H or if it takes 200 steps, the agent loses.
    """

    def __init__(self, learning_rate, exploitation_rate, discount_rate,
                 env_name='FrozenLake8x8-v0'):
        """
        Initializes hyper parameters, the environment and an all-zero Q-table

        @param learning_rate: FLOAT between 0 and 1 representing the size of a learning step
        @param exploitation_rate: FLOAT between 0 and 1 representing what fraction
            of the time we pick our current best action rather than a random
            action while exploring.
        @param discount_rate: FLOAT between 0 and 1 indicating our lookahead discount rate
        @param env_name: STRING gym environment id to create. The environment
            must expose discrete observation and action spaces (both are read
            through their `.n` attribute). Defaults to 'FrozenLake8x8-v0'.
        """
        self.alpha = learning_rate
        self.epsilon = exploitation_rate
        self.beta = discount_rate
        self.env = gym.make(env_name)

        # 2-D Matrix where rows are states and columns represent actions
        self.qtable = np.zeros((self.env.observation_space.n, self.env.action_space.n))
        self.actions = range(self.env.action_space.n)

    def policy(self, state, exploration=True):
        """
        Maps states to actions

        @param state: INTEGER representing agent's current state
        @param exploration: BOOLEAN; when True a uniformly random action is
            taken whenever a uniform draw exceeds the exploitation rate. When
            False the greedy action is always returned.
        @return: INTEGER representing chosen action given some state
        """
        if exploration and np.random.uniform() > self.epsilon:
            return np.random.choice(self.actions)

        # Greedy choice; ties for the maximum Q-value are broken at random so
        # the agent is not biased towards low action indices (notably while
        # the table is still all zeros).
        action_values = self.qtable[state]
        best_actions = np.flatnonzero(action_values == action_values.max())
        return np.random.choice(best_actions)

    def updateQtable(self, state, action, reward, next_state):
        """
        Updates a qtable element. Follows the formula:
        Q(s, a) = Q(s, a) + alpha * [Reward(s, a) + beta * max(Q(s', a') for all a') - Q(s, a)]

        @param state: INTEGER state the action was taken from
        @param action: INTEGER action that was taken
        @param reward: NUMBER (already modified) reward received for the step
        @param next_state: INTEGER state reached after taking the action
        """
        td_target = reward + self.beta * np.max(self.qtable[next_state])
        self.qtable[state, action] += self.alpha * (td_target - self.qtable[state, action])

    def train(self, epochs):
        """
        Train RL Agent

        Runs `epochs` training episodes and prints a progress line each time
        another fifth of them has completed. The last line is always 5/5 and
        nothing is printed when `epochs` is 0.

        @params epochs: INTEGER representing number of training episodes
        """
        progress_steps = 5
        reported = 0
        for epoch in range(epochs):
            self.sample(update_qtable=True)
            completed = epoch + 1
            # Integer arithmetic keeps the check exact; the while loop lets
            # several fifths be reported at once when epochs < 5.
            while (reported < progress_steps
                   and completed * progress_steps >= (reported + 1) * epochs):
                reported += 1
                print("\nFinished {0}/5 epochs".format(reported))

    @staticmethod
    def reward_modifier(reward, done):
        """
        Reshapes the environment reward. Uses the reward and done params to
        determine whether the agent moved, whether they died, and whether they
        won, and adjusts the reward accordingly. Without this function, reward
        will be 1 on win and 0 otherwise. We want to penalize excess moves and
        dying, and give a big reward for winning.

        @param reward: NUMBER representing reward from some state, action
        @param done: BOOLEAN representing whether episode has ended
        @returns: INTEGER representing modified reward
            (-50 on a loss, 100 on reaching the goal, -1 for any other step)
        """
        if reward == 0 and done:
            # Died or time ran out
            reward = -50

        elif reward == 1:
            # Got to goal
            reward = 100

        else:
            # Moved
            reward = -1

        return reward

    def sample(self, update_qtable=False):
        """
        Undergoes one episode

        @param update_qtable: BOOLEAN indicating whether this episode
        will update the qtable. When True the agent explores and learns
        silently; when False it acts greedily and every step is rendered.
        """
        next_state = self.env.reset()

        done = False
        if not update_qtable:
            self.env.render()

        while not done:
            state = next_state
            # Exploration is only enabled for training episodes.
            action = self.policy(state, update_qtable)

            next_state, reward, done, _ = self.env.step(action)

            if not update_qtable:
                self.env.render()
            else:
                reward = self.reward_modifier(reward, done)
                self.updateQtable(state, action, reward, next_state)

In [13]:
# Small learning step, greedy 90% of the time while training, no discounting.
agent = QLearning(learning_rate=0.05, exploitation_rate=0.90, discount_rate=1)

In [16]:
# Run 2000 training episodes, then dump the learned Q-table
# (one row per state, one column per action).
agent.train(epochs=2000)
print(agent.qtable)


Finished 1/5 epochs

Finished 2/5 epochs

Finished 3/5 epochs

Finished 4/5 epochs

Finished 5/5 epochs
[[-15.65221795 -14.35948371 -15.57806714 -12.42923674]
 [-16.26150812 -13.82934677  -8.33981511 -12.92893034]
 [-11.99347853  -8.85168665  -4.16848436 -11.28244503]
 [-12.62210584  -5.92256143   1.51076216  -3.66909698]
 [ -7.06802357  -1.49990518   6.96426339  -3.95998231]
 [ -1.43022689   2.63238419  15.96083073   7.12889433]
 [ 13.11209622  10.9535935   20.26335426  11.87918505]
 [ 11.77296703  22.13067259  11.06289103  10.68365927]
 [-22.49228773 -26.68306349 -23.8633351  -12.97586998]
 [-22.95304526 -21.82779511 -18.31707691 -11.03424504]
 [-22.78357245 -18.71508176 -16.68561848  -6.69268082]
 [-23.17805731 -24.52228097 -19.23437659  -1.26890976]
 [ -8.74154043  -7.01129576   4.35657713  -5.12141168]
 [ -2.17915146   0.9728268   16.45321894  -1.46418847]
 [ 14.88952872  18.21152942  24.84218738  15.25655012]
 [ 20.68299642  29.28834302  17.46450035  16.97049079]
 [-31.40424638 

In [17]:
# Play one rendered episode with the learned policy; the Q-table is not updated.
agent.sample(update_qtable=False)


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Up)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFF[41mF[0mF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFF[41mF[0m
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SF