In [1]:
import gym
import random
import numpy as np
import time
from IPython.display import clear_output
env = gym.make('FrozenLake-v0')

In [2]:
class Agent():
    def __init__(self, env):
        self.action_size = env.action_space.n
        
    def random_action(self, state):
        action = random.choice(range(self.action_size))
        return action

Q Update: $Q(s_{t}, a_{t}) = r_{t+1} + \gamma max(Qs_{t+1})$ <br>
Q Learn: $Q(s,a) \leftarrow Q(s,a) + \alpha(target - Q(s,a)) $

In [3]:
class QAgent(Agent):
    def __init__(self, env, discount_rate=0.97, learning_rate=0.65):
        super().__init__(env)
        self.state_size = env.observation_space.n
        self.exploration_rate = .99
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()
        
    def build_model(self):  
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])
     
    def get_action(self, state):
        possible_actions = self.q_table[state] 
        exploit = np.argmax(possible_actions)  
        explore = super().random_action(state)
        return explore if random.random() < self.exploration_rate else exploit
    
    def train(self, experience):
        state, action, next_state, reward, done = experience
        q_next = np.zeros([self.action_size]) if done else self.q_table[next_state]
        
        q_target = reward + self.discount_rate * np.max(q_next)
        q_update = q_target - self.q_table[state, action] 
        self.q_table[state, action] = self.q_table[state, action] + self.learning_rate * (q_target - self.q_table[state, action])
        
        if done:
            self.exploration_rate = self.exploration_rate * .99

total_reward = 0
agent = QAgent(env)

In [7]:
for episode in range(100):
    state = env.reset()
    done = False
    
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward
        
        print(f"State: {state}, Action: {action}")
        print(f"Episode:{episode}, Total reward:{total_reward}, eps: {agent.exploration_rate}")
        env.render()
        print(agent.q_table)
        time.sleep(0.05)
        clear_output(wait=True)

State: 15, Action: 3
Episode:99, Total reward:66.0, eps: 0.017771047742294682
  (Up)
SFFF
FHFH
FFFH
HFF[41mG[0m
[[1.08587747e-01 1.04904330e-01 1.26793140e-01 9.89639228e-02]
 [2.03765227e-03 2.92810547e-02 3.36663970e-02 1.08347173e-01]
 [7.26457333e-02 6.48497144e-02 5.58298691e-02 8.56593283e-02]
 [1.72501795e-02 2.92711621e-02 4.91821266e-03 7.00915388e-02]
 [1.83281290e-01 1.33442497e-01 6.06294069e-02 7.52311886e-02]
 [5.03771945e-05 1.18980453e-05 3.96564600e-05 6.93966483e-05]
 [1.03554716e-02 1.73349438e-06 7.00790717e-02 1.26454125e-02]
 [2.71989523e-05 8.12533337e-05 1.47618955e-05 7.55793198e-05]
 [6.37660648e-02 1.20847864e-01 2.98056892e-02 3.25246230e-01]
 [3.70788375e-02 3.90755476e-01 1.96552453e-02 2.33259026e-02]
 [5.64498154e-01 3.93844730e-02 7.51683284e-02 1.73298040e-02]
 [6.63392192e-05 1.50279353e-05 3.48091677e-05 6.19481277e-05]
 [3.88999906e-05 9.52140135e-05 9.61827232e-05 1.06183791e-05]
 [1.87426054e-01 1.76388968e-01 6.95528962e-01 1.56362836e-01]
 [5.