In [20]:
import gym
import random
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output

In [21]:
# Id of the Gym environment used throughout the notebook.
env_name = "FrozenLake-v1"
# Build the environment instance from its registered id.
env = gym.make(env_name)

In [22]:
class Agent():
    """Baseline agent that picks actions uniformly at random.

    Handles two kinds of action space:

    * discrete -- an integer index in ``range(action_size)`` is returned;
    * anything else -- an array is drawn uniformly between the space's
      ``low`` and ``high`` bounds with the space's ``shape``.
    """

    def __init__(self, env):
        """Record the description of ``env``'s action space.

        Args:
            env: Environment exposing ``action_space``. A discrete space must
                provide ``n``; any other space must provide ``low``, ``high``
                and ``shape``.
        """
        # Inspect the *instance* held by the environment. Storing the Discrete
        # class itself would always be truthy, which made the continuous
        # branch below unreachable and crashed on spaces without ``n``.
        self.is_discrete = isinstance(env.action_space,
                                      gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            print("Action size:", self.action_size)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a uniformly random action; ``state`` is ignored.

        Args:
            state: Current observation (unused by the random policy; kept so
                subclasses can override with a state-dependent policy).

        Returns:
            An ``int`` action index for discrete spaces, otherwise a numpy
            array of shape ``action_shape`` sampled between the bounds.
        """
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)
        return action

In [23]:
class QAgent(Agent):
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Keeps a ``[state_size, action_size]`` table of action values and updates
    it one transition at a time. Exploration starts at ``eps`` and is
    multiplied by ``eps_decay`` every time an episode finishes.
    """

    def __init__(self, env, discount_rate=0.97, learning_rate=0.01,
                 eps=1.0, eps_decay=0.99):
        """Set up hyper-parameters and the Q-table.

        Args:
            env: Environment whose ``observation_space`` exposes ``n`` (the
                number of states) and whose action space is handled by
                ``Agent.__init__``.
            discount_rate: Weight applied to the best next-state value when
                forming the update target.
            learning_rate: Step size of each Q-table update.
            eps: Initial probability of taking a random action.
            eps_decay: Factor applied to ``eps`` at the end of each episode.
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        self.eps = eps
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

    def build_model(self):
        """Create the Q-table filled with small random values (< 1e-4)."""
        self.q_table = 1e-4*np.random.random([self.state_size, self.action_size])

    def get_action(self, state):
        """Choose an action for ``state`` using the epsilon-greedy rule.

        With probability ``eps`` the random action from the base class is
        returned; otherwise the action with the highest Q-value is returned.
        """
        q_state = self.q_table[state]
        action_greedy = np.argmax(q_state)
        # The random action is always sampled (even when the greedy one is
        # used) so the sequence of random draws matches the original code.
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Apply one Q-learning update from a single transition.

        Args:
            experience: Tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        # A finished episode has no future value, so bootstrap from zeros.
        q_next = self.q_table[next_state]
        q_next = np.zeros([self.action_size]) if done else q_next
        q_target = reward + self.discount_rate * np.max(q_next)

        # Move the stored estimate a fraction of the way towards the target.
        q_update = q_target - self.q_table[state,action]
        self.q_table[state,action] += self.learning_rate * q_update

        # Reduce exploration once per completed episode.
        if done:
            self.eps = self.eps * self.eps_decay
        
# Create the Q-learning agent for `env`; construction prints the action and state sizes.
agent = QAgent(env)

Action size: 4
State size: 16


In [24]:
# Train the agent for 100 episodes, redrawing the board and the Q-table
# after every single step so learning can be watched live.
total_reward = 0
for ep in range(100):
    state = env.reset()
    # Each episode takes at least one step and stops when the env reports done.
    while True:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        total_reward += reward
        state = next_state

        # Per-step report: transition, running totals, rendered grid, Q-table.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}".format(ep, total_reward, agent.eps))
        env.render()
        print(agent.q_table)

        # Brief pause so each frame is visible, then wipe it for the next one.
        time.sleep(0.05)
        clear_output(wait=True)

        if done:
            break

s: 5 a: 1
Episode: 99, Total reward: 3.0, eps: 0.36603234127322926
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
[[8.59983287e-05 8.58158166e-05 3.33430934e-05 3.64624523e-05]
 [3.47498658e-05 4.85206771e-05 6.14002101e-05 1.57816113e-05]
 [1.91335443e-05 6.22892110e-05 9.50677891e-05 8.98788866e-05]
 [1.47921094e-05 9.32524144e-05 6.78415032e-05 9.46295330e-05]
 [3.08849263e-05 7.88256014e-05 3.24947195e-05 7.89643547e-05]
 [7.92600794e-05 2.95864783e-05 6.36834105e-05 8.96807456e-05]
 [2.89598861e-05 6.34593271e-05 8.20013861e-05 3.37423755e-05]
 [9.63918819e-05 3.87649003e-05 6.17086758e-05 7.38661838e-05]
 [5.23890592e-05 3.66367464e-05 7.78424867e-05 9.58490355e-05]
 [5.64711716e-05 6.41807894e-05 9.09839171e-05 3.30566159e-05]
 [3.85211288e-05 1.17242843e-05 2.43747759e-04 7.57239026e-05]
 [2.64402616e-05 9.51134080e-05 4.54016410e-05 4.30231907e-05]
 [3.82643399e-05 5.05634627e-05 6.88114331e-05 7.19473436e-05]
 [1.17885184e-05 5.08004955e-04 2.67908192e-04 2.58045710e-04]
 [7.89642742e