In [1]:
import numpy as np
import random

In [2]:
class FrozenLake:
    """Deterministic square grid world with a start cell, a goal cell and holes.

    The agent starts at ``(0, 0)`` and the goal is the opposite corner
    ``(size-1, size-1)``. States are ``(row, col)`` tuples.

    Actions: 0 = up, 1 = down, 2 = left, 3 = right. A move that would leave
    the grid keeps the agent on the border cell.

    Rewards: +10 for reaching the goal, -10 for entering a hole (both end the
    episode), -1 for every other step.

    Args:
        size: Side length of the grid (must be >= 1).
        hole_states: Optional iterable of ``(row, col)`` hole positions. When
            omitted, the default layout ``(1, 1), (2, 3), (3, 0)`` is used and
            any default hole that does not fit the grid (or would sit on the
            start/goal cell) is dropped, so small grids can be built too.

    Raises:
        ValueError: If ``size`` is smaller than 1, or an explicitly supplied
            hole lies outside the grid or on the start/goal cell.
    """

    # Hole layout used when the caller does not supply one.
    DEFAULT_HOLE_STATES = ((1, 1), (2, 3), (3, 0))

    # Action index -> label printed by show_policy.
    ACTION_NAMES = {0: 'UP', 1: 'DOWN', 2: 'LEFT', 3: 'RIGHT'}

    def __init__(self, size=4, hole_states=None):
        if size < 1:
            raise ValueError('size must be at least 1, got %r' % (size,))

        self.size = size
        # grid[i][j] == 1 marks a hole, 0 marks every other cell.
        self.grid = np.zeros((size, size), dtype=int)
        self.start_state = (0, 0)
        self.goal_state = (size-1, size-1)

        if hole_states is None:
            # Keep only the default holes that make sense for this grid size.
            self.hole_states = [cell for cell in self.DEFAULT_HOLE_STATES
                                if self._is_valid_hole(cell)]
        else:
            self.hole_states = [tuple(cell) for cell in hole_states]
            for cell in self.hole_states:
                if not self._is_valid_hole(cell):
                    raise ValueError('invalid hole position %r for a %dx%d grid'
                                     % (cell, size, size))

        for i, j in self.hole_states:
            self.grid[i][j] = 1

        # Defined here so step()/render() also work before the first reset().
        self.current_state = self.start_state

    def _is_valid_hole(self, cell):
        """Return True if ``cell`` is inside the grid and is not the start or goal."""
        if len(cell) != 2:
            return False
        i, j = cell
        inside = 0 <= i < self.size and 0 <= j < self.size
        return inside and cell != self.start_state and cell != self.goal_state

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        Any action value other than 0-3 leaves the position unchanged; the
        reward/termination rules are still evaluated for the current cell.
        """
        i, j = self.current_state
        if action == 0: # move up
            i = max(i-1, 0)
        elif action == 1: # move down
            i = min(i+1, self.size-1)
        elif action == 2: # move left
            j = max(j-1, 0)
        elif action == 3: # move right
            j = min(j+1, self.size-1)

        self.current_state = (i, j)

        if self.current_state == self.goal_state:
            reward = 10
            done = True
        elif self.current_state in self.hole_states:
            reward = -10
            done = True
        else:
            reward = -1
            done = False

        return self.current_state, reward, done

    def render(self):
        """Print the grid as text.

        'S' marks the agent's current cell (it takes precedence over every
        other symbol), 'X' a hole, 'G' the goal and '.' any other cell.
        """
        print('\n')
        for i in range(self.size):
            for j in range(self.size):
                cell = (i, j)
                if cell == self.current_state:
                    symbol = 'S'
                elif self.grid[i][j] == 1:
                    symbol = 'X'
                elif cell == self.goal_state:
                    symbol = 'G'
                else:
                    symbol = '.'
                print(symbol, end=' ')
            print()
        print()

    def show_q_table(self, q_table):
        """Print the four action values of every cell, one cell per line.

        ``q_table`` is indexed as ``q_table[row][col][action]``. Hole cells
        are printed as four ``NULL`` entries. A blank line separates grid rows.
        """
        rule = '-----------------------------------------------------------------'
        print(rule)
        print('Q-Table:')
        print(rule)

        for i in range(self.size):
            for j in range(self.size):
                if self.grid[i][j] == 0:
                    print('\t'.join('%.2f' % q_table[i][j][a] for a in range(4)))
                else:
                    print('\t'.join(['NULL'] * 4))
            print()

    def show_policy(self, q_table):
        """Print the greedy action of every cell on a single line, row by row.

        Hole cells are printed as ``STAY``. Every other cell (the goal
        included) shows the label of its highest-valued action; ties resolve
        to the lowest action index because ``np.argmax`` is used. No trailing
        newline is printed.
        """
        print('\n Policy:')
        for i in range(self.size):
            for j in range(self.size):
                if self.grid[i][j] == 0:
                    label = self.ACTION_NAMES.get(int(np.argmax(q_table[i][j])))
                    # An index outside 0-3 has no label and prints nothing.
                    if label is not None:
                        print(label, end=' ')
                else:
                    print('STAY', end=' ')

In [3]:
# Build the grid-world environment with its default settings
env = FrozenLake()

# Q-table starts at zero: four action values for every (row, col) cell
q_table = np.zeros(shape=(env.size, env.size, 4))

In [4]:
# Set hyperparameters
num_episodes = 10000            # number of training episodes
max_steps_per_episode = 100     # step cap per episode
learning_rate = 0.1             # step size of each Q-value update
discount_factor = 0.99          # weight of the bootstrapped next-state value
epsilon = 1.0                   # initial exploration probability
min_epsilon = 0.01              # floor that epsilon is never decayed below
epsilon_decay_rate = 0.001      # epsilon is multiplied by (1 - rate) after each episode

# Seed the `random` module (used for exploration in epsilon_greedy_policy)
# so that a fresh Restart & Run All reproduces the same training run.
random_seed = 42
random.seed(random_seed)

In [5]:
# Epsilon-greedy action selection
def epsilon_greedy_policy(state):
    """Choose an action for ``state`` (a ``(row, col)`` pair).

    With probability given by the module-level ``epsilon`` a uniformly random
    action in 0..3 is returned; otherwise the index of the largest value in
    the module-level ``q_table`` row for that state.
    """
    explore = random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: best known action for this cell.
        return np.argmax(q_table[state[0]][state[1]])
    return random.randint(0, 3)

In [6]:
# Train agent with tabular Q-learning.
# NOTE: this cell updates `q_table` and `epsilon` in place, so running it again
# continues training; re-run the Q-table and hyperparameter cells first for a
# fresh run.
for episode in range(num_episodes):
    state = env.reset()
    done = False
    t = 0
    while not done and t < max_steps_per_episode:
        action = epsilon_greedy_policy(state)
        next_state, reward, done = env.step(action)

        # A terminal transition has no future return, so do not bootstrap
        # from the next state's Q-values in that case.
        if done:
            best_next_value = 0.0
        else:
            best_next_value = np.max(q_table[next_state[0]][next_state[1]])

        td_target = reward + discount_factor * best_next_value
        q_table[state[0]][state[1]][action] += learning_rate * \
            (td_target - q_table[state[0]][state[1]][action])
        state = next_state
        t += 1

    # Decay exploration once per episode, never below min_epsilon.
    epsilon = max(min_epsilon, epsilon * (1 - epsilon_decay_rate))

    # Show progress
    if episode % 1000 == 0:
        env.render()
        env.show_q_table(q_table)
        env.show_policy(q_table)



. . . . 
. X . . 
. . . X 
S . . G 

-----------------------------------------------------------------
Q-Table:
-----------------------------------------------------------------
-0.34	-0.10	-0.19	-0.19
0.00	0.00	-0.19	-0.10
-0.10	0.00	-0.10	-0.10
-0.10	-0.10	-0.10	0.00

0.00	-0.10	0.00	0.00
NULL	NULL	NULL	NULL
0.00	0.00	0.00	0.00
-0.10	0.00	0.00	0.00

0.00	-1.00	0.00	0.00
0.00	0.00	0.00	0.00
0.00	0.00	0.00	0.00
NULL	NULL	NULL	NULL

NULL	NULL	NULL	NULL
0.00	0.00	0.00	0.00
0.00	0.00	0.00	0.00
0.00	0.00	0.00	0.00


 Policy:
DOWN UP DOWN RIGHT UP STAY UP DOWN UP UP UP STAY STAY UP UP UP 

. . . . 
. S . . 
. . . X 
X . . G 

-----------------------------------------------------------------
Q-Table:
-----------------------------------------------------------------
3.56	4.61	3.56	4.58
3.86	-10.00	2.97	5.66
4.39	6.73	4.06	2.60
0.07	0.50	5.08	0.31

3.56	5.67	4.61	-10.00
NULL	NULL	NULL	NULL
4.94	7.81	-9.91	4.47
0.36	-9.02	6.34	0.99

4.60	-10.00	5.67	6.73
-10.00	7.77	5.66	7.81
6.71	8.90	6.72	-

In [7]:
# Test agent: follow the greedy policy from the start state
state = env.reset()
done = False
steps_taken = 0
# Cap the rollout: a greedy policy that keeps bumping into a wall or cycles
# between cells never reaches a terminal state and would loop forever.
while not done and steps_taken < max_steps_per_episode:
    action = np.argmax(q_table[state[0]][state[1]])
    next_state, reward, done = env.step(action)
    env.render()
    state = next_state
    steps_taken += 1

if not done:
    print('Greedy policy did not reach a terminal state within %d steps'
          % max_steps_per_episode)



. . . . 
S X . . 
. . . X 
X . . G 



. . . . 
. X . . 
S . . X 
X . . G 



. . . . 
. X . . 
. S . X 
X . . G 



. . . . 
. X . . 
. . S X 
X . . G 



. . . . 
. X . . 
. . . X 
X . S G 



. . . . 
. X . . 
. . . X 
X . . S 

