# Frozen Lake

In [None]:
import numpy as np
import pandas as pd
from enum import Enum
 
class ActionResult(Enum):
    """Outcome reported by ``FrozenLakeGame.step`` after a move.

    Members carry plain integer values.  (Trailing commas after the values
    would silently turn them into one-element tuples, which breaks lookups
    by value such as ``ActionResult(1)``.)
    """

    CONTINUE = 0  # the episode is still running
    WIN = 1       # the goal cell was reached
    LOSE = 2      # the agent fell into a hole
        
class FrozenLakeGame():
    """Deterministic Frozen Lake grid world.

    ``board`` is a 2-D NumPy array of single-character cells:

    * ``'S'`` and ``'I'`` -- cells the agent can stand on (start / ice),
    * ``'H'`` -- hole; stepping on it ends the episode with ``LOSE``,
    * ``'G'`` -- goal; stepping on it ends the episode with ``WIN``.

    States are cell indices in row-major order and every episode starts in
    state 0.  Actions are ``0`` right, ``1`` down, ``2`` left, ``3`` up; a
    move that would leave the board keeps the agent where it is.
    """

    def __init__(self, board):
        """Precompute the transition and reward tables for ``board``."""
        self.board = board
        self.num_actions = 4
        self.num_states = board.size
        self.nrows = board.shape[0]
        self.ncols = board.shape[1]
        self.action_dict = {
            0 : 'R',
            1 : 'D',
            2 : 'L',
            3 : 'U'
        }
        self.status_transitions = self._compute_action_map()
        self.rewards = self._compute_reward_map()
        self.current_state = 0
        self.action_result = ActionResult.CONTINUE

    def reset(self):
        """Put the agent back in state 0 and mark the episode as running."""
        self.current_state = 0
        self.action_result = ActionResult.CONTINUE

    def step(self, action_id):
        """Apply one action and report what happened.

        Returns a ``(state, result, reward)`` tuple.  Once the episode has
        ended (``WIN`` or ``LOSE``) further calls change nothing and return
        the final state and result with a reward of 0 until ``reset`` is
        called.
        """
        if self.action_result is not ActionResult.CONTINUE:
            # The original read the undefined attribute `current_result`.
            return self.current_state, self.action_result, 0

        new_state = self.status_transitions[self.current_state, action_id]
        reward = self.rewards[self.current_state, new_state]

        cell = self._get_state_value(new_state)
        if cell == 'H':
            self.action_result = ActionResult.LOSE
        elif cell == 'G':
            self.action_result = ActionResult.WIN
        elif cell in ('S', 'I'):
            # The start cell is walkable like ice; without this branch a
            # move back onto 'S' was silently ignored.
            self.action_result = ActionResult.CONTINUE
        else:
            # Unknown cell marker: leave the agent where it is.
            return self.current_state, self.action_result, reward

        self.current_state = new_state
        return self.current_state, self.action_result, reward

    def _compute_reward_map(self):
        """Return the ``(num_states, num_states)`` reward table.

        Entry ``[s, s']`` is the reward for moving from ``s`` to ``s'``:
        1 for every transition into the last state, 0 otherwise.
        NOTE(review): this presumes the goal is the bottom-right cell.
        """
        rewards = np.zeros((self.num_states, self.num_states))
        rewards[:,self.num_states-1] = 1
        return rewards

    def _compute_action_map(self):
        """Return the ``(num_states, num_actions)`` next-state table.

        Columns follow the action ids (right, down, left, up).  The table
        uses a platform-sized integer dtype so that boards with more than
        256 cells do not overflow, as they would with ``uint8``.
        """
        status_transitions = np.zeros((self.num_states, self.num_actions), dtype=np.intp)
        for i in range(self.nrows):
            for j in range(self.ncols):
                s = i*self.ncols + j
                # Moves that would leave the board map back to `s` itself.
                status_transitions[s,0] = s + 1 if (j+1) < self.ncols else s
                status_transitions[s,1] = s + self.ncols if (i+1) < self.nrows else s
                status_transitions[s,2] = s - 1 if j >= 1 else s
                status_transitions[s,3] = s - self.ncols if i >= 1 else s
        return status_transitions

    def translate_action(self, action_id):
        """Return the one-letter label ('R', 'D', 'L' or 'U') of an action id."""
        return self.action_dict[action_id]

    def print_board(self):
        """Print the board without row or column labels."""
        print(pd.DataFrame(self.board).to_string(header=False, index=False))

    def print_transitions(self):
        """Print the next-state table, one row per state, one column per action."""
        print(pd.DataFrame(self.status_transitions).to_string(header=['R','D','L','U']))

    def print_reward_map(self):
        """Print the full state-to-state reward table."""
        print(pd.DataFrame(self.rewards).to_string())

    def _get_state_value(self, state):
        """Return the board character stored at the given state index."""
        i,j = np.unravel_index(state, self.board.shape)
        return self.board[i,j]


# Game

In [None]:
# Lake layout, one string per row: S = start, I = ice, H = hole, G = goal.
board = np.array([list(row) for row in ('SIII', 'IHIH', 'IHIH', 'HIIG')])

game = FrozenLakeGame(board)

In [None]:
# Show each table under its own title.
for title, show in (('Board:', game.print_board),
                    ('\nRewards:', game.print_reward_map)):
    print(title)
    show()

# Agent training

In [None]:
# --- Q-learning hyper-parameters -------------------------------------------
episodes = 10000            # number of training episodes
discount_rate = 0.9         # weight of the next state's value in the update
learning_rate = 0.1         # step size of each Q update
learning_rate_decay = 0.0   # per-episode shrink factor for learning_rate
e = 0.4                     # probability of picking a random action
e_decay = 0.0               # per-episode shrink factor for e

# --- Training state ---------------------------------------------------------
Q = np.random.rand(game.num_states,game.num_actions) # Initialized randomly
reward_list = []            # last reward seen in each episode
Q_list = []                 # snapshot of the Q-matrix after each episode

def choose_best_action(q_status):
    """Return the index of the largest Q-value (the first one on ties)."""
    values = np.asarray(q_status)
    return values.argmax()

def e_greedy_strategy(e, Q, current_state):
    """Pick an action for ``current_state`` with an epsilon-greedy rule.

    With probability ``e`` a uniformly random action id is returned;
    otherwise the action with the highest Q-value for the state is chosen.
    """
    explore = np.random.rand(1) < e
    if explore:
        return np.random.randint(Q.shape[1])
    return choose_best_action(Q[current_state,:])

for n in range(episodes):
    current_state = 0
    game.reset()
    while True:
        # Choose and perform next action
        action = e_greedy_strategy(e, Q, current_state)
        new_state, action_result, reward = game.step(action)
        episode_over = action_result != ActionResult.CONTINUE

        # Update Q matrix.
        # A terminal state (hole or goal) has no future, so its target is the
        # immediate reward only.  Bootstrapping from the terminal state's row
        # would use random initial values that are never trained.  The update
        # is also applied when the agent falls into a hole; skipping it left
        # hole-bound actions stuck at their random initial values.
        future_value = 0.0 if episode_over else np.max(Q[new_state,:])
        td_target = reward + discount_rate * future_value
        Q[current_state,action] = (1.0 - learning_rate) * Q[current_state,action] + learning_rate * td_target

        current_state = new_state

        if episode_over:
            break

    Q_list.append(np.copy(Q))
    reward_list.append(reward)

    # update rates
    learning_rate *= (1-learning_rate_decay)
    e *= (1-e_decay)

# Agent playing

In [None]:
# Use the Q-matrix snapshot that was stored after the last training episode.
Q=Q_list[-1]
print(Q)
    

In [None]:
# Greedy policy: the best action of every state, laid out like the board.
optimum_actions = np.reshape(np.argmax(Q,1), game.board.shape)

movements = []
s = 0

game.reset()
result = ActionResult.CONTINUE

# Both the policy and the game are deterministic, so returning to a state
# that was already visited means the walk can never finish.  Tracking the
# visited states replaces a fixed move cap, which would wrongly report a
# loop on boards whose path to the goal is longer than the cap.
visited_states = {s}
while result == ActionResult.CONTINUE:
    i,j = np.unravel_index(s, game.board.shape)

    action = optimum_actions[i,j]
    movements.append(game.translate_action(action))
    s, result, reward = game.step(action)

    if result == ActionResult.CONTINUE:
        if int(s) in visited_states:
            print('In a looooop!!')
            break
        visited_states.add(int(s))

print(result)
print(movements)
print(board)