 # Initializing the env

In [195]:
import gymnasium as gym
import numpy as np
import random
from tqdm import tqdm

In [196]:
env = gym.make('FrozenLake-v1', desc=None, map_name="8x8", is_slippery=False, render_mode="rgb_array")

## Printing the env


In [197]:
print("OBSERVATION SPACE \n")
print("observation space", env.observation_space)
print("simple observation", env.observation_space.sample())

OBSERVATION SPACE 

observation space Discrete(64)
simple observation 33


In [198]:
print("ACTION SPACE \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample()) 

ACTION SPACE 

Action Space Shape 4
Action Space Sample 0


## Initializing the Q-table

In [199]:
state_space = env.observation_space.n
print("state space: ", state_space)

action_space = env.observation_space.n
print("observation space: ", action_space)

state space:  64
observation space:  64


In [200]:
def initialize_qtable(state_space, action_space):
    q_table = np.zeros((state_space, action_space))
    return q_table

In [201]:
q_table_frozen_lake = initialize_qtable(state_space, action_space)

## Defining the policy

In [202]:
def greedy_policy(qtable, state):
    action = np.argmax(qtable[state][:])
    return action

## Defining epsilon-greedy-policy

In [203]:
def epsilon_greedy_policy(qtable, state, epsilon):
    random_num  = random.uniform(0,1)

    if random_num > epsilon:

        action = greedy_policy(qtable, state)
    else:
        action = env.action_space.sample()
    
    return action

In [204]:
epsilon_greedy_policy(q_table_frozen_lake, env.action_space.sample(), 0.3)

0

## Defining Parameters

In [205]:
episodes = 1000
learning_rate = 0.7
evaluation_episodes = 100

env_id = "Frozen_lake_ayres"
max_steps = 99
gamma = 0.95
eval_seed = []

max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 0.0005

## Trainig the agent



In [206]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, qtable):
    # it reduces the epsilon
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episodes)

    state, info = env.reset()
    step = 0
    terminated = False
    truncated = False

    for step in range(max_steps):
        action = epsilon_greedy_policy(qtable, state, epsilon)

        new_state, reward, terminated, truncated, info = env.step(action)
        #creating the q-table
        qtable[state][action] = qtable[state][action] + learning_rate*(reward + gamma*np.max(qtable[new_state]) - qtable[state][action])


        if terminated or truncated:
            break
        state = new_state
    return qtable

## Training the q-learning agent

In [207]:
q_table_frozenlake = train(episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, q_table_frozen_lake)

In [208]:
q_table_frozen_lake

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Evaluating the model

In [209]:
def evaluate_agent(env, max_steps, evaluation_episodes, qtable, seed):
 
  episode_rewards = []
  for episode in tqdm(range(evaluation_episodes)):
    if seed:
      state, info = env.reset(seed=seed[episode])
    else:
      state, info = env.reset()
    step = 0
    truncated = False
    terminated = False
    total_rewards_ep = 0

    for step in range(max_steps):
      
      action = greedy_policy(qtable, state)
      new_state, reward, terminated, truncated, info = env.step(action)
      total_rewards_ep += reward

      if terminated or truncated:
        break
      state = new_state
    episode_rewards.append(total_rewards_ep)
  mean_reward = np.mean(episode_rewards)
  std_reward = np.std(episode_rewards)

  return mean_reward, std_reward

In [194]:

mean_reward, std_reward = evaluate_agent(env, max_steps, evaluation_episodes, q_table_frozen_lake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 1380.06it/s]

Mean_reward=0.00 +/- 0.00



