# Importing Libraries

In [33]:
# Install into the environment of the running kernel; `!pip` may resolve to a different Python.
%pip install -q gymnasium

import numpy as np
import gymnasium as gym
import random

# Create the Environment

In [41]:
# Fixed seed so that "Restart Kernel -> Run All" reproduces the same training runs.
SEED = 42
random.seed(SEED)  # drives the epsilon-greedy coin flips made with random.uniform

# FrozenLake with is_slippery=True, as in the original setup.
env = gym.make("FrozenLake-v1", is_slippery=True)
env.reset(seed=SEED)         # seeds the environment's RNG; later reset() calls continue from it
env.action_space.seed(SEED)  # seeds env.action_space.sample(), used for exploratory actions

# Q-table dimensions: number of discrete states and number of discrete actions.
state_space = env.observation_space.n
action_space = env.action_space.n

Initialize Q-Table

In [35]:
# Action-value table: one row per state, one column per action, all entries start at zero.
q_table = np.zeros(shape=(state_space, action_space))


Hyperparameters

In [36]:
# --- Update rule ---
# Learning rate: fraction of each temporal-difference error applied to the table.
alpha = 0.1
# Discount factor applied to the estimated value of the next state.
gamma = 0.99

# --- Exploration schedule ---
# Initial exploration rate (probability of taking a random action).
epsilon = 1.0
# Amount subtracted from epsilon after every episode.
epsilon_decay = 0.001

# --- Training length ---
# Number of episodes each algorithm is trained for.
episodes = 2_000


# Q-Learning Algorithm (Off-Policy)

In [38]:
# Tabular Q-learning: behave epsilon-greedily, but update toward the greedy value of the
# next state (off-policy target).
# NOTE: this cell updates `q_table` in place and decays the shared `epsilon`, so running it
# a second time continues from the current table and the already-decayed epsilon.
for episode in range(episodes):
    state, _ = env.reset()
    done = False

    while not done:
        # Epsilon-greedy behaviour policy: random action with probability epsilon,
        # otherwise the action with the highest current estimate.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state])

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Bootstrap only when the episode has not genuinely ended: a terminated
        # transition has no future return, whereas a truncated (time-limit) one does.
        best_next_value = 0.0 if terminated else np.max(q_table[next_state])
        td_target = reward + gamma * best_next_value
        q_table[state, action] += alpha * (td_target - q_table[state, action])

        state = next_state

    # Linear decay of the exploration rate, floored at 0.01.
    epsilon = max(0.01, epsilon - epsilon_decay)


Explanation

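Uses the value of the best possible next action (the maximum Q-value of the next state) as its update target, regardless of which action the agent actually takes next.

Learns the value of the greedy (optimal) policy even while the agent behaves randomly to explore.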

Initialize a Separate Q-Table for SARSA and Reset Epsilon

In [None]:
# SARSA gets its own zero-filled table (same dimensions), leaving `q_table` untouched.
q_table_sarsa = np.zeros(shape=(state_space, action_space))

# Restart the exploration schedule from full exploration.
epsilon = 1.0

# SARSA Algorithm (On-Policy)

In [39]:
# Tabular SARSA: the update target uses the Q-value of the action the epsilon-greedy
# policy actually selects in the next state (on-policy target).
# NOTE: this cell updates `q_table_sarsa` in place and decays the shared `epsilon`.
for episode in range(episodes):
    state, info = env.reset()

    # The first action of the episode is chosen before the loop, because each update
    # needs both the current action and the next one.
    if random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
    else:
        action = np.argmax(q_table_sarsa[state])

    done = False

    while not done:
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Epsilon-greedy choice of the action to take from next_state.
        if random.uniform(0, 1) < epsilon:
            next_action = env.action_space.sample()
        else:
            next_action = np.argmax(q_table_sarsa[next_state])

        # SARSA update rule. Bootstrap only when the episode has not genuinely ended:
        # a terminated transition has no future return, a truncated one still does.
        next_value = 0.0 if terminated else q_table_sarsa[next_state, next_action]
        td_target = reward + gamma * next_value
        q_table_sarsa[state, action] += alpha * (td_target - q_table_sarsa[state, action])

        state = next_state
        action = next_action

    # Linear decay of the exploration rate, floored at 0.01.
    epsilon = max(0.01, epsilon - epsilon_decay)

In [40]:
# Print both learned tables, each preceded by its heading, for a side-by-side read.
for heading, table in (
    ("Q-Learning Q-table:\n", q_table),
    ("\nSARSA Q-table:\n", q_table_sarsa),
):
    print(heading, table)


Q-Learning Q-table:
 [[0.50137894 0.47631036 0.48421145 0.45603591]
 [0.09186902 0.0570871  0.05250653 0.46739552]
 [0.38409074 0.15391945 0.05165008 0.0567021 ]
 [0.07730967 0.00321427 0.00194719 0.00417069]
 [0.51441131 0.30522337 0.43469294 0.42085162]
 [0.         0.         0.         0.        ]
 [0.08754964 0.0864306  0.27236785 0.06742032]
 [0.         0.         0.         0.        ]
 [0.30334063 0.4067126  0.3133837  0.54649424]
 [0.38099236 0.63555867 0.32188575 0.19260714]
 [0.66838505 0.33492117 0.26342163 0.13309709]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.27351325 0.34089321 0.71930426 0.33236988]
 [0.49774639 0.630934   0.84800359 0.51917434]
 [0.         0.         0.         0.        ]]

SARSA Q-table:
 [[0.46307269 0.41064049 0.4211268  0.4034234 ]
 [0.0229756  0.09611246 0.0466548  0.32631377]
 [0.05218288 0.22007996 0.05674719 0.06499841]
 [0.03291066 0.10229502 0.00102356 0.00499221]
 [0.48519021 0.403016

| Feature       | Q-Learning                    | SARSA                                      |
| ------------- | ----------------------------- | ------------------------------------------ |
| Type          | Off-policy                    | On-policy                                  |
| Update target | Max Q-value of the next state | Q-value of the next action actually chosen |
| Risk          | Aggressive                    | Conservative                               |
| Convergence   | Faster                        | Safer                                      |


In this experiment, Q-Learning and SARSA were implemented on the FrozenLake environment. Q-Learning, being an off-policy algorithm, updates toward the value of the best possible next action, which typically lets it approach the optimal policy faster. SARSA, on the other hand, follows an on-policy approach and learns more conservatively by updating toward the value of the action actually taken, which tends to give safer behavior while the agent is still exploring. Both algorithms learned non-trivial Q-tables within 2000 episodes. Note that this notebook only prints the final Q-tables; it does not measure convergence speed or evaluate the resulting policies, so the statements about faster convergence and safer learning describe the algorithms' expected behavior rather than a result measured here.
