12. Implement the Q-learning algorithm in Python for a simple 5x5 grid world
environment in which an agent starts at the top-left cell and must reach the
bottom-right cell (the goal), with the action space {up, down, right, left}.
Use this implementation to find the optimal policy.

In [2]:
import numpy as np
import random

In [3]:
# Environment definition: a square grid whose cells are addressed as
# (row, column) tuples.
grid_size = 5

# Action names; an action is referred to elsewhere by its index in this list.
actions = ["up", "down", "left", "right"]
n_actions = len(actions)

# The goal is the bottom-right cell of the grid.
goal_state = (grid_size - 1, grid_size - 1)

In [4]:
def state_to_index(state):
    """Flatten a (row, column) state into a single row-major index.

    Args:
        state: Indexable pair whose first item is the row and whose second
            item is the column.

    Returns:
        ``row * grid_size + column``.
    """
    row = state[0]
    col = state[1]
    return row * grid_size + col

In [6]:
def index_to_state(index):
    """Convert a row-major index back into a (row, column) tuple.

    Args:
        index: Flat index as produced by ``row * grid_size + column``.

    Returns:
        The tuple ``(index // grid_size, index % grid_size)``.
    """
    # divmod yields the quotient (row) and remainder (column) in one call.
    return divmod(index, grid_size)

In [8]:
def is_valid(state):
    """Report whether a (row, column) state lies inside the grid.

    Args:
        state: Indexable pair ``(row, column)``.

    Returns:
        True when both coordinates are in ``[0, grid_size)``, else False.
    """
    # Reject as soon as the row is out of range; the column is only
    # inspected when the row is acceptable.
    if not 0 <= state[0] < grid_size:
        return False
    return 0 <= state[1] < grid_size

In [13]:
def get_next_state(state, action):
    """Return the state reached by taking ``action`` from ``state``.

    Args:
        state: Current ``(row, column)`` tuple.
        action: Integer action code: 0 = up, 1 = down, 2 = left, 3 = right.
            Any other value leaves the position unchanged.

    Returns:
        The moved ``(row, column)`` tuple if it is inside the grid according
        to ``is_valid``; otherwise the original ``state`` (the agent stays
        put when it would walk off the grid).
    """
    # (row delta, column delta) for each action code.
    moves = {
        0: (-1, 0),  # up
        1: (1, 0),   # down
        2: (0, -1),  # left
        3: (0, 1),   # right
    }
    d_row, d_col = moves.get(action, (0, 0))

    row, col = state
    candidate = (row + d_row, col + d_col)

    if not is_valid(candidate):
        return state
    return candidate

In [14]:
# One Q-value per (state, action) pair, all starting at zero.
# States are the grid cells, so there are grid_size squared of them.
n_states = grid_size ** 2
q_table = np.zeros(shape=(n_states, n_actions))

In [16]:
# Q-learning hyperparameters:
#   alpha   - learning rate (how much each update moves a Q-value)
#   gamma   - discount factor (importance of future rewards)
#   epsilon - exploration rate (probability of taking a random action)
alpha, gamma, epsilon = 0.1, 0.9, 0.2

# Total number of training episodes.
episodes = 1_000

In [None]:
# Train the Q-table with epsilon-greedy Q-learning. Every episode starts in
# the top-left cell and runs until the agent steps onto the goal cell.
for episode in range(episodes):
    state = (0, 0)
    while state != goal_state:
        s_idx = state_to_index(state)

        # Epsilon-greedy selection: explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        explore = random.uniform(0, 1) < epsilon
        if explore:
            action = random.randint(0, n_actions - 1)
        else:
            action = np.argmax(q_table[s_idx])

        next_state = get_next_state(state, action)
        next_s_idx = state_to_index(next_state)

        # +10 for arriving at the goal, -1 for every other step.
        if next_state == goal_state:
            reward = 10
        else:
            reward = -1

        # Move Q(s, a) a fraction alpha towards the bootstrapped target
        # reward + gamma * max_a' Q(s', a').
        td_target = reward + gamma * np.max(q_table[next_s_idx])
        td_error = td_target - q_table[s_idx, action]
        q_table[s_idx, action] += alpha * td_error

        state = next_state

In [None]:
# Derive the greedy policy: for every state, the name of the action with the
# largest Q-value.
# NOTE: np.argmax returns the first maximal index, so a row whose values are
# all equal (for example one that was never updated) is reported as
# actions[0]; such an entry is a tie-break, not a learned preference.
policy = np.empty(n_states, dtype=object)

for state_index, q_row in enumerate(q_table):
    best_action = np.argmax(q_row)
    policy[state_index] = actions[best_action]

# Lay the flat, row-major policy out as the grid for display.
policy_grid = policy.reshape((grid_size, grid_size))
print(policy_grid)


[['down' 'down' 'down' 'down' 'down']
 ['down' 'down' 'down' 'right' 'down']
 ['right' 'right' 'down' 'right' 'down']
 ['right' 'right' 'down' 'down' 'down']
 ['right' 'right' 'right' 'right' 'up']]
