# Grid Environment


![Grid environment](assets/image.png)

In [1]:
from mdp import MDP
import torch
from agent import Agent

# Creating an environment 

In [2]:
# 4x4 grid world with states numbered row by row:
#
#      0   1   2   3
#      4   5   6   7
#      8   9  10  11
#     12  13  14  15
#
# States 0 and 15 are absorbing: every action loops back to the same state
# with reward 0. Every other move has reward -1, and a move that would leave
# the grid keeps the agent in its current cell.
#
# Each row is [state, action, next_state, reward, probability].
data = [
    [0, "u", 0, 0, 1], [0, "d", 0, 0, 1], [0, "r", 0, 0, 1], [0, "l", 0, 0, 1],
    [1, "u", 1, -1, 1], [1, "d", 5, -1, 1], [1, "r", 2, -1, 1], [1, "l", 0, -1, 1],
    [2, "u", 2, -1, 1], [2, "d", 6, -1, 1], [2, "r", 3, -1, 1], [2, "l", 1, -1, 1],
    [3, "u", 3, -1, 1], [3, "d", 7, -1, 1], [3, "r", 3, -1, 1], [3, "l", 2, -1, 1],
    [4, "u", 0, -1, 1], [4, "d", 8, -1, 1], [4, "r", 5, -1, 1], [4, "l", 4, -1, 1],
    [5, "u", 1, -1, 1], [5, "d", 9, -1, 1], [5, "r", 6, -1, 1], [5, "l", 4, -1, 1],
    [6, "u", 2, -1, 1], [6, "d", 10, -1, 1], [6, "r", 7, -1, 1], [6, "l", 5, -1, 1],
    [7, "u", 3, -1, 1], [7, "d", 11, -1, 1], [7, "r", 7, -1, 1], [7, "l", 6, -1, 1],
    [8, "u", 4, -1, 1], [8, "d", 12, -1, 1], [8, "r", 9, -1, 1], [8, "l", 8, -1, 1],
    [9, "u", 5, -1, 1], [9, "d", 13, -1, 1], [9, "r", 10, -1, 1], [9, "l", 8, -1, 1],
    [10, "u", 6, -1, 1], [10, "d", 14, -1, 1], [10, "r", 11, -1, 1], [10, "l", 9, -1, 1],
    # Moving down from 11 reaches the bottom-right terminal state 15 (not state 0).
    [11, "u", 7, -1, 1], [11, "d", 15, -1, 1], [11, "r", 11, -1, 1], [11, "l", 10, -1, 1],
    [12, "u", 8, -1, 1], [12, "d", 12, -1, 1], [12, "r", 13, -1, 1], [12, "l", 12, -1, 1],
    [13, "u", 9, -1, 1], [13, "d", 13, -1, 1], [13, "r", 14, -1, 1], [13, "l", 12, -1, 1],
    # Moving right from 14 reaches the bottom-right terminal state 15 (not state 0).
    [14, "u", 10, -1, 1], [14, "d", 14, -1, 1], [14, "r", 15, -1, 1], [14, "l", 13, -1, 1],
    [15, "u", 15, 0, 1], [15, "d", 15, 0, 1], [15, "r", 15, 0, 1], [15, "l", 15, 0, 1],
]

# Action encoding: up = 1, down = 2, left = 3, right = 4
direction_map = {"u": 1, "d": 2, "l": 3, "r": 4}

action_spaces = list(range(1, 5))
state_spaces = list(range(16))

# The table is typed by hand, so guard it against typos: every (state, action)
# pair must appear exactly once, and every move must either stay put or step
# to a directly adjacent cell (4 is the grid width).
assert len({(row[0], row[1]) for row in data}) == len(data) == len(state_spaces) * len(action_spaces), \
    "expected exactly one row per (state, action) pair"
assert all(
    abs(s_next // 4 - s // 4) + abs(s_next % 4 - s % 4) <= 1
    for s, _, s_next, _, _ in data
), "each move must stay in place or step to an adjacent cell"

# Replace the action letters with their integer codes so the table is numeric.
converted_data = [
    [state, direction_map[action], next_state, reward, prob]
    for state, action, next_state, reward, prob in data
]

# int64 is lossless here because every probability in the table is exactly 1.
# NOTE(review): a stochastic table would need a float dtype — confirm that MDP
# accepts one before changing this.
transition_probs = torch.tensor(converted_data, dtype=torch.int64)


# Build the environment from the state list, the action list and the
# transition table, then create the agent that will be run on it.
grid_env = MDP(
    state_spaces,
    action_spaces,
    transition_probs,
)
# gamma=1.0 — presumably the discount factor (i.e. no discounting); confirm in agent.py.
agent = Agent(gamma=1.0)

# Policy Evaluation

In [3]:
# Run policy evaluation on the grid environment and keep the result for the
# improvement step. 0.001 is the first positional argument — presumably the
# convergence threshold; confirm against Agent.policy_evaluation.
value_of_policy = agent.policy_evaluation(
    0.001,
    grid_env,
)


Value function after 1 iterations.
---------------------------------
|   0.00 |  -1.00 |  -1.25 |  -1.31 |
---------------------------------
|  -1.00 |  -1.50 |  -1.69 |  -1.75 |
---------------------------------
|  -1.25 |  -1.69 |  -1.84 |  -1.90 |
---------------------------------
|  -1.31 |  -1.75 |  -1.90 |   0.00 |
---------------------------------

Value function after 2 iterations.
---------------------------------
|   0.00 |  -1.94 |  -2.55 |  -2.73 |
---------------------------------
|  -1.94 |  -2.81 |  -3.24 |  -3.40 |
---------------------------------
|  -2.55 |  -3.24 |  -3.57 |  -3.22 |
---------------------------------
|  -2.73 |  -3.40 |  -3.22 |   0.00 |
---------------------------------

Value function after 3 iterations.
---------------------------------
|   0.00 |  -2.82 |  -3.83 |  -4.18 |
---------------------------------
|  -2.82 |  -4.03 |  -4.71 |  -4.88 |
---------------------------------
|  -3.83 |  -4.71 |  -4.96 |  -4.26 |
--------------------------------

# Policy Iteration with Policy Improvement

In [4]:
# Derive a new policy from the value function computed by policy evaluation;
# verbose=True makes the agent print the resulting policy grid.
new_policy = agent.policy_improvement(
    environment=grid_env,
    value_function=value_of_policy,
    verbose=True,
)

Policy Improvement-- New Policy

Policy after 1 iterations.
---------------------------------
|  ↑  |  ←  |  ←  |  ←  |
---------------------------------
|  ↑  |  ←  |  ←  |  ↓  |
---------------------------------
|  ↑  |  ↑  |  ↓  |  ↓  |
---------------------------------
|  ↑  |  →  |  →  |  ↑  |
---------------------------------
