In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.table import Table
import pandas as pd
# The bare "seaborn" style name was deprecated in matplotlib 3.6 (renamed to
# "seaborn-v0_8") and later removed, so prefer the new name when this
# matplotlib provides it and fall back to the old one otherwise.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

In [2]:
# --- Grid-world configuration -------------------------------------------
# Side length of the square grid; positions are [row, col] lists.
WORLD_SIZE = 5

# Discount factor applied to successor-state values in the backups below.
DISCOUNT = 0.9

# Special cells: any action taken in A (resp. B) moves to A' (resp. B').
A_POS, A_PRIME_POS = [0, 1], [4, 1]
B_POS, B_PRIME_POS = [0, 3], [2, 3]

# Moves expressed as [row offset, col offset]; the order fixes the action index.
ACTIONS = [
    [-1, 0],  # up
    [1, 0],   # down
    [0, -1],  # left
    [0, 1],   # right
]
class Grid:
    """One cell of the grid world.

    Attributes set by the constructor:
        position: the cell's own [row, col] location.
        reward: per-action immediate rewards, indexed like ACTIONS.
        next_position: per-action successor [row, col], indexed like ACTIONS.
    """

    def __init__(self, pos, rew, next_pos):
        # Arguments are stored as given (no copies are made).
        self.position, self.reward, self.next_position = pos, rew, next_pos

In [3]:
# Build the grid world: a WORLD_SIZE x WORLD_SIZE table of Grid cells.
# Each cell holds, per action (in ACTIONS order), the immediate reward and the
# position reached by taking that action.
grid_word = [[None] * WORLD_SIZE for _ in range(WORLD_SIZE)]
for i in range(WORLD_SIZE):
    for j in range(WORLD_SIZE):
        if [i, j] == B_POS:
            # Every action taken in B pays +5 and jumps to B'.
            # Lists are sized by the number of actions (not by WORLD_SIZE), so
            # they always line up with the action index used by the solvers.
            b_reward = [5] * len(ACTIONS)
            next_pos = [B_PRIME_POS for _ in ACTIONS]
            grid_word[i][j] = Grid([i, j], b_reward, next_pos)
        elif [i, j] == A_POS:
            # Every action taken in A pays +10 and jumps to A'.
            a_reward = [10] * len(ACTIONS)
            next_pos = [A_PRIME_POS for _ in ACTIONS]
            grid_word[i][j] = Grid([i, j], a_reward, next_pos)
        else:
            reward = []
            next_pos = []
            for action in ACTIONS:
                x = i + action[0]
                y = j + action[1]
                if 0 <= x < WORLD_SIZE and 0 <= y < WORLD_SIZE:
                    # Ordinary move inside the grid: no reward.
                    reward.append(0)
                    next_pos.append([x, y])
                else:
                    # Move would leave the grid: stay in place, reward -1.
                    reward.append(-1)
                    next_pos.append([i, j])
            grid_word[i][j] = Grid([i, j], reward, next_pos)

In [4]:
# Sanity check: list each cell's position next to its per-action rewards,
# walking the table row by row.
for row in grid_word:
    for cell in row:
        print(cell.position, cell.reward)

[0, 0] [-1, 0, -1, 0]
[0, 1] [10, 10, 10, 10]
[0, 2] [-1, 0, 0, 0]
[0, 3] [5, 5, 5, 5]
[0, 4] [-1, 0, 0, -1]
[1, 0] [0, 0, -1, 0]
[1, 1] [0, 0, 0, 0]
[1, 2] [0, 0, 0, 0]
[1, 3] [0, 0, 0, 0]
[1, 4] [0, 0, 0, -1]
[2, 0] [0, 0, -1, 0]
[2, 1] [0, 0, 0, 0]
[2, 2] [0, 0, 0, 0]
[2, 3] [0, 0, 0, 0]
[2, 4] [0, 0, 0, -1]
[3, 0] [0, 0, -1, 0]
[3, 1] [0, 0, 0, 0]
[3, 2] [0, 0, 0, 0]
[3, 3] [0, 0, 0, 0]
[3, 4] [0, 0, 0, -1]
[4, 0] [0, -1, -1, 0]
[4, 1] [0, -1, 0, 0]
[4, 2] [0, -1, 0, 0]
[4, 3] [0, -1, 0, 0]
[4, 4] [0, -1, 0, -1]


In [5]:
def equiprobable_random_policy(grid=None, discount=None, tol=1e-4):
    """Compute state values under the policy that picks every action equally often.

    Starting from all-zero values, every cell is repeatedly backed up with the
    average over its actions of ``reward + discount * value(next cell)``. All
    cells are updated from the previous sweep's values. Iteration stops once
    the summed absolute change of a sweep falls below ``tol``.

    Parameters
    ----------
    grid : list of lists of cells, optional
        Rectangular table whose cells expose ``reward`` (one entry per action)
        and ``next_position`` (the ``[row, col]`` reached by each action).
        Defaults to the module-level ``grid_word``.
    discount : float, optional
        Discount factor. Defaults to the module-level ``DISCOUNT``.
    tol : float, optional
        Convergence threshold on the summed absolute change between sweeps.

    Returns
    -------
    pandas.DataFrame
        The values from the final sweep, laid out like the grid.
    """
    if grid is None:
        grid = grid_word
    if discount is None:
        discount = DISCOUNT

    n_rows = len(grid)
    n_cols = len(grid[0]) if n_rows else 0
    values = np.zeros((n_rows, n_cols))
    while True:
        new_values = np.zeros((n_rows, n_cols))
        for i in range(n_rows):
            for j in range(n_cols):
                cell = grid[i][j]
                # Uniform policy: each of the cell's actions is equally likely.
                prob = 1.0 / len(cell.reward)
                # zip stops at the shorter list, so only entries that have a
                # matching reward are used.
                for reward, (x, y) in zip(cell.reward, cell.next_position):
                    new_values[i, j] += prob * (reward + discount * values[x, y])
        if np.sum(np.abs(new_values - values)) < tol:
            # Return the freshest sweep rather than the previous iterate.
            return pd.DataFrame(new_values)
        values = new_values

In [7]:
# Evaluate the equiprobable random policy on the grid world and display the
# resulting table of state values rounded to one decimal place (the bare last
# expression is what the cell renders).
df = equiprobable_random_policy()
df.round(1)

Unnamed: 0,0,1,2,3,4
0,3.3,8.8,4.4,5.3,1.5
1,1.5,3.0,2.3,1.9,0.5
2,0.1,0.7,0.7,0.4,-0.4
3,-1.0,-0.4,-0.4,-0.6,-1.2
4,-1.9,-1.3,-1.2,-1.4,-2.0


In [12]:
def optimal_policy(grid=None, discount=None, tol=1e-4):
    """Compute optimal state values by value iteration.

    Despite its name, this returns state values, not an action choice per cell.
    Starting from all-zero values, every cell is repeatedly backed up with the
    maximum over its actions of ``reward + discount * value(next cell)``. All
    cells are updated from the previous sweep's values. Iteration stops once
    the summed absolute change of a sweep falls below ``tol``.

    Parameters
    ----------
    grid : list of lists of cells, optional
        Rectangular table whose cells expose ``reward`` (one entry per action)
        and ``next_position`` (the ``[row, col]`` reached by each action).
        Defaults to the module-level ``grid_word``.
    discount : float, optional
        Discount factor. Defaults to the module-level ``DISCOUNT``.
    tol : float, optional
        Convergence threshold on the summed absolute change between sweeps.

    Returns
    -------
    pandas.DataFrame
        The values from the final sweep, laid out like the grid.
    """
    if grid is None:
        grid = grid_word
    if discount is None:
        discount = DISCOUNT

    n_rows = len(grid)
    n_cols = len(grid[0]) if n_rows else 0
    values = np.zeros((n_rows, n_cols))
    while True:
        new_values = np.zeros((n_rows, n_cols))
        for i in range(n_rows):
            for j in range(n_cols):
                cell = grid[i][j]
                # Greedy backup: keep the best one-step return over the actions.
                # zip stops at the shorter list, so only entries that have a
                # matching reward are used.
                new_values[i, j] = max(
                    reward + discount * values[x, y]
                    for reward, (x, y) in zip(cell.reward, cell.next_position)
                )
        if np.sum(np.abs(new_values - values)) < tol:
            # Return the freshest sweep rather than the previous iterate.
            return pd.DataFrame(new_values)
        values = new_values

In [13]:
# Run value iteration on the grid world and display the resulting table of
# state values rounded to one decimal place (the bare last expression is what
# the cell renders).
# NOTE: this rebinds `df`, replacing the random-policy table assigned earlier.
df = optimal_policy()
df.round(1)

Unnamed: 0,0,1,2,3,4
0,22.0,24.4,22.0,19.4,17.5
1,19.8,22.0,19.8,17.8,16.0
2,17.8,19.8,17.8,16.0,14.4
3,16.0,17.8,16.0,14.4,13.0
4,14.4,16.0,14.4,13.0,11.7
