In [3]:
import numpy as np

In [11]:
class gridWorld():
    """Square gridworld whose optimal state values are found by value iteration.

    States are ``(row, col)`` tuples that index ``grid_values``. 'N'/'S'
    decrease/increase the row and 'W'/'E' decrease/increase the column. A move
    that would leave the grid keeps the agent where it is and costs -1; every
    other ordinary move has reward 0. Two special states override this for
    *every* action: (0, 1) jumps to (4, 1) with reward 10 and (0, 3) jumps to
    (2, 3) with reward 5. The jump targets are fixed, so ``grid_size`` must be
    at least 5 for them to be valid indices into ``grid_values``.
    """

    # state -> (next_state, reward), applied regardless of the action taken.
    _SPECIAL_TRANSITIONS = {
        (0, 1): ((4, 1), 10),
        (0, 3): ((2, 3), 5),
    }

    # (row offset, column offset) of each supported action.
    _MOVES = {'N': (-1, 0), 'S': (1, 0), 'E': (0, 1), 'W': (0, -1)}

    def __init__(self, grid_size=5, actions=None, gamma=0.9):
        """
        grid_size: number of rows and of columns of the grid.
        actions:   iterable of action characters to maximise over; defaults to
                   ['N', 'S', 'E', 'W'].
        gamma:     discount factor applied to the successor state's value.
        """
        self.grid_size = grid_size
        # A fresh list per instance: a list literal used as the default value
        # would be shared (and mutated) across every instance.
        self.actions = ['N', 'S', 'E', 'W'] if actions is None else actions
        self.grid_values = np.zeros((grid_size, grid_size))
        # Not read by any method of this class; kept so existing code that
        # accesses the attribute keeps working.
        self.prev_values = np.zeros((grid_size, grid_size))
        self.gamma = gamma

    def get_new_state_reward(self, state, action):
        """
        input state tuple and action char

        Returns state, reward

        Raises ValueError if the action is not one of 'N', 'S', 'E', 'W'.
        """
        if action not in self._MOVES:
            raise ValueError(
                "Unknown state or action: {} {}".format(state, action)
            )

        x, y = state

        # Special states come first: every action taken from them triggers the
        # jump, including the one that would otherwise run into the wall.
        special = self._SPECIAL_TRANSITIONS.get((x, y))
        if special is not None:
            return special

        dx, dy = self._MOVES[action]
        new_x, new_y = x + dx, y + dy

        # Stepping off the grid: stay in place and pay the penalty.
        if not (0 <= new_x < self.grid_size and 0 <= new_y < self.grid_size):
            return (x, y), -1

        return (new_x, new_y), 0

    def get_new_value_estimate(self, i, j):
        """Return max over actions of ``reward + gamma * V(next_state)`` for state (i, j).

        Uses the current contents of ``grid_values`` as V. Returns ``-inf`` if
        ``self.actions`` is empty.
        """
        best = -np.inf
        for action in self.actions:
            (next_i, next_j), reward = self.get_new_state_reward((i, j), action)
            candidate = self.gamma * self.grid_values[next_i, next_j] + reward
            if best < candidate:
                best = candidate

        return best

    def value_iteration(self):
        """Run one in-place sweep over all states.

        Each state is overwritten as soon as its new estimate is computed, so
        states later in the sweep already see the updated values.

        Returns the largest absolute change made to any state value during the
        sweep (0.0 when nothing changed).
        """
        max_delta = 0.0
        for i in range(self.grid_size):
            for j in range(self.grid_size):
                new_value = self.get_new_value_estimate(i, j)
                delta = abs(new_value - self.grid_values[i, j])
                if delta > max_delta:
                    max_delta = delta
                self.grid_values[i, j] = new_value
        return max_delta

    def get_grid(self, max_iterations=10000, tol=0.0):
        """Sweep until converged (or ``max_iterations`` is reached) and print the values.

        max_iterations: upper bound on the number of sweeps.
        tol:            stop once a sweep changes no value by more than this.
                        With the default 0.0 the loop only stops after a sweep
                        that left every value unchanged; any further sweep
                        would be a no-op, so the result equals running all
                        ``max_iterations`` sweeps.
        """
        for _ in range(max_iterations):
            if self.value_iteration() <= tol:
                break
        print(np.round(self.grid_values, decimals=1), "\n")
            

In [5]:
"""
Q2. 

We solve the system of 25 linear equations (one equation per state) written
as Ax = b, where A is the matrix of coefficients of the equations, x is the
vector of all state values, and b is the vector of constants obtained from the
system of equations.

This can be done using the numpy library.
"""
import numpy as np
from numpy import linalg

# Load the coefficient matrix (one comma-separated row per equation) and the
# constant vector (one value per line). np.loadtxt parses the text directly
# and yields float64, so no precision is thrown away before the solve.
A = np.loadtxt('variable_coeffs.csv', delimiter=',', ndmin=2)
b = np.loadtxt('constants.csv', delimiter=',', ndmin=1)

# One equation and one unknown per state of the 5x5 grid.
assert A.shape == (25, 25), A.shape
assert b.shape == (25,), b.shape

x = linalg.solve(A, b)
# Lay the 25 state values out as the 5x5 grid for display.
x = np.reshape(x, (5, 5))
print(np.round(x, decimals=1))

[[ 3.3  8.8  4.4  5.3  1.5]
 [ 1.5  3.   2.3  1.9  0.5]
 [ 0.1  0.7  0.7  0.4 -0.4]
 [-1.  -0.4 -0.4 -0.6 -1.2]
 [-1.9 -1.3 -1.2 -1.4 -2. ]]


In [12]:
"""
Q4. 

We solve for the optimal state values by picking, in each state, the action with the maximum expected
return and solving the Bellman optimality equations using value iteration. I iterate for up to 10,000
sweeps: since gamma^10,000 ~ 0, the state values will have converged by then.
"""
# Build the default world (5x5 grid, gamma = 0.9), run value iteration on it
# and print the resulting state values rounded to one decimal.
A = gridWorld(grid_size=5, gamma=0.9)
A.get_grid()

[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]] 

