In [8]:
import numpy as np

num_states = 14
num_actions = 4
gamma = 1
theta = 0.5
prob = 0.02

a_desc = ["UP", "DOWN", "LEFT", "RIGHT"]
# 0 - UP
# 1 - DOWN
# 2 - LEFT
# 3 - RIGHT

maze = [['W','W','W','W','W','W'],
        ['W','S','S','S','S','W'],
        ['W','S','W','B','S','W'],
        ['W','S','W','S','B','W'],
        ['W','S','W','W','G','W'],
        ['W','B','S','S','W','W'],
        ['W','W','W','W','W','W']]

s_loc = np.zeros((num_states,2))
# print(s_loc)

# Initialize the reward function
r = np.zeros((num_actions,num_states, num_states))

# Initialize the transition probability function
p = np.zeros((num_actions,num_states, num_states))

# Initialize the probability of taking each action in each state
# pi = np.ones((num_states, num_actions, num_states))

maze_col = len(maze[0])
maze_row = len(maze)


# Get the reward for the next state (s_prime)
def getReward(s_prime):
    reward = 0
    
    # The location of the current state in the maze
    i = s_loc[s_prime,0]
    j = s_loc[s_prime,1]
    
    if(maze[i][j] == 'B'):
        reward += -10
    elif(maze[i][j] == 'G'):
        reward += 200
    
    reward += -1
    
    return  reward

# Map between the state and the location in the maze
def mapStateToMaze():
    s = 0
    for i in range(maze_row):
        for j in range(maze_col):
            if(maze[i][j] == 'S' or maze[i][j] == 'B'):
                s_loc[s] = [i,j]
                s += 1

mapStateToMaze()
s_loc = s_loc.astype(int)

print("The state location in the maze are: ")
print(s_loc)

# Get state from the location in the maze
def getStateFromMaze(i,j):
    k = np.where((s_loc == (i,j)).all(axis=1))
    if(k[0].size == 0):
        return 0
    else:
        return k[0][0]
            
# Get the next state given the current state and the action
def getNextState(s, a):
    s_prime = 0
    
    # The location of the current state in the maze
    i = s_loc[s,0]
    j = s_loc[s,1]
    
    if(a == 0):
        # Action is UP
        if(maze[i-1][j] != 'W'):
            s_prime = getStateFromMaze(i-1,j)
        else:
            s_prime = s
    elif(a == 1):
        # Action is DOWN
        if(maze[i+1][j] != 'W'):
            s_prime = getStateFromMaze(i+1,j)
        else:
            s_prime = s
    elif(a == 2):
        # Action is LEFT
        if(maze[i][j-1] != 'W'):
            s_prime = getStateFromMaze(i,j-1)
        else:
            s_prime = s
    elif(a == 3):
        # Action is RIGHT
        if(maze[i][j+1] != 'W'):
            s_prime = getStateFromMaze(i,j+1)
        else:
            s_prime = s
    
    r[a][s][s_prime] = getReward(s_prime)
    
    return s_prime



The state location in the maze are: 
[[1 1]
 [1 2]
 [1 3]
 [1 4]
 [2 1]
 [2 3]
 [2 4]
 [3 1]
 [3 3]
 [3 4]
 [4 1]
 [5 1]
 [5 2]
 [5 3]]


In [9]:

# Calculate the probability of the other states given the current state and action
def getOtherState(s,a):
    for i in range(num_actions):
        if(i != a):
            s_prime = getNextState(s,i)
            p[a][s][s_prime] += prob/3


# Create the probability transition matrix
def createTransitionMatrix():
    for a in range(num_actions):
        for s in range(num_states):
            for s_prime in range(num_states):
                if(s_prime == getNextState(s,a) and s_prime != s):
                    p[a][s][s_prime] = 1-prob
                    getOtherState(s,a)
                elif(getNextState(s,a) == s):
                    p[a][s][s] = 1-prob
                    
createTransitionMatrix()
print("")




# Problem 1

## Vector Form Policy Iteration

Value-State Function:

$
\begin{align}
V^{\pi}(S) = \sum_{S'} P(S'|S,\pi(S))[R(S,\pi(S),S')+\gamma V^{\pi}(S')]
\end{align}
$

$
p = 0.02, \gamma = 0.95, \theta = 0.01
$

In [11]:
prob = 0.1
gamma = 0.9 
theta = 0.5
    
# Initialize the transition probability function
p = np.zeros((num_actions,num_states, num_states))

createTransitionMatrix()

pi_0 = np.zeros((num_states)).astype(int)
v_0 = np.zeros((num_states))
v_1 = np.zeros((num_states))

pi_0 += 2

while True:
    delta = 0
    # v_1 = np.zeros((num_states))
    for s in range(num_states):
        v_0[s] = v_1[s]
        pi = pi_0[s]
        
        v_1[s] = 0
        
        for s_prime in range(num_states):
            if(p[pi][s][s_prime] != 0):
                v_1[s] += (p[pi][s][s_prime] * (r[pi][s][s_prime] + (gamma * v_0[s_prime])))
            else:
                v_1[s] += 0
                
        delta = max(delta, abs(v_0[s] - v_1[s]))
            
        if(delta < theta):
            break
    if(delta < theta):
        break


print("The state-value function is: ")
print(v_1)
print(delta)


The state-value function is: 
[ -2.6977869  -2.34747    -2.63691    -2.63934    -2.21949   -24.41439
 -25.03917    -2.21949    -2.21949    -2.87505    -2.21949   -24.41439
 -25.55217   -16.70553  ]
0.47829690000000014
