In [78]:
import numpy as np

num_states = 15
num_actions = 4

prob = 0.02
gamma = 0.95
theta = 0.05

a_desc = ["UP", "DOWN", "LEFT", "RIGHT"]
# 0 - UP
# 1 - DOWN
# 2 - LEFT
# 3 - RIGHT

maze = [['W','W','W','W','W','W'],
        ['W','S','S','S','S','W'],
        ['W','S','W','B','S','W'],
        ['W','S','W','S','B','W'],
        ['W','S','W','W','G','W'],
        ['W','B','S','S','W','W'],
        ['W','W','W','W','W','W']]

s_loc = np.zeros((num_states,2))

# Initialize the reward function
r = np.zeros((num_actions,num_states, num_states))

# Initialize the transition probability function
p = np.zeros((num_actions,num_states, num_states))

# Initialize the probability of taking each action in each state
# pi = np.ones((num_states, num_actions, num_states))

maze_col = len(maze[0])
maze_row = len(maze)


# Get the reward for the next state (s_prime)
def getReward(s_prime):
    reward = 0
    
    # The location of the current state in the maze
    i = s_loc[s_prime,0]
    j = s_loc[s_prime,1]
    
    if(maze[i][j] == 'B'):
        reward += -10
    elif(maze[i][j] == 'G'):
        reward += 200
    
    reward += -1
    
    return  reward

# Map between the state and the location in the maze
def mapStateToMaze():
    s = 0
    for i in range(maze_row):
        for j in range(maze_col):
            if(maze[i][j] == 'S' or maze[i][j] == 'B' or maze[i][j] == 'G'):
                s_loc[s] = [i,j]
                s += 1

mapStateToMaze()
s_loc = s_loc.astype(int)

print("The state location in the maze are: ")
print(s_loc)

# Get state from the location in the maze
def getStateFromMaze(i,j):
    k = np.where((s_loc == (i,j)).all(axis=1))
    if(k[0].size == 0):
        return 0
    else:
        return k[0][0]
            
# Get the next state given the current state and the action
def getNextState(s, a):
    s_prime = 0
    
    # The location of the current state in the maze
    i = s_loc[s,0]
    j = s_loc[s,1]
    
    if(a == 0):
        # Action is UP
        if(maze[i-1][j] != 'W'):
            s_prime = getStateFromMaze(i-1,j)
        else:
            s_prime = s
    elif(a == 1):
        # Action is DOWN
        if(maze[i+1][j] != 'W'):
            s_prime = getStateFromMaze(i+1,j)
        else:
            s_prime = s
    elif(a == 2):
        # Action is LEFT
        if(maze[i][j-1] != 'W'):
            s_prime = getStateFromMaze(i,j-1)
        else:
            s_prime = s
    elif(a == 3):
        # Action is RIGHT
        if(maze[i][j+1] != 'W'):
            s_prime = getStateFromMaze(i,j+1)
        else:
            s_prime = s
    
    # Insert the reward of the going from s to s_prime by taking action a
    if(s != 11):
        r[a][s][s_prime] = getReward(s_prime)
    
    return s_prime

getAdjacentStates = lambda s: [getNextState(s,0), getNextState(s,1), getNextState(s,2), getNextState(s,3)]

rewardMatrix = lambda s: [getReward(s_prime) for s_prime in getAdjacentStates(s)]

# Calculate the probability of the other states given the current state and action
def getOtherState(s,a):
    for i in range(num_actions):
        if(i != a):
            s_prime = getNextState(s,i)
            p[a][s][s_prime] += prob/3


# Create the probability transition matrix
def createTransitionMatrix(prob):
    for a in range(num_actions):
        for s in range(num_states):
            next_state = getNextState(s,a)
            adjStates = getAdjacentStates(s)

            for s_prime in adjStates:
                if(s_prime == next_state):
                    p[a][s][s_prime] = 1-prob
                    # getOtherState(s,a)
                else:
                    p[a][s][s_prime] += prob/3



def createRewardMatrix():
    for a in range(num_actions):
        for s in range(num_states):
            adjStates = getAdjacentStates(s)
            for s_prime in adjStates:
                r[a][s][s_prime] = rewardMatrix(s)[a]

createTransitionMatrix(prob)
print("")




The state location in the maze are: 
[[1 1]
 [1 2]
 [1 3]
 [1 4]
 [2 1]
 [2 3]
 [2 4]
 [3 1]
 [3 3]
 [3 4]
 [4 1]
 [4 4]
 [5 1]
 [5 2]
 [5 3]]



# Problem 1

## Policy Evaluation

Value-State Function:

$
V^{\pi}(S) = \sum_{S'} P(S'|S,\pi(S))[R(S,\pi(S),S')+\gamma V^{\pi}(S')]
$

In [None]:
prob = 0.02
gamma = 0.95
theta = 0.05
    
# Initialize the transition probability function
p = np.zeros((num_actions,num_states, num_states))

createTransitionMatrix(prob)
createRewardMatrix()

pi_0 = np.zeros((num_states)).astype(int)
pi_0 += 2

def policy_evaluation(p, r, gamma, theta, initial_pi):
    v_0 = np.zeros((num_states))
    v_1 = np.zeros((num_states))

    while True:
        delta = 0
        # v_1 = np.zeros((num_states))
        for s in range(num_states):
            v_0[s] = v_1[s]
            pi = initial_pi[s]
            
            v_1[s] = 0
            
            for s_prime in range(num_states):
                if(p[pi][s][s_prime] != 0):
                    v_1[s] += (p[pi][s][s_prime] * (r[pi][s][s_prime] + (gamma * v_0[s_prime])))
                else:
                    v_1[s] += 0
            print(v_1)

            delta = max(delta, abs(v_1[s] - v_0[s]))
                
            if(delta < theta):
                return v_1,delta


# Generate method for policy improvement in reinforcement learning
def policy_improvement(p, r, gamma, v, pi):
    policy_stable = True
    for s in range(num_states):
        old_action = pi[s]
        q = np.zeros((num_actions))
        
        for a in range(num_actions):
            for s_prime in range(num_states):
                if(p[a][s][s_prime] != 0):
                    q[a] += (p[a][s][s_prime] * (r[a][s][s_prime] + (gamma * v[s_prime])))
                else:
                    q[a] += 0
        
        pi[s] = np.argmax(q)
        
        if(old_action != pi[s]):
            policy_stable = False
            
    return pi, policy_stable

v,delta = policy_evaluation(p,r,gamma,theta,pi_0)

# pi, policy_stable = policy_improvement(p,r,gamma,v,pi_0)
# print(pi)
# print(policy_stable)

# while(policy_stable == False):
#     v, delta = policy_evaluation(p,r,gamma,theta,pi)
#     pi, policy_stable = policy_improvement(p,r,gamma,v,pi)
#     print(np.average(v))
#     print(pi)
#     print(policy_stable)

## Value Iteration Backup

$
V_{k+1}(S) = max_{a \in A} \sum_{S'} \; P(S'|S,a) \; [R(S,a,S')+ \gamma \; V_{k}(S')]
$

In [80]:
prob = 0.02
gamma = 0.95
theta = 0.05

# Initialize the transition probability function
p = np.zeros((num_actions,num_states, num_states))

createTransitionMatrix(prob)
createRewardMatrix()

def vib(p, r, gamma, theta):
    v_0 = np.zeros((num_actions, num_states))
    v_1 = np.zeros((num_actions, num_states))

    while True:
        delta = 0
        v_max = 0

        for s in range(num_states):
            
            # print("")
            
            for a in range(num_actions):
                v_0[a][s] = v_1[a][s]
                v_1[a][s] = 0

                for s_prime in range(num_states):
                    
                    if(p[a][s][s_prime] != 0):
                        v_1[a][s] += (p[a][s][s_prime] * (r[a][s][s_prime] + (gamma * v_0[a][s_prime])))
                    else:
                        v_1[a][s] += 0
        
            v_max = np.max(v_1[:,s])
            for i in range(num_actions):
                if(i==0):
                    v_max = v_1[i][s]
                else:
                    v_max = max(v_max, v_1[i][s])
                
                v_1[i][s] = v_max
                delta = max(delta, abs(v_0[i][s] - v_1[i][s]))
            
            if(delta < theta):
                return v_1,delta
        print(v_0)
        print(v_1)


v_result,delta = vib(p,r,gamma,theta)
# print(v_result)

pi = np.zeros((num_states)).astype(int)

def optimal_policy(p, r, gamma, v):
    # policy_stable = True
    for s in range(num_states):
        # old_action = pi[s]
        q = np.zeros((num_actions))
        
        for a in range(num_actions):
            for s_prime in range(num_states):
                if(p[a][s][s_prime] != 0):
                    q[a] += (p[a][s][s_prime] * (r[a][s][s_prime] + (gamma * v[a][s_prime])))
                else:
                    q[a] += 0
        
        pi[s] = np.argmax(q)
        
        # if(old_action != pi[s]):
        #     policy_stable = False
            
    return pi

pi = optimal_policy(p,r,gamma,v_result)
for i in range(num_states):
    print(s_loc[i], pi[i], a_desc[pi[i]])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[ -0.99333333  -0.99333333  -1.          -0.99333333  -1.
   -1.          -1.          -1.         -11.          -1.
   -1.         -11.          -1.          -0.99333333  -0.98666667]
 [ -0.99333333  -0.99333333  -1.          -0.99333333  -1.
   -1.          -1.          -1.          -0.99333333 199.
   -1.         196.34666667  -1.          -0.99333333  -0.98666667]
 [ -0.99333333  -0.99333333  -1.          -0.99333333  -0.99333333
   -1.          -1.          -0.99333333  -0.99333333 199.
   -0.99333333 196.34666667  -1.          -0.99333333  -0.98666667]
 [ -0.99333333  -0.99333333  -1.          -0.99333333  -0.99333333
   -1.          -1.          -0.99333333  -0.99333333 199.
   -0.99333333 196.34666667  -1.          -0.99333333  -0.98666667]]
[[ -0.99333333  -0.99333333  -1.          -0.9