In [34]:
import random

# Sum of reward and discount should equal to 1 and rewards for non-terminal state is constant
REWARD = -0.10 
Y = 0.90
MAX_ERROR = 10**(-3)

# Setting up the environment initially
NUM_ACTIONS = 4

# each element represents each direction the agent(cabi) moves to pickup or drop
ACTIONS = [(1, 0), (0, -1), (-1, 0), (0, 1)] 
rows = 3
Cols = 4
UTILITY = [[0, 0, 0, 1], [0, 0, 0, -1], [0, 0, 0, 0], [0, 0, 0, 0]]

# Visualising the enviroment
def Environment(arr, policy=False):
    res = ""
    for row in range(rows):
        res += "|"
        for column in range(Cols):
            # initialising position to column 1 row 1 as the initial poistion cannot be 0,0
            if row == column == 1:
                val = "X"

            # Setting the boundaries for the rows and columns to be used as steps 
            elif row <= 1 and column == 5:
                val = "+1" if row == 0 else "-1"
            else:
                val = ["v", "<", "^", ">"][arr[row][column]]
            res += " " + val[:5].ljust(5) + " |" # format
        res += "\n"
    return res

#creating a policy randomly
policy = [[random.randint(0, 3) for j in range(Cols)] for i in range(rows)] 
SOLUTION = ""

# Performing some simplified value iteration steps to get an approximation of the utilities
def EvaluatePolicy(policy, util):
    while True:
        n_util = [[0, 0, 0, 1], [0, 0, 0, -1], [0, 0, 0, 0], [0, 0, 0, 0]]
        error = 0

        # Looping through the rows and columns to help evaluate the policy
        for row in range(rows):
            for column in range(Cols):
                if (row <= 1 and column == 3) or (row == column == 1):
                    continue
                n_util[row][column] = calculateUtility(util, row, column, policy[row][column]) # simplified Bellman update
                error = max(error, abs(n_util[row][column]-util[row][column]))
        util = n_util
        if error < MAX_ERROR * (1-Y) / Y:
            break
    return util    

# Getting the utility of the state reached by performing the given action from the given state
def getUtility(util, row, column, action):
    r, c = ACTIONS[action]
    newR, newC = row + r, column + c
    if newR < 0 or newC < 0 or newR >= rows or newC >= Cols or (newR == newC == 1): # collide with the boundary or the wall
        return util[row][column]
    else:
        return util[newR][newC]

# Calculating the utility of a state given the action
def calculateUtility(util, row, column, action):
    u = REWARD
    u += 0.1 * Y * getUtility(util, row, column, (action-1)%4)
    u += 0.8 * Y * getUtility(util, row, column, action)
    u += 0.1 * Y * getUtility(util, row, column, (action+1)%4)
    return u

def  policyIter(policy, UTILITY, cab, pass1, pass2):
    steps = 0;
    while True:
        UTILITY = EvaluatePolicy(policy, UTILITY)
        unchanged = True
        
        changes = random.randint(0, 1)

        path = "Cab location : " , cab ," => "
        # Iterating through the policy
        for row in range(rows):
            for column in range(Cols):
                
                
                if (row <= 1 and column == 3) or (row == column == 1):
                    continue
                maxAction, maxU = None, -float("inf")

                for action in range(NUM_ACTIONS):
                    
                    u = calculateUtility(UTILITY, row, column, action)

                    if u > maxU:
                        maxAction, maxU = action, u
                        
                if maxU > calculateUtility(UTILITY, row, column, policy[row][column]):
                    policy[row][column] = maxAction # the action that maximizes the utility
                    unchanged = False
                    steps = steps +1
        if unchanged:
            break

    pass1_drop = [random.randint(1, 5),random.randint(1, 5)]
    pass2_drop = [random.randint(1, 5),random.randint(1, 5)]
    if changes == 0:
        changes = random.randint(0, 1)
        path = ''.join(map(str, path)) , 
        "passanger2 pickup: ",pass2," => "
        
        if changes == 0:
            path = ''.join(map(str, path)) + "passanger 1 pickup: ",pass1," => "
            path = ''.join(map(str, path)) , "passanger 2 drop: ",pass2_drop," => "
            path = ''.join(map(str, path)) , "passanger 1 drop: ",pass1_drop
        elif changes == 1:
            path = ''.join(map(str, path)) , "passanger 2 drop: ",pass2_drop," => "
            path = ''.join(map(str, path)) , "passanger 1 pickup: ",pass1," => "
            path = ''.join(map(str, path)) , "passanger 1 drop: ",pass1_drop
    else:
        path = ''.join(map(str, path)) , "passanger 1 pickup: ",pass1," => "
        changes = random.randint(0, 1)

        if changes == 0:
            path = ''.join(map(str, path)) , "passanger 2 pickup: ",pass2," => "
            path = ''.join(map(str, path)) , "passanger 1 drop: ",pass1_drop," => "
            path = ''.join(map(str, path)) , "passanger 2 drop : ",pass2_drop
        elif changes == 1:
            path = ''.join(map(str, path)) , "passanger 1 drop: ",pass1_drop," => "
            path = ''.join(map(str, path)) , "passsanger 2 pickup: ",pass2," => "
            path = ''.join(map(str, path)) , "passanger 2 drop: ",pass2_drop

    Environment(policy)
    print("Steps = ",steps)
    listToStr = ' '.join([str(elem) for elem in path])
    print("The path is: ", path)
    print("The optimal path is:")
    return policy

if __name__ == "__main__":
    
    cab = [2,4]
    passanger1 = [1,5]
    passanger2 = [5,1]
    print("Episode 1 from figure 1:")
    print("The cab is located at : ", cab)
    print("The passenger 1 is located at : ", passanger1)
    print("The second 2 is located at : ", passanger2)
    
    # Getting the original policy from figure one.
    policy = [[3, 1, 2, 0], [1, 1, 2, 3], [0, 3, 0, 3]]

    # Printing the optimal path
    policy = policyIter(policy, UTILITY, cab, passanger1, passanger2)
    SOLUTION = Environment(policy)
    print(SOLUTION)

    # Iterating throuh the policy.
    for i in range(4):
        cab = [random.randint(1, 5),random.randint(1, 5)]
        passanger1 = [random.randint(1, 5),random.randint(1, 5)]
        passanger2 = [random.randint(1, 5),random.randint(1, 5)]

        print("Episode ", i+2,":")
        print("The Cab is located at :  ", cab)
        print("Passanger 1 is located at: ", passanger1)
        print("Pasenger 2 is located at:  ", passanger2)
        policy = [[random.randint(0, 3) for j in range(Cols)] for i in range(rows)]
        # Printing the optimal policy

        policy = policyIter(policy, UTILITY, cab, passanger1, passanger2)
        SOLUTION = Environment(policy)
        print(SOLUTION)

Episode 1 from figure 1:
The cab is located at :  [2, 4]
The passenger 1 is located at :  [1, 5]
The second 2 is located at :  [5, 1]
Steps =  10
The path is:  ('Cab location : [2, 4] => passanger 2 drop: [5, 3] => passanger 1 pickup: [1, 5] => ', 'passanger 1 drop: ', [2, 2])
The optimal path is:
| >     | >     | >     | v     |
| ^     | X     | ^     | >     |
| ^     | >     | ^     | <     |

Episode  2 :
The Cab is located at :   [1, 2]
Passanger 1 is located at:  [5, 1]
Pasenger 2 is located at:   [2, 2]
Steps =  17
The path is:  ('Cab location : [1, 2] => passanger 1 pickup: [5, 1] => passanger 2 drop: [5, 4] => ', 'passanger 1 drop: ', [5, 2])
The optimal path is:
| >     | >     | >     | <     |
| ^     | X     | ^     | ^     |
| ^     | >     | ^     | <     |

Episode  3 :
The Cab is located at :   [3, 5]
Passanger 1 is located at:  [1, 1]
Pasenger 2 is located at:   [5, 3]
Steps =  11
The path is:  ('Cab location : [3, 5] => passanger 1 pickup: [1, 1] => passanger 2 pic