## MDP

### Initialization

In [1]:
import numpy as np

# Action and state labels. A label's position in its list is the integer
# index used for it in the tables defined further down.
actions = ["Left", "Right", "Load", "Unload"]
states = ["1U", "2U", "3U", "1L", "2L", "3L"]

# Reverse lookups: label -> integer index.
actionIndices = {label: position for position, label in enumerate(actions)}
stateIndices = {label: position for position, label in enumerate(states)}

### Transition probability function

In [2]:
# One 6x6 transition matrix per action. Row i is the probability distribution
# over next states when the action is taken in states[i]; columns follow the
# same order as `states` (1U, 2U, 3U, 1L, 2L, 3L). Every row sums to 1.
# Presumably the digit is a position and U/L mean unloaded/loaded -- TODO confirm.

# "Left": from 2x/3x the state number drops by one with probability 0.9 and
# stays the same with probability 0.1; from 1x it stays at 1.
# NOTE(review): the rows for 1L/2L/3L are copies of the 1U/2U/3U rows and put
# all their mass on the U columns, so "Left" from an L state always ends in a
# U state -- confirm this is intended.
TLeft = np.array([
    [1, 0, 0, 0, 0, 0],  # from 1U
    [0.9, 0.1, 0, 0, 0, 0],  # from 2U
    [0, 0.9, 0.1, 0, 0, 0],  # from 3U
    [1, 0, 0, 0, 0, 0],  # from 1L
    [0.9, 0.1, 0, 0, 0, 0],  # from 2L
    [0, 0.9, 0.1, 0, 0, 0]  # from 3L
])

# "Right": mirror image of "Left" -- the state number rises by one with
# probability 0.9 and stays the same with probability 0.1; from 3x it stays at 3.
# NOTE(review): as with "Left", the L rows lead only to U states -- confirm.
TRight = np.array([
    [0.1, 0.9, 0, 0, 0, 0],  # from 1U
    [0, 0.1, 0.9, 0, 0, 0],  # from 2U
    [0, 0, 1, 0, 0, 0],  # from 3U
    [0.1, 0.9, 0, 0, 0, 0],  # from 1L
    [0, 0.1, 0.9, 0, 0, 0],  # from 2L
    [0, 0, 1, 0, 0, 0]  # from 3L
])

# "Load": deterministic swap between each U state and the L state with the
# same number (1U <-> 1L, 2U <-> 2L, 3U <-> 3L).
# NOTE(review): "Load" taken in an L state moves back to the U state -- confirm.
TLoad = np.array([
    [0, 0, 0, 1, 0, 0],  # from 1U
    [0, 0, 0, 0, 1, 0],  # from 2U
    [0, 0, 0, 0, 0, 1],  # from 3U
    [1, 0, 0, 0, 0, 0],  # from 1L
    [0, 1, 0, 0, 0, 0],  # from 2L
    [0, 0, 1, 0, 0, 0]  # from 3L
])

# "Unload": identity matrix -- the state never changes under this action.
# NOTE(review): in particular an L state stays an L state after "Unload" --
# confirm this is intended.
TUnload= np.array([
    [1, 0, 0, 0, 0, 0],  # from 1U
    [0, 1, 0, 0, 0, 0],  # from 2U
    [0, 0, 1, 0, 0, 0],  # from 3U
    [0, 0, 0, 1, 0, 0],  # from 1L
    [0, 0, 0, 0, 1, 0],  # from 2L
    [0, 0, 0, 0, 0, 1]  # from 3L
])

In [3]:
# Action label -> its 6x6 transition matrix.
actionMatrices = dict(Left=TLeft, Right=TRight, Load=TLoad, Unload=TUnload)

In [4]:
def transition(state, action):
    """Return the next-state distribution for taking `action` in `state`.

    Args:
        state: a label from `states`.
        action: a key of `actionMatrices`.

    Returns:
        A list of ``(nextState, probability)`` tuples, one per entry of the
        matching matrix row and in the same order as `states`.
        Zero-probability entries are included.

    Raises:
        KeyError: if `state` or `action` is not a known label.
    """
    rowIdx = stateIndices[state]
    row = actionMatrices[action][rowIdx]
    return [(states[col], p) for col, p in enumerate(row)]

### reward function

In [5]:
# Reward table indexed as Reward[state index, action index]. Every entry is 0
# except row 5 / column 3 (the last state and last action), which is worth 10.
Reward = np.zeros((6, 4), dtype=int)
Reward[5, 3] = 10

def reward(state, action):
    """Look up the immediate reward for taking `action` in `state`.

    Both arguments are labels; they are translated to row/column indices of
    the `Reward` table through `stateIndices` and `actionIndices`.

    Raises:
        KeyError: if either label is unknown.
    """
    row = stateIndices[state]
    col = actionIndices[action]
    return Reward[row, col]

### Update Q using the Bellman optimality equation

$$Q^*(s, a) = R(s, a) + \gamma \sum_{s' \in S} p(s' \mid s, a) \max_{a' \in A} Q^*(s', a')$$

In [6]:
gamma = 0.95  # discount factor applied to the expected future value


def Q(state, action, QValues):
    """Compute one Bellman backup for the pair (`state`, `action`).

    Args:
        state: a label from `states`.
        action: a label from `actions`.
        QValues: 2-D table indexed ``[state index, action index]`` holding the
            current Q estimates; it is only read here.

    Returns:
        ``Reward[s, a] + gamma * sum_s' p(s'|s, a) * max_a' QValues[s', a']``,
        where successors with non-positive probability are skipped.
    """
    immediate = Reward[stateIndices[state], actionIndices[action]]

    # Expected best Q-value over the successor states that can be reached.
    expectedFuture = sum(
        p * max(QValues[stateIndices[successor]])
        for successor, p in transition(state, action)
        if p > 0
    )

    return immediate + gamma * expectedFuture

def updateQ(QValues):
    """Run one sweep of Bellman backups over every (state, action) pair.

    The table is overwritten in place, entry by entry, so backups computed
    later in the sweep already see the values written earlier in it.

    Args:
        QValues: 2-D table indexed ``[state index, action index]``.

    Returns:
        The same `QValues` object that was passed in, after the sweep.
    """
    for state in states:
        row = stateIndices[state]
        for action in actions:
            QValues[row, actionIndices[action]] = Q(state, action, QValues)
    return QValues



### Results: optimal policy for each state

In [7]:
# Start from an all-zero table: one row per state, one column per action.
QValues = np.zeros((len(states), len(actions)))

# Apply the Bellman sweep a fixed number of times.
for iteration in range(1000):
    QValues = updateQ(QValues)

print("Optimal Policy:")

# Greedy action per row: index of the largest Q-value (first one on ties).
greedyActionIdx = QValues.argmax(axis=1)
for state in states:
    bestActionIdx = greedyActionIdx[stateIndices[state]]
    bestAction = actions[bestActionIdx]
    print(f"State {state}: {bestAction}")


Optimal Policy:
State 1U: Right
State 2U: Right
State 3U: Load
State 1L: Right
State 2L: Right
State 3L: Unload
