<a href="https://colab.research.google.com/github/divyasree-coder/AIML--2025/blob/main/lab_1mdps.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Value Iteration and Policy Iteration for a simple MDP

import numpy as np

In [12]:
# Define MDP parameters
# Discount factor applied to future rewards.
gamma = 0.9

# Three states in a row, two actions per state.
states = list(range(3))
actions = [0, 1]  # 0: left, 1: right

# Transition model: P[state][action] is a list of
# (probability, next_state, reward) outcomes. Every transition here is
# deterministic; the only non-zero reward is for taking "right" in state 1.
P = {
    0: {
        0: [(1.0, 0, 0)],  # left at the left edge: stay in state 0
        1: [(1.0, 1, 0)],
    },
    1: {
        0: [(1.0, 0, 0)],
        1: [(1.0, 2, 1)],  # reward 1 for moving from state 1 to state 2
    },
    2: {
        0: [(1.0, 1, 0)],
        1: [(1.0, 2, 0)],  # right at the right edge: stay in state 2
    },
}

def value_iteration(P, states, actions, gamma, theta=1e-5):
    """Run in-place value iteration and extract the greedy policy.

    Parameters
    ----------
    P : mapping
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward)`` tuples.
    states : sequence of int
        State ids; they are used directly as indices into the value array.
    actions : sequence
        Action ids used as keys into ``P[s]``.
    gamma : float
        Discount factor.
    theta : float, optional
        Sweeps stop once the largest per-state value change in a sweep is
        below this threshold.

    Returns
    -------
    V : numpy.ndarray
        Value estimate per state.
    policy : numpy.ndarray of int
        For each state, the position (within ``actions``) of the action with
        the highest one-step lookahead value; ties go to the first such action.
    """
    def lookahead(state):
        # Expected one-step return of every action from `state`, using the
        # current contents of V (captured from the enclosing scope).
        return [
            sum(p * (r + gamma * V[nxt]) for p, nxt, r in P[state][action])
            for action in actions
        ]

    V = np.zeros(len(states))
    converged = False
    while not converged:
        largest_change = 0
        for state in states:
            previous = V[state]
            # In-place update: later states in this sweep already see it.
            V[state] = max(lookahead(state))
            largest_change = max(largest_change, abs(previous - V[state]))
        converged = largest_change < theta

    # Derive policy: act greedily with respect to the converged values.
    policy = np.zeros(len(states), dtype=int)
    for state in states:
        policy[state] = np.argmax(lookahead(state))
    return V, policy

def policy_iteration(P, states, actions, gamma, theta=1e-5):
    """Solve the MDP by policy iteration.

    Alternates iterative policy evaluation (in-place sweeps until the largest
    per-state change in a sweep is below ``theta``) with greedy policy
    improvement, and stops when an improvement pass changes no action.

    Parameters
    ----------
    P : mapping
        ``P[s][a]`` is an iterable of ``(prob, next_state, reward)`` tuples.
    states : sequence of int
        State ids; they are used directly as indices into the value and
        policy arrays.
    actions : sequence
        Action ids. The policy stores ``np.argmax`` positions within this
        sequence and uses them as keys into ``P[s]``, so the actions are
        expected to be ``0, 1, ..., len(actions) - 1`` in order.
    gamma : float
        Discount factor.
    theta : float, optional
        Convergence threshold for the policy-evaluation sweeps. Defaults to
        ``1e-5``, the value that was previously hard-coded.

    Returns
    -------
    V : numpy.ndarray
        Value estimate per state under the final policy.
    policy : numpy.ndarray of int
        Greedy action per state; the starting policy is action 0 everywhere.

    Raises
    ------
    ValueError
        If ``theta`` is not positive (the evaluation loop could never stop).
    """
    if theta <= 0:
        raise ValueError("theta must be positive")

    policy = np.zeros(len(states), dtype=int)
    V = np.zeros(len(states))
    while True:
        # Policy Evaluation: V is kept from the previous round, so each
        # evaluation starts from the last policy's values.
        while True:
            delta = 0
            for s in states:
                v = V[s]
                a = policy[s]
                V[s] = sum(prob * (reward + gamma * V[next_state])
                           for prob, next_state, reward in P[s][a])
                delta = max(delta, abs(v - V[s]))
            if delta < theta:
                break
        # Policy Improvement: switch each state to its greedy action.
        policy_stable = True
        for s in states:
            old_action = policy[s]
            q = [sum(prob * (reward + gamma * V[next_state])
                     for prob, next_state, reward in P[s][a])
                 for a in actions]
            # np.argmax returns the first maximiser, so ties are broken the
            # same way on every pass.
            policy[s] = np.argmax(q)
            if old_action != policy[s]:
                policy_stable = False
        if policy_stable:
            break
    return V, policy


In [13]:
# Run Value Iteration
# Solve the MDP defined above, then show the state values and greedy policy.
V_vi, policy_vi = value_iteration(
    P=P,
    states=states,
    actions=actions,
    gamma=gamma,
)
print("Value Iteration - Values:", V_vi)
print("Value Iteration - Policy:", policy_vi)

Value Iteration - Values: [4.73680657 5.26312591 4.73681332]
Value Iteration - Policy: [1 1 0]


In [15]:
# Run Policy Iteration
# Solve the same MDP with policy iteration and show its values and policy.
V_pi, policy_pi = policy_iteration(
    P=P,
    states=states,
    actions=actions,
    gamma=gamma,
)
print("Policy Iteration - Values:", V_pi)
print("Policy Iteration - Policy:", policy_pi)

Policy Iteration - Values: [4.73681332 5.26313199 4.73681879]
Policy Iteration - Policy: [1 1 0]
