<a href="https://colab.research.google.com/github/dandamudi-geeta/Reinforcement-Learning/blob/main/2348512_RL(Lab4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np


In [3]:
class MDP:
    """Plain container describing a finite Markov decision process.

    The constructor stores its four arguments as attributes without copying
    or validating them.
    """

    def __init__(self, states, actions, transition_probabilities, rewards):
        # Collection of state identifiers.
        self.states = states
        # Collection of action identifiers.
        self.actions = actions
        # Nested mapping indexed as [state][action][next_state] -> probability.
        self.transition_probabilities = transition_probabilities
        # Nested mapping indexed as [state][action] -> reward.
        self.rewards = rewards

    def get_actions(self, state):
        """Return the action collection; ``state`` does not affect the result."""
        return self.actions

    def get_next_states(self, state, action):
        """Return a list of the successor states recorded for (state, action)."""
        successors = self.transition_probabilities[state][action]
        return list(successors.keys())

In [4]:
class Policy:
    """Deterministic policy stored as a state -> action dictionary."""

    def __init__(self, states, actions):
        # Start from an arbitrary policy: one np.random.choice draw per state,
        # taken in the order the states are iterated.
        self.policy = {}
        for state in states:
            self.policy[state] = np.random.choice(actions)

    def select_action(self, state):
        """Return the action currently assigned to ``state``."""
        return self.policy[state]

    def update(self, state, action):
        """Assign ``action`` to ``state``, replacing any previous choice."""
        self.policy[state] = action

class ValueFunction:
    """Tabular state-value estimates, every state starting at 0.0."""

    def __init__(self, states):
        self.values = dict.fromkeys(states, 0.0)

    def get_value(self, state):
        """Return the stored estimate for ``state``."""
        return self.values[state]

    def set_value(self, state, value):
        """Store ``value`` as the estimate for ``state``."""
        self.values[state] = value

In [5]:
def policy_evaluation(mdp, policy, theta=0.001, gamma=0.9):
    """Estimate the state values of ``policy`` by iterative policy evaluation.

    Sweeps over ``mdp.states`` repeatedly, overwriting each state's value in
    place with the expected one-step return of the action the policy selects
    there, until the largest change seen during a sweep is below ``theta``.

    Args:
        mdp: Object exposing ``states``, ``transition_probabilities``
            (indexed ``[state][action][next_state]``), ``rewards``
            (indexed ``[state][action]``) and ``get_next_states``.
        policy: Object whose ``select_action(state)`` returns the action to
            evaluate in ``state``.
        theta: Convergence threshold on the largest per-sweep value change.
        gamma: Discount factor in ``[0, 1]`` applied to successor values.
            Without discounting (``gamma=1.0``) the values of a policy that
            keeps collecting positive reward grow without bound and the
            loop never terminates, so values below 1 are recommended.

    Returns:
        A ``ValueFunction`` holding the estimated value of every state.

    Raises:
        ValueError: If ``gamma`` is outside ``[0, 1]``.
    """
    if not 0.0 <= gamma <= 1.0:
        raise ValueError(f"gamma must be in [0, 1], got {gamma!r}")

    value_func = ValueFunction(mdp.states)
    while True:
        delta = 0.0
        for state in mdp.states:
            old_value = value_func.get_value(state)
            action = policy.select_action(state)
            # Look these up once per state instead of once per successor.
            transitions = mdp.transition_probabilities[state][action]
            reward = mdp.rewards[state][action]
            # Bellman expectation backup; it reads the values already updated
            # earlier in this sweep (in-place update).
            new_value = sum(
                transitions[next_state]
                * (reward + gamma * value_func.get_value(next_state))
                for next_state in mdp.get_next_states(state, action)
            )
            value_func.set_value(state, new_value)
            delta = max(delta, abs(old_value - new_value))
        if delta < theta:
            break
    return value_func

In [7]:
def policy_improvement(mdp, value_func, policy, gamma=0.9):
    """Make ``policy`` greedy with respect to ``value_func``, in place.

    For every state, each available action is scored by its expected
    one-step return (immediate reward plus discounted successor value) and
    the policy is updated to the highest-scoring action. When several
    actions share the highest score, the first one yielded by
    ``mdp.get_actions(state)`` is chosen.

    Args:
        mdp: Object exposing ``states``, ``get_actions``,
            ``get_next_states``, ``transition_probabilities`` (indexed
            ``[state][action][next_state]``) and ``rewards`` (indexed
            ``[state][action]``).
        value_func: Object whose ``get_value(state)`` returns the current
            value estimate of ``state``.
        policy: Object with ``select_action(state)`` and
            ``update(state, action)``; it is modified in place.
        gamma: Discount factor applied to successor values. It should match
            the discount under which ``value_func`` was computed; pass
            ``1.0`` for an undiscounted lookahead.

    Returns:
        ``True`` if no state's action changed, ``False`` otherwise.
    """
    policy_stable = True
    for state in mdp.states:
        old_action = policy.select_action(state)
        # Score every action by a one-step lookahead through the model.
        action_values = {}
        for action in mdp.get_actions(state):
            transitions = mdp.transition_probabilities[state][action]
            reward = mdp.rewards[state][action]
            action_values[action] = sum(
                transitions[next_state]
                * (reward + gamma * value_func.get_value(next_state))
                for next_state in mdp.get_next_states(state, action)
            )
        best_action = max(action_values, key=action_values.get)
        policy.update(state, best_action)
        if old_action != best_action:
            policy_stable = False
    return policy_stable


In [9]:
def policy_iteration(mdp):
    """Run policy iteration on ``mdp`` and return the resulting policy.

    Starts from a freshly constructed ``Policy`` and alternates evaluation
    and greedy improvement until an improvement pass reports that the
    policy is stable.
    """
    policy = Policy(mdp.states, mdp.actions)
    stable = False
    while not stable:
        # Evaluate the current policy, then improve it in place.
        current_values = policy_evaluation(mdp, policy)
        stable = policy_improvement(mdp, current_values, policy)
    return policy


In [10]:
# Example usage: a small MDP with three states and two actions.
states = ['s1', 's2', 's3']
actions = ['a1', 'a2']

# transition_probabilities[state][action][next_state] is the probability of
# moving to next_state when action is taken in state.
transition_probabilities = {
    's1': {
        'a1': {'s1': 0.8, 's2': 0.2},
        'a2': {'s1': 0.5, 's3': 0.5},
    },
    's2': {
        'a1': {'s1': 0.1, 's3': 0.9},
        'a2': {'s2': 1.0},
    },
    's3': {
        'a1': {'s1': 1.0},
        'a2': {'s2': 1.0},
    },
}

In [11]:
# rewards[state][action] is the reward for taking action in state.
# Only ('s1', 'a2') carries a non-zero reward.
rewards = {
    's1': {
        'a1': 0,
        'a2': 1,
    },
    's2': {
        'a1': 0,
        'a2': 0,
    },
    's3': {
        'a1': 0,
        'a2': 0,
    },
}


In [None]:
# Assemble the example MDP from the definitions above and solve it.
mdp = MDP(
    states=states,
    actions=actions,
    transition_probabilities=transition_probabilities,
    rewards=rewards,
)
optimal_policy = policy_iteration(mdp)

In [None]:
# Report the action the computed policy selects in each state.
for state in states:
    chosen_action = optimal_policy.select_action(state)
    print(f"Optimal action for {state}: {chosen_action}")
