In [1]:
import numpy as np

# Deterministic Policies

In [2]:
def deterministic_policy(state):
    if state == "S1":
        return "A1"
    elif state == "S2":
        return "A2"
    elif state == "S3":
        return "A1"
    else:
        return "Invalid state"

In [3]:
def simulate_environment(state, action):
    
    transitions = {
        "S1" : {"A1" : "S2", "A2" : "S3"},
        "S2" : {"A1" : "S1", "A2" : "S3"},
        "S3" : {"A1" : "S1", "A2" : "S2"}
    }
    
    rewards = {
        "S1" : {"A1" : {"S2" : 10}, "A2" : {"S3" : 0 }},
        "S2" : {"A1" : {"S1" : 0},  "A2" : {"S3" : 5 }},
        "S3" : {"A1" : {"S1" : -1}, "A2" : {"S2" : 0 }}
    }
    
    next_state = transitions[state][action]
    reward = rewards[state][action][next_state]
    
    return [next_state, reward] 

In [4]:
current_state      = "S1"
cummulative_reward = 0

In [5]:
for _ in range(10):
    action = deterministic_policy(current_state)
    print(f"Current State : {current_state}, Action : {action}")
        
    next_state, reward = simulate_environment(current_state, action)
    print(f"Next state : {next_state}. Reward : {reward} \n")
        
    cummulative_reward += reward
    current_state  =  next_state
        


Current State : S1, Action : A1
Next state : S2. Reward : 10 

Current State : S2, Action : A2
Next state : S3. Reward : 5 

Current State : S3, Action : A1
Next state : S1. Reward : -1 

Current State : S1, Action : A1
Next state : S2. Reward : 10 

Current State : S2, Action : A2
Next state : S3. Reward : 5 

Current State : S3, Action : A1
Next state : S1. Reward : -1 

Current State : S1, Action : A1
Next state : S2. Reward : 10 

Current State : S2, Action : A2
Next state : S3. Reward : 5 

Current State : S3, Action : A1
Next state : S1. Reward : -1 

Current State : S1, Action : A1
Next state : S2. Reward : 10 



In [6]:
print("Cummulative Reward : ", cummulative_reward)

Cummulative Reward :  52


# Stochastic Policies

In [7]:
def stochastic_policy(state):
    if state == "S1":
        action_probs = { "A1" : 0.6, "A2" : 0.4}
    elif state == "S2":
        action_probs = { "A1" : 0.3, "A2" : 0.7}
    elif state == "S3":
        action_probs = { "A1" : 0.8, "A2" : 0.2}
    else:
        return ValueError("Invalid State")
    
    action = np.random.choice(list(action_probs.keys()), p = list(action_probs.values()))
    
    return action

In [8]:
def simulate_environment(state, action):
    
    transitions = {
        "S1" : {"A1" : "S2", "A2" : "S3"},
        "S2" : {"A1" : "S1", "A2" : "S3"},
        "S3" : {"A1" : "S1", "A2" : "S2"}
    }
    
    rewards = {
        "S1" : {"A1" : {"S2" : 10}, "A2" : {"S3" : 0 }},
        "S2" : {"A1" : {"S1" : 0},  "A2" : {"S3" : 5 }},
        "S3" : {"A1" : {"S1" : -1}, "A2" : {"S2" : 0 }}
    }
    
    next_state = transitions[state][action]
    reward = rewards[state][action][next_state]
    
    return [next_state, reward] 

In [9]:
current_state      = "S1"
cummulative_reward = 0

In [10]:
for _ in range(5):
    action = stochastic_policy(current_state)
    print(f"Current State : {current_state}, Action : {action}")
        
    next_state, reward = simulate_environment(current_state, action)
    print(f"Next state : {next_state}. Reward : {reward} \n")
        
    cummulative_reward += reward
    current_state  =  next_state
        


Current State : S1, Action : A1
Next state : S2. Reward : 10 

Current State : S2, Action : A1
Next state : S1. Reward : 0 

Current State : S1, Action : A1
Next state : S2. Reward : 10 

Current State : S2, Action : A2
Next state : S3. Reward : 5 

Current State : S3, Action : A1
Next state : S1. Reward : -1 



In [11]:
print("Cummulative Reward : ", cummulative_reward)

Cummulative Reward :  24
