# In [1]:
# This program uses Value Iteration to find the long-term Discounted Rewards of each state of a Markov System.
import pandas as pd
# [sun, wind, hail] transition probabilities matrix, 2D
# Ex. sun -> sun probability is at transition_probabilities[0][0]
# Ex. wind -> hail probability is at transition_probabilities[1][2]
# transition_probabilities = [[0.5, 0.5, 0], [0.5, 0, 0.5], [0, 0.5, 0.5]]
# [sun, wind, hail] original state rewards: one immediate reward per state, loaded from the rewards CSV below



def get_transition_probabilities_and_rewards_from_csv(transitionFile, rewardFile):
    """Load a Markov system's transition matrix and state rewards from two CSV files.

    Both files are parsed with ``pd.read_csv`` using its defaults, so the first
    line of each file is taken as the column header rather than as data.

    Args:
        transitionFile: Path (or any source ``pd.read_csv`` accepts) of the
            transition-probability table, one row per source state.
        rewardFile: Path (or any source ``pd.read_csv`` accepts) of the reward
            table, one row per state.

    Returns:
        A ``(transition_probabilities, state_reward)`` tuple:
        ``transition_probabilities`` is a list of row lists and
        ``state_reward`` is a flat list holding the first column of every
        reward row. Elements are the scalars pandas yields; they are not
        converted to built-in Python numbers.
    """
    transition_df = pd.read_csv(transitionFile)
    rewards_df = pd.read_csv(rewardFile)

    # The matrix is read as square: each row contributes as many leading
    # columns as the table has data rows.
    n_states = transition_df.shape[0]
    transition_probabilities = []
    for row_idx in range(n_states):
        row = transition_df.iloc[row_idx]
        transition_probabilities.append([row.iloc[col_idx] for col_idx in range(n_states)])

    # Only the first column of the rewards table is used.
    state_reward = [rewards_df.iloc[row_idx].iloc[0] for row_idx in range(len(rewards_df))]

    return transition_probabilities, state_reward


# Given a particular state S_i, calculate the J*(S_i) Reward
def J_star_reward(state, original_state_rewards, prior_state_rewards, gamma, transition_prob):
    """Return the one-step discounted value of ``state``.

    Computes ``original_state_rewards[state] + gamma * sum_i(
    transition_prob[state][i] * prior_state_rewards[i])``, where ``i`` runs
    over every index of ``prior_state_rewards``.

    Args:
        state: Index of the state being evaluated.
        original_state_rewards: Immediate reward of each state.
        prior_state_rewards: Values of each state from the previous iteration.
        gamma: Discount factor applied to the expected future value.
        transition_prob: Matrix indexed as ``[from_state][to_state]``.

    Returns:
        The immediate reward plus the discounted expected prior value.
    """
    immediate = original_state_rewards[state]
    outgoing = transition_prob[state]

    # Accumulate left to right so floating-point results match a plain loop.
    expected_future = 0
    for successor, prior_value in enumerate(prior_state_rewards):
        expected_future += outgoing[successor] * prior_value

    return immediate + gamma * expected_future


# Given an Epsilon threshold, calculate the Markov System Value Iteration Reward until the difference between consecutive
# rewards is smaller than Epsilon.
def Markov_System_Value_Iteration(Epsilon, discountFactor, transitionProbabilities, stateRewards):
    """Run value iteration on a Markov system until the state values converge.

    Iteration K=1 uses the immediate rewards as the state values. Each later
    iteration recomputes every state's value with ``J_star_reward`` from the
    previous iteration's values, and stops once the largest absolute change
    across all states is smaller than ``Epsilon``.

    Args:
        Epsilon: Convergence threshold; must be strictly greater than zero.
        discountFactor: Discount factor (gamma) forwarded to ``J_star_reward``.
        transitionProbabilities: Transition matrix forwarded unchanged to
            ``J_star_reward``.
        stateRewards: Immediate reward of each state. The list is not modified.

    Returns:
        A new list with the values of the converged iteration, each rounded to
        2 decimal places.

    Raises:
        ValueError: If ``Epsilon`` is not greater than zero (including NaN).
    """
    # The largest change is never negative, so a threshold that is not strictly
    # positive could never be met and the loop below would never terminate.
    if not Epsilon > 0:
        raise ValueError(f"Epsilon must be greater than 0, got {Epsilon!r}")

    # At K=1 iteration, the state reward is the reward.
    discounted_rewards = stateRewards

    while True:
        # Values for the current iteration K, built from iteration K-1.
        updated = [
            J_star_reward(state, stateRewards, discounted_rewards, discountFactor, transitionProbabilities)
            for state in range(len(discounted_rewards))
        ]

        # Largest absolute change between iterations K and K-1.
        max_diff = 0
        for new_value, old_value in zip(updated, discounted_rewards):
            diff = abs(new_value - old_value)
            if diff > max_diff:
                max_diff = diff

        # Converged: report the values rounded to 2 decimal places.
        if max_diff < Epsilon:
            return [round(value, 2) for value in updated]

        discounted_rewards = updated


# Driver: load the Markov system from CSV and print its converged discounted rewards.
# Convergence threshold passed to Markov_System_Value_Iteration as Epsilon.
epsilon = 0.001
# Discount factor (gamma) applied to future rewards.
discount_factor = 0.5
# NOTE(review): absolute paths, presumably a Google Drive mounted in Colab - adjust when running elsewhere.
markovTransitionFileName = '/content/drive/MyDrive/Markov Transition Probabilities - Sheet1.csv'
markovRewardFileName = '/content/drive/MyDrive/Markov State Rewards - Sheet1.csv'
# Read the transition matrix and the per-state rewards from the two CSV files.
trans_prob, st_rewards = get_transition_probabilities_and_rewards_from_csv(markovTransitionFileName, markovRewardFileName)
# Run value iteration and print the resulting list of rounded state values.
print(f"The Discounted Rewards are: {Markov_System_Value_Iteration(epsilon, discount_factor, trans_prob, st_rewards)}")



# Output:
# The Discounted Rewards are: [4.8, -1.6, -11.2]
