## Markov Decision Process

In [1]:
# Import Libraries
import numpy as np
from scipy.optimize import optimize

In [2]:
# Define the MDP parameters
N = 12 # Time periods
S = 5 # Number of states
A = 3 # Number of actions
gamma = 1 # Discount factor

In [3]:
# Define transition probabilities
P = np.random.rand(S, A, S) # Random transition probabilities

# Define reward function
# Expected revenue for action a in state s
R = np.random.rand(S, A) # Random rewards (should be based on data)

# Define the value function and policy
V = np.zeros((N+1, S)) # Value function
policy = np.zeros((N, S), dtype=int) # Policy (action for each state at each time step)

In [4]:
# Dynamic Programming Algorithm to Solve the MDP
for t in range(N-1, -1, -1):
  for s in range(S):
    Q = np.zeros(A) # Action-value function
    for a in range(A):
      Q[a] = R[s, a] + gamma * np.sum(P[s, a, :] * V[t+1, :])
    V[t, s] = np.max(Q)
    policy[t, s] = np.argmax(Q)

# Print the optimal value function and policy
print("Optimal Value Function")
print(policy)

Optimal Value Function
[[0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 0 1]
 [0 0 1 2 1]
 [0 0 1 2 1]
 [0 0 2 2 1]]
