In [2]:
import numpy as np

class AdjustedMDPEnvironment:
    """Randomly generated finite MDP stored as dense NumPy tensors.

    Attributes are documented here rather than inline:

    * ``num_states`` / ``num_actions`` -- the sizes passed to the constructor.
    * ``P`` -- array of shape ``(num_states, num_actions, num_states)``;
      ``P[s, a, s_prime]`` is the probability of reaching ``s_prime`` after
      taking action ``a`` in state ``s``. Every ``P[s, a, :]`` sums to 1.
    * ``R`` -- array of the same shape; ``R[s, a, s_prime]`` is the reward of
      that transition, drawn uniformly from ``[-0.5, 0.5)``.
    """

    def __init__(self, num_states, num_actions, seed=None):
        """Draw a random transition tensor ``P`` and reward tensor ``R``.

        Args:
            num_states: Number of states in the MDP.
            num_actions: Number of actions available in every state.
            seed: Optional seed. When given, ``P`` and ``R`` are drawn from a
                private ``np.random.RandomState(seed)`` so the same seed always
                produces the same environment and the global NumPy stream is
                left untouched. When ``None`` (the default) the global
                ``np.random`` stream is used, as in the original code.
        """
        self.num_states = num_states
        self.num_actions = num_actions

        # np.random.rand(d0, d1, d2) is shorthand for random_sample((d0, d1, d2)),
        # so the unseeded path consumes the global stream exactly as before.
        if seed is None:
            sample = np.random.random_sample
        else:
            sample = np.random.RandomState(seed).random_sample

        shape = (num_states, num_actions, num_states)
        self.P = sample(shape)
        self.R = sample(shape) - 0.5  # Rewards in range [-0.5, 0.5)

        # Normalize transition probabilities: divide every P[s, a, :] by its
        # own sum so each (state, action) pair holds a proper distribution.
        self.P /= self.P.sum(axis=2, keepdims=True)

def policy_evaluation(policy, env, gamma=0.99, theta=1e-5):
    """Estimate the state-value function of a deterministic policy.

    Performs in-place sweeps of the Bellman expectation backup

        V[s] <- sum_{s'} P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])

    with ``a = policy[s]``, until the largest change in a sweep is below
    ``theta``. Values updated earlier in a sweep are used by later states in
    the same sweep.

    Args:
        policy: Indexable mapping state index -> action index.
        env: Object exposing ``num_states`` and the tensors ``P`` and ``R``
            indexed as ``[s, a, s_prime]``.
        gamma: Discount factor.
        theta: Convergence threshold on the per-sweep maximum change.

    Returns:
        NumPy float array of length ``env.num_states`` with the estimated
        values.

    Note:
        The loop has no iteration cap; it ends only once a sweep changes every
        value by less than ``theta``.
    """
    n_states = env.num_states

    # The policy is fixed during evaluation, so each state's transition row
    # and its expected one-step reward are loop invariants: compute them once
    # instead of re-multiplying P and R on every sweep.
    transition_rows = []
    expected_rewards = []
    for s in range(n_states):
        a = policy[s]
        row = env.P[s, a]
        transition_rows.append(row)
        expected_rewards.append(np.dot(row, env.R[s, a]))

    V = np.zeros(n_states)
    while True:
        delta = 0
        for s in range(n_states):
            previous = V[s]
            # Algebraically the same backup as above, split into the
            # precomputed reward term and the discounted next-state value.
            V[s] = expected_rewards[s] + gamma * np.dot(transition_rows[s], V)
            delta = max(delta, abs(previous - V[s]))
        if delta < theta:
            break
    return V

def policy_improvement(V, env, gamma=0.99):
    """Return the greedy deterministic policy with respect to ``V``.

    For every state the action maximising

        Q[s, a] = sum_{s'} P[s, a, s'] * (R[s, a, s'] + gamma * V[s'])

    is selected; ties go to the lowest action index (``np.argmax``).

    Args:
        V: State values, one per state.
        env: Object exposing ``num_states`` and the tensors ``P`` and ``R``
            indexed as ``[s, a, s_prime]``.
        gamma: Discount factor.

    Returns:
        Integer NumPy array of length ``env.num_states`` holding the chosen
        action for each state.
    """
    policy = np.zeros(env.num_states, dtype=int)
    if env.num_states == 0:
        # Nothing to improve; avoids an argmax over an empty table.
        return policy

    # V broadcasts along the last (next-state) axis of R, so this evaluates
    # the backup for every (state, action) pair at once instead of looping
    # in Python.
    backed_up = env.R + gamma * np.asarray(V)
    q_values = np.sum(env.P * backed_up, axis=2)   # shape: (states, actions)

    policy[:] = np.argmax(q_values, axis=1)
    return policy

def policy_iteration(env, gamma=0.99, theta=1e-5):
    """Alternate policy evaluation and greedy improvement until stable.

    Starts from the all-zeros policy. Each round evaluates the current policy
    with ``policy_evaluation`` and derives a greedy candidate with
    ``policy_improvement``.

    Termination: the loop stops as soon as the candidate equals a policy that
    has already been evaluated. The usual case is "candidate == current
    policy" (the policy is stable). The check also catches the case where
    approximate evaluation plus argmax tie-breaking makes the policy flip
    between several equally good policies, which would otherwise never
    terminate.

    Args:
        env: Environment passed through to the evaluation/improvement steps;
            must expose ``num_states``.
        gamma: Discount factor.
        theta: Convergence threshold forwarded to ``policy_evaluation``.

    Returns:
        Tuple ``(policy, V)`` where ``V`` is the value estimate of the
        returned ``policy``.
    """
    policy = np.zeros(env.num_states, dtype=int)
    # Policies are stored as tuples so they can be hashed.
    visited = {tuple(policy.tolist())}

    while True:
        V = policy_evaluation(policy, env, gamma, theta)
        candidate = policy_improvement(V, env, gamma)

        key = tuple(np.asarray(candidate).tolist())
        if key in visited:
            # Either the policy is stable (candidate == policy) or we are
            # about to re-enter a cycle. In both cases ``policy`` is the last
            # policy evaluated, so (policy, V) is a consistent pair.
            break

        visited.add(key)
        policy = candidate
    return policy, V

# Seed NumPy's global random stream first: AdjustedMDPEnvironment draws its
# transition and reward tensors from it, so without a fixed seed the policy
# and values printed below change on every run.
np.random.seed(0)

# Create an adjusted environment instance (12 states, 2 actions)
env = AdjustedMDPEnvironment(12, 2)

# Perform Policy Iteration
optimal_policy, optimal_value_function = policy_iteration(env)

print("Optimal Policy:", optimal_policy)
print("Optimal Value Function:", optimal_value_function)


Optimal Policy: [0 0 1 0 0 0 0 1 0 0 0 1]
Optimal Value Function: [6.3776102  6.15771968 6.15215525 6.25796375 6.27704857 6.41739394
 6.16346683 6.38685311 6.23410736 6.39871135 6.27061703 6.19663139]


In [2]:
import pickle

# Function to save the model parameters
def save_model_parameters(policy, value_function, file_name='model_params.pkl'):
    """Pickle a policy and its value function into a single file.

    Both objects are stored in one dict under the keys ``'policy'`` and
    ``'value_function'``. An existing file with the same name is overwritten.
    A confirmation line is printed once the file has been written.

    Args:
        policy: The policy to store.
        value_function: The value function to store.
        file_name: Destination path of the pickle file.
    """
    payload = {'policy': policy, 'value_function': value_function}
    with open(file_name, 'wb') as handle:
        pickle.dump(payload, handle)
    print(f"Model parameters saved to {file_name}")

# Persist the policy-iteration result; no file name is given, so the
# function's default ('model_params.pkl') is used.
save_model_parameters(policy=optimal_policy, value_function=optimal_value_function)


Model parameters saved to model_params.pkl


In [3]:
# Function to load the model parameters
def load_model_parameters(file_name='model_params.pkl'):
    """Read a pickled policy / value-function pair back from disk.

    The file is expected to contain a mapping with the keys ``'policy'`` and
    ``'value_function'`` (the layout written by ``save_model_parameters``).

    Args:
        file_name: Path of the pickle file to read.

    Returns:
        Tuple ``(policy, value_function)``.

    Raises:
        KeyError: If one of the two expected keys is missing.

    Note:
        ``pickle.load`` can execute arbitrary code embedded in the file, so
        only load files from a trusted source.
    """
    with open(file_name, 'rb') as handle:
        params = pickle.load(handle)
    policy = params['policy']
    value_function = params['value_function']
    return policy, value_function

# Load the parameters back from the default file ('model_params.pkl').
# NOTE(review): this returns whatever was written to that file most recently.
# Every save_model_parameters() call that relies on the default name
# overwrites it, so the values printed here need not match the
# optimal_policy currently in memory if cells were run out of order.
loaded_policy, loaded_value_function = load_model_parameters()
print("Loaded Policy:", loaded_policy)
print("Loaded Value Function:", loaded_value_function)


Loaded Policy: [2 0 2 2 1 1 2 2 1 1 1 2]
Loaded Value Function: [6.12010594 6.05629564 5.9841324  5.96426021 6.02267353 5.96504004
 6.07872222 6.10301176 6.01719121 6.01365836 6.1000341  6.09944546]


In [5]:
# Second experiment: relies on AdjustedMDPEnvironment, policy_iteration and
# the save/load helpers defined in the cells above.

# Build a larger random MDP: 24 states, 3 actions
env = AdjustedMDPEnvironment(num_states=24, num_actions=3)

# Solve it with policy iteration
optimal_policy, optimal_value_function = policy_iteration(env=env)

# Write the result to the default pickle file (replacing its previous content)
save_model_parameters(policy=optimal_policy, value_function=optimal_value_function)

# Read the stored parameters straight back
loaded_policy, loaded_value_function = load_model_parameters()


Model parameters saved to model_params.pkl


In [6]:
# Display the most recently loaded policy (a bare last expression is shown
# as the cell's output).
loaded_policy

array([2, 2, 2, 0, 2, 2, 1, 1, 0, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2,
       0, 0])