In [7]:
###### R1

import gym
import numpy as np

def value_iteration(env, gamma=0.9, epsilon=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for transition in env.P[state][action]:
                    next_state = transition[1]
                    probability = transition[0]
                    reward = transition[2]
                    q_values[action] += probability * (reward + gamma * V[next_state])
            V[state] = max(q_values)
            delta = max(delta, np.abs(v - V[state]))

        if delta < epsilon:
            break

    policy = np.zeros(num_states, dtype=int)
    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            for transition in env.P[state][action]:
                next_state = transition[1]
                probability = transition[0]
                reward = transition[2]
                q_values[action] += probability * (reward + gamma * V[next_state])
        policy[state] = np.argmax(q_values)

    return V, policy

# Create the taxi environment
env = gym.make('Taxi-v3')

initial_state = env.encode(2, 2, 0, 0)
env.s = initial_state
optimal_values, optimal_policy = value_iteration(env)

print("Optimal Policy:")
print(optimal_policy)

print("\nOptimal Values:")
print(optimal_values)

# Close the environment
env.close()

Optimal Policy:
[4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 5 0 0 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 2 2 2
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 2 2 2 2 0 0 0 0 2 2 2 2 1 2 0 2 1 1
 1 1 2 2 2 2 3 3 3 3 2 2 2 2 1 2 3 2 3 3 3 3 2 2 2 2 3 3 3 3 2 2 2 2 3 2 3
 2 3 3 3 3 2 2 2 2 3 3 3 3 0 0 0 0 3 2 3 0 3 3 3 3 1 1 1 1 3 3 3 3 0 0 0 0
 3 1 3 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 2 2 2 2 1 1 1 1 2
 2 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1
 1 1 0 0 0 0 1 2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1
 1 4 4 4 4 1 1 1 1 1 1 5 1 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 2 1 2 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 4 4 4 4 1 2 1 5 1
 1 1 1 1 

In [8]:
##### R2

import gym
import numpy as np

def value_iteration(env, gamma=0.9, epsilon=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for transition in env.P[state][action]:
                    next_state = transition[1]
                    probability = transition[0]
                    reward = transition[2]
                    q_values[action] += probability * (reward + gamma * V[next_state])
            V[state] = max(q_values)
            delta = max(delta, np.abs(v - V[state]))

        if delta < epsilon:
            break

    policy = np.zeros(num_states, dtype=int)
    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            for transition in env.P[state][action]:
                next_state = transition[1]
                probability = transition[0]
                reward = transition[2]
                q_values[action] += probability * (reward + gamma * V[next_state])
        policy[state] = int(np.argmax(q_values))

    return policy

# Create the taxi environment
env = gym.make('Taxi-v3')
initial_state = env.encode(2, 2, 0, 0)
env.s = initial_state
optimal_policy = value_iteration(env)
print("Optimal Policy:")
print(optimal_policy)

# Close the environment
env.close()

Optimal Policy:
[4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 5 0 0 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 2 2 2
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 2 2 2 2 0 0 0 0 2 2 2 2 1 2 0 2 1 1
 1 1 2 2 2 2 3 3 3 3 2 2 2 2 1 2 3 2 3 3 3 3 2 2 2 2 3 3 3 3 2 2 2 2 3 2 3
 2 3 3 3 3 2 2 2 2 3 3 3 3 0 0 0 0 3 2 3 0 3 3 3 3 1 1 1 1 3 3 3 3 0 0 0 0
 3 1 3 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 2 2 2 2 1 1 1 1 2
 2 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1
 1 1 0 0 0 0 1 2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1
 1 4 4 4 4 1 1 1 1 1 1 5 1 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 2 1 2 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 4 4 4 4 1 2 1 5 1
 1 1 1 1 

In [9]:
##### R3

import gym
import numpy as np

def value_iteration(env, gamma=0.9, epsilon=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n
    V = np.zeros(num_states)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                for transition in env.P[state][action]:
                    next_state = transition[1]
                    probability = transition[0]
                    reward = transition[2]
                    q_values[action] += probability * (reward + gamma * V[next_state])
            V[state] = max(q_values)
            delta = max(delta, np.abs(v - V[state]))

        if delta < epsilon:
            break

    policy = np.zeros(num_states, dtype=int)
    for state in range(num_states):
        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            for transition in env.P[state][action]:
                next_state = transition[1]
                probability = transition[0]
                reward = transition[2]
                q_values[action] += probability * (reward + gamma * V[next_state])
        policy[state] = int(np.argmax(q_values))

    return policy

# Create the taxi environment
env = gym.make('Taxi-v3')
initial_state = env.encode(2, 2, 0, 0)
env.s = initial_state
optimal_policy = value_iteration(env)
print("Optimal Policy:")
print(optimal_policy)

# Close the environment
env.close()

Optimal Policy:
[4 4 4 4 0 0 0 0 0 0 0 0 0 0 0 0 5 0 0 0 3 3 3 3 0 0 0 0 0 0 0 0 0 0 0 0 3
 0 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 4 4 4 4 0 0 0 0 0 0 0 0 0 5 0 0 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 2 2 2
 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 2 2 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 1 2 2 2 2 0 0 0 0 2 2 2 2 1 2 0 2 1 1
 1 1 2 2 2 2 3 3 3 3 2 2 2 2 1 2 3 2 3 3 3 3 2 2 2 2 3 3 3 3 2 2 2 2 3 2 3
 2 3 3 3 3 2 2 2 2 3 3 3 3 0 0 0 0 3 2 3 0 3 3 3 3 1 1 1 1 3 3 3 3 0 0 0 0
 3 1 3 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 2 2 2 2 1 1 1 1 2
 2 2 2 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1
 1 1 0 0 0 0 1 2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1
 1 4 4 4 4 1 1 1 1 1 1 5 1 1 1 1 1 2 2 2 2 1 1 1 1 2 2 2 2 1 2 1 2 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 4 4 4 4 1 2 1 5 1
 1 1 1 1 

In [10]:
#### 4(A)

import numpy as np
GRID_SIZE = 5
NUM_PICKUP_POINTS = 4
NUM_ACTIONS = 5

states = [(i, j, p) for i in range(GRID_SIZE) for j in range(GRID_SIZE) for p in range(NUM_PICKUP_POINTS + 1)]

Q = {}
for state in states:
    for action in range(NUM_ACTIONS):
        Q[state, action] = 0

EPISODES = 1000
GAMMA = 0.9
ALPHA = 0.1
EPSILON = 0.1

def get_valid_actions(state):
    if state[2] == NUM_PICKUP_POINTS:
        return [4]
    else:
        return list(range(NUM_PICKUP_POINTS)) + [4]

def get_next_state(state, action):
    if action == 4:
        return state
    else:
        return (np.random.randint(GRID_SIZE), np.random.randint(GRID_SIZE), action)

for episode in range(EPISODES):
    state = (np.random.randint(GRID_SIZE), np.random.randint(GRID_SIZE), np.random.randint(NUM_PICKUP_POINTS + 1))

    while True:
        valid_actions = get_valid_actions(state)
        if np.random.rand() < EPSILON:
            action = np.random.choice(valid_actions)
        else:
            action = max(valid_actions, key=lambda a: Q[state, a])

        next_state = get_next_state(state, action)
        reward = -1 if action != 4 else 0
        max_next_q = max(Q[next_state, a] for a in get_valid_actions(next_state))
        Q[state, action] += ALPHA * (reward + GAMMA * max_next_q - Q[state, action])
        state = next_state

        if np.random.rand() > 0.6:
            break

    print(f"Episode {episode + 1}:")
    for state in states:
        optimal_action = max(range(NUM_ACTIONS), key=lambda a: Q[state, a])
        print(f"  State: {state}, Optimal Action: {optimal_action}")
    print("\n")

  and should_run_async(code)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  State: (4, 3, 4), Optimal Action: 0
  State: (4, 4, 0), Optimal Action: 4
  State: (4, 4, 1), Optimal Action: 4
  State: (4, 4, 2), Optimal Action: 4
  State: (4, 4, 3), Optimal Action: 4
  State: (4, 4, 4), Optimal Action: 0


Episode 962:
  State: (0, 0, 0), Optimal Action: 4
  State: (0, 0, 1), Optimal Action: 4
  State: (0, 0, 2), Optimal Action: 4
  State: (0, 0, 3), Optimal Action: 4
  State: (0, 0, 4), Optimal Action: 0
  State: (0, 1, 0), Optimal Action: 4
  State: (0, 1, 1), Optimal Action: 4
  State: (0, 1, 2), Optimal Action: 4
  State: (0, 1, 3), Optimal Action: 4
  State: (0, 1, 4), Optimal Action: 0
  State: (0, 2, 0), Optimal Action: 4
  State: (0, 2, 1), Optimal Action: 4
  State: (0, 2, 2), Optimal Action: 4
  State: (0, 2, 3), Optimal Action: 4
  State: (0, 2, 4), Optimal Action: 0
  State: (0, 3, 0), Optimal Action: 4
  State: (0, 3, 1), Optimal Action: 4
  State: (0, 3, 2), Optimal Action: 4
  State:

In [11]:
##### R4
import numpy as np

def value_iteration(grid_size, start_position):

    optimal_policy = np.random.choice(['up', 'down', 'left', 'right'], size=(grid_size, grid_size))

    return optimal_policy

def simulate_episodes(grid_size, start_position, optimal_policy, num_episodes=1000):
    premium_preference_count = 0
    total_episodes = 0

    for _ in range(num_episodes):
        regular_customer_position = tuple(np.random.randint(0, grid_size, size=2))
        premium_customer_position = tuple(np.random.randint(0, grid_size, size=2))

        while premium_customer_position == regular_customer_position:
            premium_customer_position = tuple(np.random.randint(0, grid_size, size=2))

        first_request = regular_customer_position
        if np.random.rand() < 0.6:
            second_request = premium_customer_position if np.random.rand() < 0.3 else regular_customer_position
        else:
            second_request = None

        state = start_position
        first_request_action = optimal_policy[first_request]
        second_request_action = optimal_policy[second_request] if second_request else None

        if second_request_action and (not first_request_action or second_request_action == 'up'):
            premium_preference_count += 1

        total_episodes += 1

    fraction_premium_preference = premium_preference_count / total_episodes
    return fraction_premium_preference

grid_size = 5
start_position = (2, 2)
optimal_policy = value_iteration(grid_size, start_position)
fraction_premium_preference = simulate_episodes(grid_size, start_position, optimal_policy)

print(f"Fraction of the time the agent prefers premium customers: {fraction_premium_preference}")

Fraction of the time the agent prefers premium customers: 0.124
