For the car rental, there are much more states, actions and rewards. 

In [13]:
import math
import time

def poisson(n, lambd):
    return (math.exp(-lambd) * lambd**n) / math.factorial(n)

In [15]:
states = [(i, j) for i in range(21) for j in range(21)]

# Calculate the expected return of a state given the policy
def state_action_value(action, state, values):
    gamma = 0.9
    if abs(action) > 5:
        print("No more than 5 cars can be moved")
        return 
    # Move cars, clamp to 20 cars
    init_nloc1 = min(state[0] - action, 20)
    init_nloc2 = min(state[1] + action, 20)
    base_reward = -2 * abs(action)
    value = 0
    # We now iterate through every possible combination of returned and requested cars to obtain an expected reward. 
    for rented_loc1 in range(init_nloc1 + 1):
        for rented_loc2 in range(init_nloc2 + 1):
            new_init_nloc1 = init_nloc1 - rented_loc1
            new_init_nloc2 = init_nloc2 - rented_loc2
            for returned_loc1 in range(20 - new_init_nloc1 + 1):
                for returned_loc2 in range(20 - new_init_nloc2 + 1):
                    nloc1 = new_init_nloc1 + returned_loc1
                    nloc2 = new_init_nloc2 + returned_loc2
                    # Get the probability of this new state occuring
                    prob = probability_state(rented_loc1, rented_loc2, returned_loc1, returned_loc2)
                    # Calculate the return based on the reward and the value of the new state
                    value += prob * (base_reward + 10 * (rented_loc1 + rented_loc2) + gamma * values[(nloc1, nloc2)])
    return value

def probability_state(rented_loc1, rented_loc2, returned_loc1, returned_loc2):
    return poisson(rented_loc1, 3) * poisson(rented_loc2, 4) * poisson(returned_loc1, 3) * poisson(returned_loc2, 2)

def evaluate(accuracy, values, iterations = None):
    difference = accuracy
    i = 0
    while difference >= accuracy and (iterations == None or i < iterations):
        difference = 0
        t = time.time()
        for s in states:
            s_value = values[s]
            values[s] = state_action_value(policy[s], s, values)
            difference = max(difference, abs(s_value - values[s]))
        print(f"diff: {round(difference, 4)}, duration: {round(time.time() - t, 4)}")
        i += 1

    return values

policy = dict()
value = dict()
for state in states:
    policy[state] = 0
    value[state] = 0

policy_stable = True
value = evaluate(0.001, value)

diff: 128.6814, duration: 11.9587
diff: 88.9841, duration: 11.955
diff: 55.0557, duration: 11.7982
diff: 30.8141, duration: 11.666
diff: 15.7996, duration: 11.8029
diff: 7.537, duration: 11.6833
diff: 3.3714, duration: 11.736
diff: 1.4376, duration: 11.6027
diff: 0.5861, duration: 11.6757
diff: 0.2314, duration: 11.7024
diff: 0.0895, duration: 11.7433
diff: 0.0339, duration: 11.7768
diff: 0.0126, duration: 11.7783
diff: 0.0047, duration: 12.0985
diff: 0.0017, duration: 11.9075
diff: 0.0006, duration: 11.8627
diff: 0.0002, duration: 11.7881
diff: 0.0001, duration: 11.9681


KeyboardInterrupt: 

In [42]:
print(value)

(120, (8, 10))
