For the car rental, there are much more states, actions and rewards. 

In [23]:
import math
import time

def poisson(n, lambd):
    return (math.exp(-lambd) * lambd**n) / math.factorial(n)

poisson_table = dict()
for n, lam in [(i, j) for i in range(21) for j in (2, 3, 4)]:
    poisson_table[(n, lam)] = poisson(n, lam)

In [24]:
states = [(i, j) for i in range(21) for j in range(21)]

# Calculate the expected return of a state given the policy
def state_action_value(action, state, values):
    gamma = 0.9
    if abs(action) > 5:
        print("No more than 5 cars can be moved")
        return 
    # Move cars, clamp to 20 cars
    init_nloc1 = min(state[0] - action, 20)
    init_nloc2 = min(state[1] + action, 20)
    base_reward = -2 * abs(action)
    value = 0
    # We now iterate through every possible combination of returned and requested cars to obtain an expected return. 
    for rented_loc1 in range(init_nloc1 + 1):
        for rented_loc2 in range(init_nloc2 + 1):
            new_init_nloc1 = init_nloc1 - rented_loc1
            new_init_nloc2 = init_nloc2 - rented_loc2
            reward = base_reward + 10 * (rented_loc1 + rented_loc2)
            rented_prob = poisson_table[(rented_loc1, 3)] * poisson_table[(rented_loc2, 4)]
            for returned_loc1 in range(20 - new_init_nloc1 + 1):
                for returned_loc2 in range(20 - new_init_nloc2 + 1):
                    nloc1 = new_init_nloc1 + returned_loc1
                    nloc2 = new_init_nloc2 + returned_loc2
                    # Get the probability of this new state occuring
                    prob = rented_prob * poisson_table[(returned_loc1, 3)] * poisson_table[(returned_loc2, 2)]
                    # Calculate the return based on the reward and the value of the new state
                    value += prob * (reward + gamma * values[(nloc1, nloc2)])
    return value

def probability_state(rented_loc1, rented_loc2, returned_loc1, returned_loc2):
    return poisson(rented_loc1, 3) * poisson(rented_loc2, 4) * poisson(returned_loc1, 3) * poisson(returned_loc2, 2)

def evaluate(accuracy, values, iterations = None):
    difference = accuracy
    i = 0
    while difference >= accuracy and (iterations == None or i < iterations):
        difference = 0
        t = time.time()
        for s in states:
            s_value = values[s]
            values[s] = state_action_value(policy[s], s, values)
            difference = max(difference, abs(s_value - values[s]))
        print(f"diff: {round(difference, 4)}, duration: {round(time.time() - t, 4)}")
        i += 1

    return values

policy = dict()
value = dict()
for state in states:
    policy[state] = 0
    value[state] = 0

policy_stable = True
value = evaluate(0.001, value)

diff: 128.6814, duration: 2.5037
diff: 88.9841, duration: 2.4185
diff: 55.0557, duration: 2.41
diff: 30.8141, duration: 2.4009
diff: 15.7996, duration: 2.373
diff: 7.537, duration: 2.4485
diff: 3.3714, duration: 2.4169
diff: 1.4376, duration: 2.4205
diff: 0.5861, duration: 2.661
diff: 0.2314, duration: 2.441
diff: 0.0895, duration: 2.4602
diff: 0.0339, duration: 2.3996
diff: 0.0126, duration: 2.3994
diff: 0.0047, duration: 2.3813
diff: 0.0017, duration: 2.4199
diff: 0.0006, duration: 2.4596


In [26]:
print(value)

{(0, 0): 0.008977623543818838, (0, 1): 0.08800705824757603, (0, 2): 0.340780053525016, (0, 3): 0.8248574564698994, (0, 4): 1.4957192150795795, (0, 5): 2.2453566915566965, (0, 6): 2.9756675768426866, (0, 7): 3.636026167594565, (0, 8): 4.216880594976066, (0, 9): 4.727733553732355, (0, 10): 5.181194776044242, (0, 11): 5.587050495582887, (0, 12): 5.9518312831886995, (0, 13): 6.279547820162638, (0, 14): 6.571657233392684, (0, 15): 6.825571281339305, (0, 16): 7.03092842029587, (0, 17): 7.1625699527653905, (0, 18): 7.17093791963202, (0, 19): 6.9774712090951265, (0, 20): 6.492466793166393, (1, 0): 0.06629171331377344, (1, 1): 0.506129375889156, (1, 2): 1.7718338582734678, (1, 3): 4.06400335542223, (1, 4): 7.132743485606571, (1, 5): 10.48401397591516, (1, 6): 13.699428365513869, (1, 7): 16.579877534905062, (1, 8): 19.101860569716095, (1, 9): 21.31692167954489, (1, 10): 23.28400200791744, (1, 11): 25.046590017349267, (1, 12): 26.63281076581936, (1, 13): 28.059511923148275, (1, 14): 29.3323534447