In [142]:
import numpy as np
import matplotlib.pyplot as plt
import time

In [143]:
class Location():
    """A rental location holding a bounded stock of cars.

    Attributes:
        name: Label of the location.
        cars: Number of cars currently at the location.
        exp_out: Poisson mean used for the second value drawn by ``episode``
            (presumably the expected number of rental requests -- confirm).
        exp_in: Poisson mean used for the first value drawn by ``episode``
            (presumably the expected number of returned cars -- confirm).
    """

    # Capacity bounds applied whenever cars are added to a location.
    MIN_CARS = 0
    MAX_CARS = 20
    # Reward earned for each car that is actually rented out.
    RENTAL_REWARD = 10

    def __init__(self, name, cars, exp_out, exp_in):
        self.name = name
        self.cars = cars
        self.exp_out = exp_out
        self.exp_in = exp_in

    def move(self, location, cars):
        """Transfer up to ``cars`` cars from this location to ``location``.

        No more cars than are currently available here are moved, so the
        stock of this location cannot drop below zero for a non-negative
        request. Cars exceeding the destination's capacity are dropped.
        Negative ``cars`` values are not validated.
        """
        moved = min(cars, self.cars)
        self.cars = self.cars - moved
        location.cars = np.clip(location.cars + moved, self.MIN_CARS, self.MAX_CARS)

    def rent(self, cars):
        """Rent out up to ``cars`` cars and return the reward earned.

        The number of cars rented is limited by the current stock. It is
        computed *before* the stock is reduced so that the reward reflects
        the cars that were really rented.
        """
        rented = min(self.cars, cars)
        self.cars = self.cars - rented
        return self.RENTAL_REWARD * rented

    def restore(self, cars):
        """Add ``cars`` returned cars to the stock, capped at the capacity."""
        self.cars = np.clip(self.cars + cars, self.MIN_CARS, self.MAX_CARS)

    def episode(self):
        """Sample one pair of Poisson draws: ``(exp_in draw, exp_out draw)``."""
        return np.random.poisson(self.exp_in), np.random.poisson(self.exp_out)

In [144]:
# actions: signed number of cars to move
# (- is loc2 --> loc1 and + is loc1 --> loc2)
A = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]

# state-value function: one random initial value for every
# (cars at first location, cars at second location) pair.
# Building the dict directly from the two ranges guarantees that all
# 21 * 21 = 441 states get an entry (zipping against a fixed-length array
# of 420 values silently dropped every state with 20 cars at the first
# location).
state_value = {
    (first, second): np.random.rand()
    for first in range(21)
    for second in range(21)
}

# policy: one random initial action for each of the same 441 states
policy = {
    (first, second): np.random.choice(A)
    for first in range(21)
    for second in range(21)
}

In [145]:
# map (s,a) to (s')
def next_state(state, action):
    """Return the state reached by applying ``action`` in ``state``.

    Args:
        state: Tuple ``(cars at loc1, cars at loc2)``.
        action: Signed number of cars to move. Positive values move cars
            loc1 --> loc2, negative values move cars loc2 --> loc1.

    Returns:
        The successor ``(cars at loc1, cars at loc2)`` tuple. At most the
        cars available at the source are moved, and the destination is
        capped at 20 cars. For ``action == 0`` the input state is returned
        unchanged.
    """
    cars_loc1, cars_loc2 = state

    if action > 0:
        # loc1 --> loc2: cannot move more cars than loc1 holds.
        moved = min(action, cars_loc1)
        return (cars_loc1 - moved, min(cars_loc2 + moved, 20))
    if action < 0:
        # loc2 --> loc1: cannot move more cars than loc2 holds.
        moved = min(-action, cars_loc2)
        return (min(cars_loc1 + moved, 20), cars_loc2 - moved)
    return state

In [146]:
def policy_evaluation():
    """Sweep ``state_value`` in place until it is consistent with ``policy``.

    Each sweep replaces every state's value with the cost of the action the
    current policy prescribes (2 per moved car) plus the ``gamma``-discounted
    value of the resulting state. Sweeps repeat until the largest change
    made during a sweep is smaller than ``theta``.
    """
    converged = False
    while not converged:
        largest_change = 0
        for state in state_value:
            previous = state_value[state]
            action = policy[state]
            successor = next_state(state, action)
            # The leading factor 1 stands for the transition probability.
            state_value[state] = 1 * (np.abs(action) * -2 + gamma * state_value[successor])
            largest_change = max(largest_change, np.abs(previous - state_value[state]))
        converged = largest_change < theta

In [147]:
def policy_improvement():
    """Make ``policy`` greedy with respect to the current ``state_value``.

    For every state, each action in ``A`` is scored as the transfer cost
    (2 per moved car) plus the ``gamma``-discounted value of the state it
    leads to, and the best-scoring action is stored in ``policy``.

    Returns:
        True if no state's action changed (the policy is stable), False
        otherwise.
    """
    policy_stable = True
    for state in state_value.keys():
        old = policy[state]

        def expected_reward(action, state=state):
            # ``state`` is bound as a default so the score always refers to
            # the state currently being improved.
            return -2 * np.abs(action) + gamma * state_value[next_state(state, action)]

        policy[state] = max(A, key=expected_reward)
        if old != policy[state]:
            policy_stable = False
    # Report stability so callers can tell when policy iteration has converged.
    return policy_stable

In [149]:
# Rental locations: starting stock and the Poisson means used by episode()
loc1 = Location(name="San Diego", cars=20, exp_out=3, exp_in=3)
loc2 = Location(name="New York", cars=20, exp_out=4, exp_in=2)

# Number of rounds run by the driver loop
days = 365
bank = 0

# Reward model constants
reward_rent = 10
reward_empty = 0
transfer_cost = 2

# Discount factor applied to successor values, and the threshold below
# which policy evaluation stops sweeping
gamma = 0.9
theta = 0.1

In [152]:
# NOTE: the reward from renting a car is not taken into account yet;
# only the cost of moving cars enters the value updates.
#
# Policy iteration: alternate evaluation and improvement. ``days`` is the
# upper bound on the number of rounds; the loop stops as soon as a round
# leaves the policy unchanged, since further rounds would only repeat it.
for day in range(days):
    previous_policy = dict(policy)

    policy_evaluation()
    policy_improvement()

    if policy == previous_policy:
        break

KeyboardInterrupt: 