In [182]:
import numpy as np
import pandas as pd
import os
import pickle
import RL
from taxi_mdp import TaxiMDP
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [183]:
# Directory holding the pickled lookup tables and the trip CSV, resolved
# relative to the notebook's working directory.
data_dir = os.path.join(os.getcwd(), '..', 'Data')


def _load_pickle(file_name):
    """Unpickle ``file_name`` from ``data_dir`` and close the file afterwards.

    :param file_name: Name of the pickle file inside ``data_dir``.

    :return: The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    with open(os.path.join(data_dir, file_name), "rb") as handle:
        return pickle.load(handle)


# NOTE(review): "neighorhoods.p" looks misspelled but is kept byte-for-byte
# because it has to match the file name on disk -- confirm before renaming.
neighborhoods = _load_pickle("neighorhoods.p")
driver_areas = _load_pickle("driver_areas.p")
change_pairs = _load_pickle("change_pairs.p")
taxi_data = pd.read_csv(os.path.join(data_dir, 'taxi_data.csv'), index_col=0)

In [185]:
def _reward_interval_index(reward_intervals, value):
    """Return the index of the first interval containing ``value``.

    :param reward_intervals: Sequence of ``(low, high)`` pairs.
    :param value: Cumulative reward to classify.

    :return: Index ``i`` of the first interval with ``low <= value < high``.
    :raises IndexError: If no interval contains ``value``.
    """
    for index, interval in enumerate(reward_intervals):
        if interval[0] <= value < interval[1]:
            return index
    raise IndexError('No reward interval contains cumulative reward %r' % (value,))


def get_policy(mdp, driver=None, data=None):
    """Count the observed action choices of one or more drivers per MDP state.

    Each trip is paired with the next trip of the same driver on the same
    day: the area where the trip ended is the "start choice", the area where
    the next trip began is the "end choice" (the action taken), and the
    driver's cumulative profit for the day selects the reward interval.

    :param mdp: MDP object created for some set of taxi drivers. The
        attributes read here are ``reward_intervals``, ``mapping``, ``n``,
        ``m``, ``X``, ``U`` and ``state2num``.
    :param driver: Driver id, or a tuple/list/set of driver ids.
    :param data: DataFrame of trips with the columns ``hack_license``,
        ``pickup_datetime``, ``profit``, ``start_trip_area`` and
        ``end_trip_area``. The frame passed in is not modified.

    :return: N, an ``(mdp.n, mdp.m)`` array. For states whose second element
        is ``'e'`` and whose reward interval is not the last one, entry
        ``[state, action]`` is the number of matching observed transitions;
        such rows with no observations are filled with ones. Every other
        state gets the uniform value ``1 / len(mdp.U)``.
    :raises ValueError: If ``data`` is None.
    :raises IndexError: If a cumulative reward lies outside every interval.
    """
    if data is None:
        raise ValueError('data must be a DataFrame of taxi trips, got None')

    if isinstance(driver, (tuple, list, set, frozenset)):
        selected = data['hack_license'].isin(list(driver))
    else:
        selected = data['hack_license'] == driver

    # Work on an explicit private copy: columns are added below, and this keeps
    # the caller's frame untouched and avoids SettingWithCopy warnings.
    trips = data.loc[selected].copy()

    trips['pickup_datetime'] = pd.to_datetime(trips['pickup_datetime'])
    trips['date'] = trips['pickup_datetime'].dt.date

    day_keys = ['hack_license', 'date']

    # Tracking the daily cumulative rewards at each transaction.
    trips['cum_rewards'] = trips.groupby(day_keys)['profit'].cumsum()

    # Label indicating what reward interval earnings are at following a transaction.
    trips['reward_interval'] = trips['cum_rewards'].apply(
        lambda total: _reward_interval_index(mdp.reward_intervals, total))

    # The last trip of each driver-day has no successor, so its next area is
    # missing and the row is dropped. As in the original code, rows with a
    # missing value in any other column are dropped as well.
    trips['next_trip_area'] = trips.groupby(day_keys)['start_trip_area'].shift(-1)
    trips.dropna(inplace=True)  # in-place is safe: ``trips`` is a private copy

    # Starting and ending areas of policy decision following each transaction.
    trips['start_choice'] = trips['end_trip_area'].apply(lambda area: mdp.mapping[area])
    trips['end_choice'] = trips['next_trip_area'].apply(lambda area: mdp.mapping[area])

    # Finding the policy for the data.
    N = np.zeros((mdp.n, mdp.m))
    final_interval = mdp.reward_intervals[-1]

    for state in mdp.X:
        state_num = mdp.state2num[state]

        # Empty and not in final reward indicates a choice is being made.
        if state[1] == 'e' and state[2] != final_interval:
            # Transitions observed in this state; computed once per state
            # rather than once per (state, action) pair. No extra
            # "not the final interval" filter is needed because the guard
            # above already excludes the final interval.
            interval_index = mdp.reward_intervals.index(state[2])
            observed = trips.loc[(trips['reward_interval'] == interval_index) &
                                 (trips['start_choice'] == state[0])]

            for action in mdp.U:
                N[state_num, action] = (observed['end_choice'] == action).sum()
        else:
            N[state_num, :] = 1 / float(len(mdp.U))

    # States with no observations at all get equal weight on every action.
    N[~N.any(axis=1)] = 1

    return N

In [240]:
# Results of the experiment. Each list gets one entry per discount factor, in
# the order of the gamma list below; every entry is itself a list with one
# element per driver (in ``driver_areas`` iteration order).
all_learned_policies = []
all_driver_policies = []
all_v_errors = []
all_percent_opt = []

for gamma in [.9, .95, 1]:
    learned_policies = []
    driver_policies = []
    v_errors = []

    for driver in driver_areas:
        print('Driver ID {}'.format(driver))
        # NOTE(review): the meaning of the positional argument 20 is defined by
        # TaxiMDP, which is not visible here -- confirm and name it.
        driver_mdp = TaxiMDP(driver, taxi_data, driver_areas[driver].keys(),
                             neighborhoods, 20, change_pairs)

        # Swap the (states, actions) attributes with (X, U) on the MDP object.
        # The local names keep the pre-swap ``states`` / ``actions`` values.
        states = driver_mdp.states
        actions = driver_mdp.actions
        driver_mdp.states = driver_mdp.X
        driver_mdp.actions = driver_mdp.U
        driver_mdp.X = states
        driver_mdp.U = actions

        model_rl = RL.ModelBasedRL(gamma=gamma)
        model_rl.q_value_iteration(driver_mdp)

        v_error = model_rl.test_optimal_v(driver_mdp).max()
        print(v_error)

        # Indices of states that contain 'e' and whose third element has no
        # infinite bound. ``not`` is used instead of ``~`` so the test stays a
        # logical negation even if ``.any()`` returns a plain Python bool.
        policy_states = [i for i in range(driver_mdp.n)
                         if 'e' in states[i] and not np.isinf(states[i][2]).any()]

        # NOTE(review): assumes the number of policy states is a multiple of 10;
        # what the 10 rows represent is not visible here -- TODO confirm.
        learned_policy = model_rl.policy[policy_states].reshape(10, -1)

        # Most frequently observed action per state for this driver.
        driver_policy = get_policy(driver_mdp, driver, taxi_data)
        driver_policy = np.argmax(driver_policy/driver_policy.sum(axis=1, keepdims=True), axis=1)
        driver_policy = driver_policy[policy_states].reshape(10, -1)

        learned_policies.append(learned_policy)
        driver_policies.append(driver_policy)
        v_errors.append(v_error)

    # Fraction of policy states in which the driver's most frequent action
    # equals the action chosen by the learned policy.
    percent_opt = []
    for observed, optimal in zip(driver_policies, learned_policies):
        percent_opt.append((observed == optimal).sum()/float(observed.size))

    all_learned_policies.append(learned_policies)
    all_driver_policies.append(driver_policies)
    all_v_errors.append(v_errors)
    all_percent_opt.append(percent_opt)

Driver ID 2010003240
4.28511602664e-05
Driver ID 2010002704
6.31521704797e-05
Driver ID 2010002920
3.08040586816e-05
Driver ID 2010001271
4.16877443215e-05
Driver ID 2010007770
4.43669538726e-05
Driver ID 2010007579
2.94281395554e-05
Driver ID 2010007519
5.20730355049e-05
Driver ID 2010003240
5.19652735704e-05
Driver ID 2010002704
5.57930440976e-05
Driver ID 2010002920
3.20551791333e-05
Driver ID 2010001271
2.61486997033e-05
Driver ID 2010007770
5.28543509404e-05
Driver ID 2010007579
2.10352710894e-05
Driver ID 2010007519
4.2771833364e-05
Driver ID 2010003240
5.81923568461e-05
Driver ID 2010002704
6.88179477493e-05
Driver ID 2010002920
5.42016850886e-05
Driver ID 2010001271
6.27083693416e-05
Driver ID 2010007770
5.16083769071e-05
Driver ID 2010007579
4.78391639263e-05
Driver ID 2010007519
4.38555532583e-05


In [245]:
# Per-driver agreement (in percent, one decimal) between the observed and the
# learned policy for the first discount factor in the experiment (gamma = 0.9).
for val in all_percent_opt[0]:
    print(round(100 * val, 1))

59.3
51.7
32.2
34.1
33.3
28.6
44.7


In [246]:
# Per-driver agreement (in percent, one decimal) between the observed and the
# learned policy for the second discount factor in the experiment (gamma = 0.95).
for val in all_percent_opt[1]:
    print(round(100 * val, 1))

60.0
44.4
17.2
21.8
26.0
16.4
36.7


In [247]:
# Per-driver agreement (in percent, one decimal) between the observed and the
# learned policy for the third discount factor in the experiment (gamma = 1).
for val in all_percent_opt[2]:
    print(round(100 * val, 1))

75.0
75.6
77.8
84.7
63.3
86.4
64.7
