In [1]:
# This notebook generates random demo data, defines the simulators, and fits the prediction model.
# In the real numerical experiments, the true features are used as input instead of the demo data generated in this notebook.

In [2]:
import numpy as np
import pandas as pd
import math
import random
from sklearn.linear_model import LogisticRegression

from params import *
from functions import *

import os
# Create the output folder for the generated pickles.
# exist_ok=True keeps this cell re-runnable (e.g. Restart Kernel -> Run All):
# with exist_ok=False a second run raised FileExistsError once 'data/' existed.
os.makedirs('data/', exist_ok=True)

In [3]:
# simulate the trip-level features of one request
def feature_simulator():
    """Draw one random (total_distance, fare, compensation) triple.

    total_distance is uniform on [1, 20); fare is total_distance times a
    uniform(2, 4) factor; compensation is fare times a uniform(0.7, 0.9)
    factor. Three draws are taken from the global ``np.random`` stream, in
    that order.

    Returns:
        tuple: (total_distance, fare, compensation).
    """
    distance = np.random.uniform(1, 20)

    fare_factor = np.random.uniform(2, 4)
    trip_fare = fare_factor * distance

    compensation_share = np.random.uniform(0.7, 0.9)
    return distance, trip_fare, compensation_share * trip_fare

In [4]:
# ground-truth probability used when simulating the training data
def prob_simulator_1(feature):
    """Return a noisy "true" probability for one feature vector.

    The result is the product of two logistic terms, each with its own
    weight vector and a random offset drawn from ``np.random`` (first
    uniform(-1.5, -1), then uniform(3, 3.5)).

    Args:
        feature: length-5 sequence. Callers in this notebook pass
            [passenger_incentive, driver_incentive, total_distance, fare,
            compensation].

    Returns:
        float: product of the two logistic probabilities.
    """
    # First term reacts to feature[0], feature[2] and feature[3];
    # second term reacts to feature[1], feature[2] and feature[4].
    weights_first = [4.0, 0, 0.05, -0.05, 0]
    weights_second = [0, 3.0, 0.02, 0, 0.05]

    # The offsets are drawn in this order so the random stream is consumed
    # first for the first term, then for the second.
    exponent_first = np.random.uniform(-1.5, -1) - np.dot(weights_first, feature)
    first_term = 1 / (1 + math.exp(exponent_first))

    exponent_second = np.random.uniform(3, 3.5) - np.dot(weights_second, feature)
    second_term = 1 / (1 + math.exp(exponent_second))

    return first_term * second_term

In [5]:
# Build a synthetic training set and fit the logistic-regression predictor on it.
SEED_TRAIN = 42
set_seed(SEED_TRAIN)  # helper from `functions`; presumably seeds `random` and `np.random` — TODO confirm

# Row layout: the two incentives first, then the simulated trip features.
x_train = np.zeros((N_train, num_feature + 2))
y_train = np.zeros(N_train)

for n in range(N_train):

    # draw the trip-level features for this sample
    total_distance, fare, compensation = feature_simulator()

    # the passenger incentive offered is a uniformly random level from the list
    i = random.choice(range(len(passenger_incentive_list)))
    passenger_incentive = passenger_incentive_list[i]

    # likewise for the driver incentive
    j = random.choice(range(len(driver_incentive_list)))
    driver_incentive = driver_incentive_list[j]

    feature_row = [passenger_incentive, driver_incentive, total_distance, fare, compensation]
    x_train[n, :] = feature_row

    # Bernoulli draw of the label with the simulator's probability
    true_prob = prob_simulator_1(feature_row)
    label_weights = [1 - true_prob, true_prob]
    y_train[n] = np.random.choice([0, 1], size=1, p=label_weights)[0]

# Fit on the simulated sample, then report the positive-label rate and the learned coefficients.
model = LogisticRegression(random_state=SEED_TRAIN, fit_intercept=True)
model.fit(x_train, y_train)
positive_rate = sum(y_train) / N_train
print(positive_rate, model.coef_)

0.1088 [[ 1.59414062  2.10553869  0.07794291 -0.03815195  0.04777697]]


In [6]:
# probability predicted by the fitted model
def prob_prediction(feature):
    """Return the fitted model's predicted probability of class 1.

    Args:
        feature: one feature vector (any array-like); it is reshaped into a
            single-row 2-D array before being passed to the global ``model``.

    Returns:
        The entry at row 0, column 1 of ``model.predict_proba``.
    """
    single_row = np.array(feature).reshape(1, -1)
    class_probabilities = model.predict_proba(single_row)
    return class_probabilities[0, 1]

# ground-truth probability used for the historical / incoming data
def prob_simulator_2(feature):
    """Return a noisy "true" probability for one feature vector.

    Same form and weights as ``prob_simulator_1``, but the random offsets
    are drawn from different ranges: first uniform(-2, -1.5), then
    uniform(2.5, 3), both from ``np.random``.

    Args:
        feature: length-5 sequence. Callers in this notebook pass
            [passenger_incentive, driver_incentive, total_distance, fare,
            compensation].

    Returns:
        float: product of the two logistic probabilities.
    """
    # First term reacts to feature[0], feature[2] and feature[3];
    # second term reacts to feature[1], feature[2] and feature[4].
    weights_first = [4.0, 0, 0.05, -0.05, 0]
    weights_second = [0, 3.0, 0.02, 0, 0.05]

    # The offsets are drawn in this order so the random stream is consumed
    # first for the first term, then for the second.
    exponent_first = np.random.uniform(-2, -1.5) - np.dot(weights_first, feature)
    first_term = 1 / (1 + math.exp(exponent_first))

    exponent_second = np.random.uniform(2.5, 3) - np.dot(weights_second, feature)
    second_term = 1 / (1 + math.exp(exponent_second))

    return first_term * second_term

In [None]:
# generate and save historical data and incoming data, repeated 10 times
def _simulate_request_row():
    """Simulate one request and its outcomes under every incentive combination.

    Draws the trip features once, then for each (passenger level i, driver
    level j) pair records the simulated true probability, a 0/1 label drawn
    from that probability, and the fitted model's predicted probability.

    Returns:
        dict: column name -> value, with keys in the order 'total_distance',
        'fare', 'compensation', then 'true_prob_{i}_{j}',
        'true_label_{i}_{j}', 'pred_prob_{i}_{j}' for each (i, j) pair.
    """
    total_distance, fare, compensation = feature_simulator()
    row = {
        'total_distance': total_distance,
        'fare': fare,
        'compensation': compensation,
    }

    for i, passenger_incentive in enumerate(passenger_incentive_list):
        for j, driver_incentive in enumerate(driver_incentive_list):
            feature = [passenger_incentive, driver_incentive, total_distance, fare, compensation]

            # Keep this order of np.random draws (probability, then label).
            true_prob = prob_simulator_2(feature)
            true_label = np.random.choice([0,1], size=1, p=[1-true_prob, true_prob])[0]

            row[f'true_prob_{i}_{j}'] = true_prob
            row[f'true_label_{i}_{j}'] = true_label
            row[f'pred_prob_{i}_{j}'] = prob_prediction(feature)

    return row


for SEED in range(0, 10):
    # NOTE(review): SEED only names the output files; the RNGs are not re-seeded
    # per replicate here, so the generated data depend on the RNG state left by
    # the earlier cells — confirm that this is intended.

    # generate historical data
    # Rows are collected as dicts and the frame is built once; growing a
    # DataFrame cell by cell with `.loc` re-allocates on every enlargement.
    hist_rows = []
    for _ in range(N_hist):
        row = _simulate_request_row()

        # randomly select passenger incentive level as the given incentive
        i = random.choice(range(len(passenger_incentive_list)))
        row['passenger_incentive'] = passenger_incentive_list[i]

        # randomly select driver incentive level as the given incentive
        j = random.choice(range(len(driver_incentive_list)))
        row['driver_incentive'] = driver_incentive_list[j]

        hist_rows.append(row)

    # dtype=float keeps every column (including the 0/1 labels) floating-point,
    # which is what the previous NaN-initialised `.loc` enlargement is expected
    # to have produced.
    data_hist = pd.DataFrame(hist_rows, dtype=float)
    data_hist.to_pickle(f'data/data_hist_{SEED}.pickle')

    # generate incoming data (same columns, without a logged incentive pair)
    incoming_rows = [_simulate_request_row() for _ in range(N_incoming)]
    data_incoming = pd.DataFrame(incoming_rows, dtype=float)
    data_incoming.to_pickle(f'data/data_incoming_{SEED}.pickle')