In [None]:
import numpy as np
import pandas as pd
import math
import random
import csv

import pickle

from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import pairwise_kernels

from params import *
from functions import *

import os
# Make sure the output directory for the per-seed result pickles exists before
# any cell tries to write into it; a pre-existing directory is not an error.
os.makedirs('results/', exist_ok=True)

In [None]:
# Offline stage, repeated for seeds 0..9. For each seed:
#   1. load the historical data set for that seed,
#   2. collect fares, compensations, features and the stored predicted
#      probabilities, plus the prediction error on the incentive pair that was
#      actually applied to each record,
#   3. min-max scale the features and cluster them with K-means,
#   4. shift every record's predicted probabilities by a kernel-weighted
#      average of the per-cluster mean errors,
#   5. pass the adjusted probabilities to `solve_largrange_multiplier`
#      (defined in `functions`; presumably returns the budget multiplier --
#      confirm against that module),
#   6. pickle the multiplier together with the fitted scaler and the cluster
#      summaries.
for SEED in range(0, 10):

    set_seed(SEED)
    data_hist = pd.read_pickle(f'data/data_hist_{SEED}.pickle')

    scaler = MinMaxScaler()
    kmeans = KMeans(n_clusters=K, random_state=SEED)

    num_passenger_incentive = len(passenger_incentive_list)
    num_driver_incentive = len(driver_incentive_list)

    pred_prob_original = np.zeros((N_hist, num_passenger_incentive, num_driver_incentive))
    pred_prob_adjusted = np.zeros((N_hist, num_passenger_incentive, num_driver_incentive))

    fare_list = np.zeros(N_hist)
    compensation_list = np.zeros(N_hist)
    feature_list = np.zeros((N_hist, num_feature))

    error_list = np.zeros(N_hist)

    for n in range(0, N_hist):

        # obtain the feature for each data
        # NOTE: `.loc[n, ...]` looks rows up by index label, so this relies on
        # data_hist being indexed 0..N_hist-1.
        fare_list[n] = data_hist.loc[n, 'fare']
        compensation_list[n] = data_hist.loc[n, 'compensation']
        feature_list[n, :] = data_hist.loc[n, feature_name].values

        for i in range(0, num_passenger_incentive):
            for j in range(0, num_driver_incentive):
                pred_prob_original[n, i, j] = data_hist.loc[n, f'pred_prob_{i}_{j}']

        # Position of the incentive pair recorded for this row within the
        # candidate lists; `[0][0]` raises IndexError if the recorded value is
        # not one of the candidates.
        i = np.argwhere(passenger_incentive_list == data_hist.loc[n, 'passenger_incentive'])[0][0]
        j = np.argwhere(driver_incentive_list == data_hist.loc[n, 'driver_incentive'])[0][0]

        # obtain the error for each data
        error_list[n] = data_hist.loc[n, f'true_label_{i}_{j}'] - data_hist.loc[n, f'pred_prob_{i}_{j}']

    # normalize and cluster the features of all historical data
    # The scaled matrix is kept so the adjustment loop below can reuse it
    # instead of re-running the scaler on every single row.
    scaled_feature_list = scaler.fit_transform(feature_list)
    cluster_idx = kmeans.fit_predict(scaled_feature_list)
    feature_cluster_list = kmeans.cluster_centers_
    error_cluster_list = np.zeros(K)
    size_cluster_list = np.zeros(K)

    for k in range(0, K):
        in_cluster = cluster_idx == k
        # calculate the size information for each cluster
        size_cluster_list[k] = np.sum(in_cluster)
        # calculate the average error information for each cluster
        # An empty cluster keeps error 0: np.mean of an empty slice is NaN, and
        # NaN would poison the weighted sums below even though the cluster's
        # weight (its size) is 0.
        if size_cluster_list[k] > 0:
            error_cluster_list[k] = np.mean(error_list[in_cluster])

    for n in range(0, N_hist):
        # adjust the probability using the clustering information
        # RBF similarity between every cluster centre and this (scaled) record.
        kernel = pairwise_kernels(feature_cluster_list, scaled_feature_list[n].reshape(1, -1), metric='rbf', gamma=gamma).flatten()
        # Weight each cluster's mean error by similarity times cluster size.
        weight = kernel * size_cluster_list
        error = np.sum(weight * error_cluster_list) / np.sum(weight)
        # The same scalar correction is added to every incentive combination.
        pred_prob_adjusted[n] = pred_prob_original[n] + error

    lambda_h = solve_largrange_multiplier(N_hist, pred_prob_adjusted, fare_list, compensation_list, passenger_incentive_list, driver_incentive_list, B=B_hist, r=0)
    print('Seed', SEED, 'Multiplier', lambda_h)

    # Use a context manager so the file is flushed and closed deterministically
    # rather than whenever the anonymous file object gets garbage-collected.
    with open(f'results/results_offline_SHR_CLUSTER_{SEED}.pickle', 'wb') as result_file:
        pickle.dump([lambda_h, scaler, feature_cluster_list, error_cluster_list, size_cluster_list], result_file)