In [17]:
import xgboost as xgb
import pandas as pd
import numpy as np
from utils import k_fold_cross_validation_classification, acc_pre_rec

In [20]:
# Load the train and test tables.
df_train = pd.read_csv('data/xgboost_distance_classification_prob_merged_train')
df_test = pd.read_csv('data/xgboost_distance_classification_prob_merged_test')

# Keep only training rows that carry a label.
# NaN never compares equal to anything (itself included), so a `!= np.nan`
# comparison is True for every row and filters nothing; notna() is the
# correct missing-value test.
df_train = df_train[df_train['mode'].notna()]

# Stack train and test so both share one category -> integer-code mapping.
# ignore_index=True gives the combined frame a fresh 0..n-1 index, with the
# training rows first.
df = pd.concat((df_train, df_test), ignore_index=True)

# Replace the string-valued columns by their integer category codes
# (pandas assigns code -1 to missing values).
string_cols = ['survey_language', 'disability']
df[string_cols] = df[string_cols].apply(lambda col: col.astype('category').cat.codes)



In [21]:
# Show the training DataFrame (rich notebook display).
df_train

Unnamed: 0,id,person_id,trip_n,travel_date,travel_date_dow,o_purpose_category,d_purpose_category,num_non_hh_travelers,num_hh_travelers,num_travelers,...,mode,trip_distance_category,age_merged,income_aggregate_merged,res_type_merged,education_merged,travel_date_dow_merged,trip_distance_short_prob,trip_distance_medium_prob,trip_distance_long_prob
0,0,0,0,30/05/2019,4,7,2,0,1,1,...,subway,long,1,1,0,1,0,0.029140,0.057234,0.913626
1,1,0,1,01/06/2019,6,6,1,0,1,1,...,drive,long,1,1,0,1,1,0.217593,0.449712,0.332695
2,2,0,2,02/06/2019,7,7,7,1,2,3,...,drive,long,1,1,0,1,1,0.167383,0.429124,0.403492
3,3,0,3,02/06/2019,7,7,7,1,2,3,...,drive,long,1,1,0,1,1,0.167383,0.429124,0.403492
4,4,0,4,03/06/2019,1,7,7,0,1,1,...,drive,medium,1,1,0,1,0,0.461662,0.314734,0.223605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39439,39493,1794,53,07/06/2019,5,6,6,0,2,2,...,walk,short,1,0,2,1,0,0.612537,0.284326,0.103137
39440,39494,1794,54,07/06/2019,5,6,6,0,2,2,...,walk,short,1,0,2,1,0,0.612537,0.284326,0.103137
39441,39495,1794,55,07/06/2019,5,6,6,0,2,2,...,walk,short,1,0,2,1,0,0.612537,0.284326,0.103137
39442,39496,1794,56,07/06/2019,5,6,8,0,2,2,...,walk,medium,1,0,2,1,0,0.546916,0.318323,0.134761


In [22]:
# Class labels; a mode's position in this list is its integer class id.
TRANSPORT_MODES = ['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other']

# Columns fed to the model. Commented-out entries are currently excluded.
candidate_cols = [
    # 'travel_date_dow',
    'o_purpose_category',
    'd_purpose_category',
    'num_non_hh_travelers',
    'num_hh_travelers',
    'num_travelers',
    # 'o_location_type',
    # 'd_location_type',
    'o_congestion',
    'd_congestion',
    # 'age',
    # 'employment',
    # 'student',
    'license',
    # 'planning_apps',
    # 'industry',
    # 'gender',
    # 'education',
    # 'survey_language',
    # 'num_bicycles',
    'num_vehicles',
    # 'res_type',
    # 'rent_own',
    # 'income_aggregate',
    # 'num_people',
    # 'num_adults',
    # 'num_kids',
    # 'num_workers',
    # 'num_students',
    # 'disability',
    'res_type_merged',
    'income_aggregate_merged',
    'age_merged',
    'travel_date_dow_merged',
    'education_merged',
    'trip_distance_short_prob',
    'trip_distance_medium_prob',
    'trip_distance_long_prob',
]

# Columns to be one-hot encoded whenever they are selected above.
categorial_columns = [
    'travel_date_dow',
    'o_purpose_category', 'd_purpose_category', 'o_location_type',
    'd_location_type', 'age', 'employment', 'license', 'planning_apps',
    'industry', 'gender', 'survey_language',
    'res_type', 'rent_own', 'disability',
]

# One-hot encode only the categorical columns that are actually selected;
# every other selected column passes through unchanged.
onehot = pd.get_dummies(df[candidate_cols], columns=[x for x in categorial_columns if x in candidate_cols])

# Convert to a matrix once. An explicit float dtype keeps the result numeric
# even when the dummy columns are bool (the get_dummies default since
# pandas 2.0), where a bare to_numpy() on mixed bool/numeric columns would
# produce an object array.
features = onehot.to_numpy(dtype=float)

# `df` is the train rows followed by the test rows, so the first
# len(df_train) rows are the training/validation part.
n_train = len(df_train)
X_train_val = features[:n_train]
X_test = features[n_train:]

# Encode the target as the index of the mode in TRANSPORT_MODES.
mode_to_class = {mode: class_id for class_id, mode in enumerate(TRANSPORT_MODES)}
train_modes = df['mode'][:n_train]
y_train_val = train_modes.map(mode_to_class)
if y_train_val.isna().any():
    # Same failure type as list.index() on an unknown value, but reported
    # with the offending labels.
    unknown_modes = train_modes[y_train_val.isna()].unique().tolist()
    raise ValueError(f'mode values not in TRANSPORT_MODES: {unknown_modes}')
y_train_val = y_train_val.astype(int)

# person_id of every training row, passed to the cross-validation as the
# group key.
group = df['person_id'].to_numpy()[:n_train]




XGBoost parameters: https://xgboost.readthedocs.io/en/stable/parameter.html

In [23]:
# Baseline multi-class XGBoost configuration used for cross-validation.
baseline_params = dict(
    objective='multi:softprob',
    n_estimators=100,
    learning_rate=0.138,
    n_jobs=-1,
    num_class=7,
    random_state=42,
    eval_metric=['mlogloss', 'merror'],
    use_label_encoder=False,
)
model = xgb.XGBClassifier(**baseline_params)

# Cross-validate with the helper from the local `utils` module; `group`
# holds the person_id of each row (presumably used to keep one person's
# trips in the same fold - TODO confirm in utils).
train_metrics, val_metrics = k_fold_cross_validation_classification(
    X_train_val,
    y_train_val,
    group,
    model,
)

  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# Per-fold training metrics returned by the cross-validation helper.
train_metrics

[(0.6857867215972112, 0.8237470862901479),
 (0.6846775471399145, 0.8263189519581261),
 (0.6883219774996039, 0.8221343247119719),
 (0.6934875614007289, 0.8050595469247461),
 (0.6916909620991254, 0.8160039966426081)]

In [30]:
# Place the per-fold train and validation metrics side by side, giving the
# columns [train acc, train cross entropy, val acc, val cross entropy].
metrics = np.hstack((train_metrics, val_metrics))

# Regroup so that the two accuracies come first, then the two cross entropies.
column_order = [0, 2, 1, 3]
metrics = metrics[:, column_order]

# The column names are written to the stats file as-is, so their spelling is
# left untouched.
metric_names = ['acc-train', 'acc-val', 'cross entropy-train', 'cros entropy-val']
df_metrics = pd.DataFrame(data=metrics, columns=metric_names)

In [31]:
# Per-fold train/validation accuracy and cross entropy, one row per fold.
df_metrics

Unnamed: 0,acc-train,acc-val,cross entropy-train,cros entropy-val
0,0.685787,0.599442,0.823747,1.075036
1,0.684678,0.593231,0.826319,1.116948
2,0.688322,0.60071,0.822134,1.100199
3,0.693488,0.556218,0.80506,1.22267
4,0.691691,0.56681,0.816004,1.135596


In [32]:
# Persist the per-fold metrics. The target path has no file extension, and
# since index=False is not passed, the row index is written as well.
df_metrics.to_csv('stats/xgboost_mode_categorized')

## Parameter search

In [33]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV, GroupKFold

# Search space: only the learning rate is tuned, sampled log-uniformly
# between 10**-4 and 1.
distributions = dict(learning_rate=loguniform(10**-4, 1))

# Estimator handed to the search; learning_rate is left unset here because
# the search supplies it.
search_model_params = dict(
    objective='multi:softprob',
    n_estimators=100,
    n_jobs=4,
    num_class=7,
    random_state=42,
    eval_metric=['mlogloss', 'merror'],
    use_label_encoder=False,
)
model = xgb.XGBClassifier(**search_model_params)

def neg_log_loss(estimator, X, y):
    """Score ``estimator`` on ``(X, y)`` as the negative log loss (higher is better).

    The ``(estimator, X, y)`` signature is the one scikit-learn accepts for a
    ``scoring`` callable. In this notebook the search is configured with the
    built-in ``'neg_log_loss'`` string, so this callable is an alternative
    that additionally prints each score.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``.
    X : feature matrix to score on.
    y : true class labels for ``X``.

    Returns
    -------
    float
        ``-log_loss(y, estimator.predict_proba(X))``.
    """
    # The notebook never imports log_loss, so bring it into scope here.
    from sklearn.metrics import log_loss

    y_pred = estimator.predict_proba(X)
    # Pass the estimator's own label set so the columns of y_pred still line
    # up when this fold's y happens to lack one of the classes. Falls back to
    # label inference (labels=None) if the estimator has no classes_.
    known_labels = getattr(estimator, 'classes_', None)
    # Compute the loss once and reuse it for both the print and the return.
    score = -log_loss(y, y_pred, labels=known_labels)
    print(score)
    return score

# Grouped 5-fold splitter; the groups are supplied at fit time.
k_fold = GroupKFold(n_splits=5)

# Randomized search over `distributions`: 20 sampled candidates, scored with
# scikit-learn's built-in negative log loss.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=distributions,
    n_iter=20,
    scoring='neg_log_loss',
    n_jobs=4,
    cv=k_fold,
    random_state=1,
)
search.fit(X_train_val, y_train_val, groups=group)

print('best parameter', search.best_params_)
print('best score', search.best_score_)


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


KeyboardInterrupt: 

In [11]:
# Full cross-validation results of the randomized search (fit/score timings
# and scores for every sampled candidate).
search.cv_results_

{'mean_fit_time': array([29.53360972, 31.4361516 , 32.63969784, 31.80048542, 31.80072756,
        31.95987124, 32.09798398, 32.12544904, 32.62127662, 33.38728733,
        34.13237529, 34.16185617, 33.79377985, 31.06667085, 34.236268  ,
        33.52530828, 33.58916059, 34.07106066, 35.31015878, 34.15357633]),
 'std_fit_time': array([1.10680398, 0.47702197, 0.36125041, 0.1169873 , 0.40438678,
        0.32963035, 0.26257833, 0.26399307, 0.47588401, 0.35010389,
        0.54597674, 0.16685064, 0.8211521 , 0.27660427, 0.68154286,
        0.1559721 , 0.40296493, 0.60561158, 0.66630555, 1.66805205]),
 'mean_score_time': array([0.04287691, 0.04817863, 0.04554868, 0.04418831, 0.0469636 ,
        0.04533377, 0.04570761, 0.04482813, 0.04652901, 0.04656291,
        0.04649839, 0.05073004, 0.04783802, 0.05267715, 0.051265  ,
        0.05048618, 0.04828382, 0.05009952, 0.05207973, 0.04561291]),
 'std_score_time': array([0.00703152, 0.00200324, 0.00244287, 0.0015174 , 0.00566742,
        0.00214633, 

## Train on whole training/validation data, with tuned parameters.

In [12]:
# Final model: fit on the complete training/validation set.
final_params = dict(
    objective='multi:softprob',
    n_estimators=100,
    # Obtained from the parameter search; hard-coded here because rerunning
    # the search takes too much time.
    learning_rate=0.055,
    n_jobs=-1,
    num_class=7,
    random_state=42,
    eval_metric=['mlogloss', 'merror'],
    use_label_encoder=False,
)
model = xgb.XGBClassifier(**final_params)

model.fit(X_train_val, y_train_val)

In [13]:
# Class-probability predictions for the test rows: one row per test sample,
# one column per class.
y_test_prob = model.predict_proba(X_test)

In [14]:
# Inspect the predicted probability matrix.
y_test_prob

array([[0.04802497, 0.00512609, 0.08704595, ..., 0.01706883, 0.75014204,
        0.0093229 ],
       [0.04802497, 0.00512609, 0.08704595, ..., 0.01706883, 0.75014204,
        0.0093229 ],
       [0.08796829, 0.00724865, 0.09751408, ..., 0.02076613, 0.6143551 ,
        0.01681341],
       ...,
       [0.01342414, 0.01747837, 0.01264768, ..., 0.00444878, 0.9222363 ,
        0.00742382],
       [0.01342414, 0.01747837, 0.01264768, ..., 0.00444878, 0.9222363 ,
        0.00742382],
       [0.01628765, 0.01841126, 0.01669242, ..., 0.00909512, 0.8962816 ,
        0.00644125]], dtype=float32)

In [15]:
# Wrap the probability matrix in a frame and put the test ids in front of it.
class_probs = pd.DataFrame(y_test_prob)
df_test_prob = pd.concat([df_test['id'], class_probs], axis=1, ignore_index=True)

# ignore_index=True replaced the column labels by 0..n, so name them again:
# the id first, then one column per transport mode in class-id order.
df_test_prob.columns = ['id', *TRANSPORT_MODES]

In [16]:
# Inspect the labelled prediction table.
df_test_prob

Unnamed: 0,id,drive,passenger,bus,subway,bike,walk,other
0,39498,0.048025,0.005126,0.087046,0.083269,0.017069,0.750142,0.009323
1,39499,0.048025,0.005126,0.087046,0.083269,0.017069,0.750142,0.009323
2,39500,0.087968,0.007249,0.097514,0.155334,0.020766,0.614355,0.016813
3,39501,0.063746,0.006355,0.092687,0.106486,0.031983,0.687186,0.011558
4,39502,0.052509,0.005285,0.048704,0.085983,0.026596,0.771268,0.009655
...,...,...,...,...,...,...,...,...
26289,65787,0.013424,0.017478,0.012648,0.022341,0.004449,0.922236,0.007424
26290,65788,0.013424,0.017478,0.012648,0.022341,0.004449,0.922236,0.007424
26291,65789,0.013424,0.017478,0.012648,0.022341,0.004449,0.922236,0.007424
26292,65790,0.013424,0.017478,0.012648,0.022341,0.004449,0.922236,0.007424


In [17]:
# Write the predictions to disk without the row index.
df_test_prob.to_csv('prediction/xgboost_with_distance_classification_prob.csv', index=False)