In [94]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [95]:
df_train = pd.read_csv('data/xgboost_distance_regression_train')
df_test = pd.read_csv('data/xgboost_distance_regression_test')


df_train = df_train[df_train['mode'] != np.nan]

df = pd.concat((df_train, df_test), ignore_index=True)
string_cols = ['survey_language', 'disability']
df[string_cols] = df[string_cols].astype('category')
df[string_cols] = df[string_cols].apply(lambda x: x.cat.codes)



In [96]:
df_train['mode'].unique()

array(['subway', 'drive', 'walk', 'passenger', 'bus', 'other', 'bike'],
      dtype=object)

In [97]:
TRANSPORT_MODES = ['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other']
candidate_cols = [
#      'travel_date_dow',
#      'o_purpose_category',
     'd_purpose_category',
     'num_non_hh_travelers',
#      'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
#      'license',
#      'planning_apps',
#      'industry',
#      'gender',
#      'education',
#      'survey_language',
#      'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
#      'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
#      'disability'
#     'trip_distance'
]

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(df[candidate_cols], columns=[x for x in categorial_columns if x in candidate_cols])

X_train_val = onehot.to_numpy()[:len(df_train)]
y_train_val = df['mode'][:len(df_train)].apply(lambda x: TRANSPORT_MODES.index(x))
group = df['person_id'].to_numpy()[:len(df_train)]

X_test = onehot.to_numpy()[len(df_train):]



XGBoost parameters: https://xgboost.readthedocs.io/en/stable/parameter.html

In [95]:
# param = {'objective': 'multi:softprob',
#          'num_class': 7, 
#          'eval_metric': ['mlogloss', 'merror']}

In [98]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, plot_confusion_matrix

import matplotlib.pyplot as plt
import matplotlib

def acc_pre_rec(y_true, y_pred, verbose=False):
    ''' Returns accuracy, precision, and recall together. 
    If verbose is set to True, it prints the scores for 
    each mode.'''
    acc = accuracy_score(y_true, y_pred)
    prec, rec, fsc, sup = precision_recall_fscore_support(
        y_true, y_pred)
    
    if verbose:
        print(f'Accuracy: \n    {acc*100:.3f}%')
        scrs = {'Precision': prec, 'Recall': rec}
        for k, v in scrs.items():
            str_ = '%;\n    '.join(
                f'{TRANSPORT_MODES[i]} - {100*s:.3f}'
                for i, s in enumerate(v)
            )
            print(f"{k}: \n    {str_}%")
            
    return acc, prec, rec


In [99]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ConfusionMatrixDisplay, log_loss


def k_fold_cross_validation_classification(X, y, group, model, fold=5):

    k_fold = GroupKFold(n_splits=5)

    train_metrics  = [] #[(accuracy, cross entropy) for each fold]
    val_metrics = []  
    

    for train_idx, validate_idx in k_fold.split(X, y, groups=group):
        X_train, X_val = X[train_idx], X[validate_idx]
        y_train, y_val = y[train_idx], y[validate_idx]
        
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        y_train_prob = model.predict_proba(X_train)
        
        y_val_pred = model.predict(X_val)
        y_val_prob = model.predict_proba(X_val)
        
        # training metrics
        acc, prec, rec = acc_pre_rec(y_train, y_train_pred)
        cross_entropy = log_loss(y_train, y_train_prob)
        train_metrics.append((acc, cross_entropy))
        
        # validation metrics
        acc, prec, rec = acc_pre_rec(y_val, y_val_pred)
        cross_entropy = log_loss(y_val, y_val_prob)
        val_metrics.append((acc, cross_entropy))
    
    return train_metrics, val_metrics


In [102]:
model = xgb.XGBClassifier(objective='multi:softprob',
                          n_estimators = 100,
                          n_jobs=-1,
                          num_class=7,
                          random_state=42,
                          eval_metric=['mlogloss', 'merror'],
                          use_label_encoder =False)


train_metrics, val_metrics = k_fold_cross_validation_classification(X_train_val, y_train_val, group, model)

In [103]:
train_metrics

[(0.6184122959911266, 0.9867083882391329),
 (0.621391221676438, 0.9764864369542978),
 (0.6262715892885438, 0.967563399860551),
 (0.6302012359372524, 0.9554267231513327),
 (0.6236531879832679, 0.9759766434630595)]

In [104]:
metrics = np.concatenate((train_metrics, val_metrics), axis=1)
metrics = metrics[:,[0, 2, 1, 3]]

df_metrics = pd.DataFrame(metrics, columns=['acc-train', 'acc-val', 'cross entropy-train', 'cros entropy-val'])

In [105]:
df_metrics

Unnamed: 0,acc-train,acc-val,cross entropy-train,cros entropy-val
0,0.618412,0.576626,0.986708,1.133357
1,0.621391,0.564203,0.976486,1.226159
2,0.626272,0.555837,0.967563,1.181026
3,0.630201,0.535683,0.955427,1.294288
4,0.623653,0.559204,0.975977,1.185926


## Parameter search

In [86]:
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV

distributions = {
    'learning_rate': loguniform(10**-4, 1),
}

model = xgb.XGBClassifier(objective='multi:softprob',
                          n_estimators = 100,
                          n_jobs=-1,
                          num_class=7,
                          random_state=42,
                          eval_metric=['mlogloss', 'merror'],
                          use_label_encoder =False)

def neg_log_loss(estimator, X, y):
    y_pred = estimator.predict_proba(X)
    print(-log_loss(y, y_pred))
    return -log_loss(y, y_pred)

k_fold = GroupKFold(n_splits=5)
search = RandomizedSearchCV(model,
                            distributions,
                            random_state=1,
                            n_jobs=-1,
                            scoring=neg_log_loss,
                            n_iter=10
                           ).fit(X_train_val, y_train_val)

print('best parameter', search.best_params_)
print('best score', search.best_score_)


best parameter {'learning_rate': 0.07608481233714792}
best score -1.1442509743048093


In [85]:
search.cv_results_

{'mean_fit_time': array([9.0586925 , 9.07957211, 9.02653799, 9.0358922 , 9.05830102,
        8.926371  , 9.05386844, 9.07585526, 9.16332622, 6.46034641]),
 'std_fit_time': array([0.14739635, 0.12149694, 0.21936335, 0.16507139, 0.23387401,
        0.09270123, 0.22283313, 0.11750754, 0.20019281, 3.04108482]),
 'mean_score_time': array([0.01704617, 0.0179132 , 0.01716881, 0.01493983, 0.01596665,
        0.01499977, 0.01840992, 0.01771216, 0.01517253, 0.01298838]),
 'std_score_time': array([0.00210323, 0.00187377, 0.00186885, 0.00101407, 0.00300942,
        0.00109471, 0.00160075, 0.00188963, 0.00226825, 0.00251978]),
 'param_learning_rate': masked_array(data=[0.001216494146415184, 0.03995972283135063,
                    1.0013176560941263e-05, 0.00032483503450866794,
                    5.4172571827679706e-05, 2.8952960233492824e-05,
                    8.536916958038756e-05, 0.000534322712559412,
                    0.0009634681692624377, 0.004944059287398676],
              mask=[False

## Train on whole training/validation data, with tuned parameters.

In [106]:
# train on whole training-validation dataset
model = xgb.XGBClassifier(objective='multi:softprob',
                          n_estimators = 100,
                          learning_rate=0.076, # obtained from parameter search, takes too much time to rerun so I'll just hard-code it here.
                          n_jobs=-1,
                          num_class=7,
                          random_state=42,
                          eval_metric=['mlogloss', 'merror'],
                          use_label_encoder =False)


model.fit(X, y)

In [107]:
y_test_prob = model.predict_proba(X_test)

In [108]:
y_test_prob

array([[0.06039679, 0.00209222, 0.08713622, ..., 0.01353482, 0.7440381 ,
        0.00669944],
       [0.06039679, 0.00209222, 0.08713622, ..., 0.01353482, 0.7440381 ,
        0.00669944],
       [0.08574916, 0.00282143, 0.10853294, ..., 0.01969968, 0.591495  ,
        0.01585623],
       ...,
       [0.03062928, 0.04641289, 0.01406188, ..., 0.00425692, 0.8613369 ,
        0.00852847],
       [0.03062928, 0.04641289, 0.01406188, ..., 0.00425692, 0.8613369 ,
        0.00852847],
       [0.0179481 , 0.02039165, 0.01594599, ..., 0.0057186 , 0.88780403,
        0.00758224]], dtype=float32)

In [109]:
df_test_prob = pd.DataFrame(y_test_prob)
df_test_prob = pd.concat((df_test['id'], df_test_prob), axis=1, ignore_index=True)
df_test_prob.columns = ['id'] + TRANSPORT_MODES 

In [110]:
df_test_prob

Unnamed: 0,id,drive,passenger,bus,subway,bike,walk,other
0,39498,0.060397,0.002092,0.087136,0.086102,0.013535,0.744038,0.006699
1,39499,0.060397,0.002092,0.087136,0.086102,0.013535,0.744038,0.006699
2,39500,0.085749,0.002821,0.108533,0.175846,0.019700,0.591495,0.015856
3,39501,0.066115,0.002304,0.093736,0.110587,0.027774,0.688176,0.011307
4,39502,0.051739,0.001733,0.048403,0.086415,0.019721,0.781771,0.010218
...,...,...,...,...,...,...,...,...
26289,65787,0.030629,0.046413,0.014062,0.034774,0.004257,0.861337,0.008528
26290,65788,0.030629,0.046413,0.014062,0.034774,0.004257,0.861337,0.008528
26291,65789,0.030629,0.046413,0.014062,0.034774,0.004257,0.861337,0.008528
26292,65790,0.030629,0.046413,0.014062,0.034774,0.004257,0.861337,0.008528


In [111]:
df_test_prob.to_csv('prediction/xgboost.csv', index=False)