In [6]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb

In [23]:
# Load the training and test splits.
df_train = pd.read_csv('data/xgboost_distance_regression_train')
df_test = pd.read_csv('data/xgboost_distance_regression_test')

# Keep only the training rows that carry a target label. NaN never compares
# equal to anything (itself included), so `!= np.nan` is True for every row
# and filters nothing; `notna()` is the actual missing-value test.
df_train = df_train[df_train['mode'].notna()]

# Stack train on top of test so both get the same categorical encoding.
df = pd.concat((df_train, df_test), ignore_index=True)
string_cols = ['survey_language', 'disability']
# Replace each string column by its integer category code
# (pandas assigns -1 to missing values).
df[string_cols] = df[string_cols].astype('category')
df[string_cols] = df[string_cols].apply(lambda x: x.cat.codes)



In [25]:
# List the distinct values found in the training 'mode' column.
df_train['mode'].unique()

array(['subway', 'drive', 'walk', 'passenger', 'bus', 'other', 'bike'],
      dtype=object)

In [27]:
# Class labels; a mode's position in this list is its integer class id.
TRANSPORT_MODES = ['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other']

# Features fed to the model.
# Columns currently left out (add them to the list to enable them):
#   travel_date_dow, o_purpose_category, num_hh_travelers, o_location_type,
#   d_location_type, age, employment, student, license, planning_apps,
#   industry, gender, education, survey_language, num_bicycles, rent_own,
#   income_aggregate, num_people, num_adults, num_kids, num_workers,
#   num_students, disability, trip_distance
candidate_cols = [
    'd_purpose_category',
    'num_non_hh_travelers',
    'num_travelers',
    'o_congestion',
    'd_congestion',
    'num_vehicles',
    'res_type',
]

# Columns that are one-hot encoded whenever they are selected above.
categorial_columns = [
    'travel_date_dow', 'o_purpose_category', 'd_purpose_category',
    'o_location_type', 'd_location_type', 'age', 'employment', 'license',
    'planning_apps', 'industry', 'gender', 'survey_language',
    'res_type', 'rent_own', 'disability',
]

selected_categoricals = [col for col in categorial_columns if col in candidate_cols]
onehot = pd.get_dummies(df[candidate_cols], columns=selected_categoricals)

# The combined frame holds the training rows first, followed by the test rows.
n_train = len(df_train)
features = onehot.to_numpy()

X_train_val = features[:n_train]
y_train_val = df['mode'][:n_train].apply(TRANSPORT_MODES.index)
group = df['person_id'].to_numpy()[:n_train]

X_test = features[n_train:]



XGBoost parameters: https://xgboost.readthedocs.io/en/stable/parameter.html

In [95]:
# param = {'objective': 'multi:softprob',
#          'num_class': 7, 
#          'eval_metric': ['mlogloss', 'merror']}

In [42]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, plot_confusion_matrix

import matplotlib.pyplot as plt
import matplotlib

def acc_pre_rec(y_true, y_pred, verbose=False):
    '''Compute accuracy plus per-class precision and recall.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class ids (indices into ``TRANSPORT_MODES``).
    y_pred : array-like
        Predicted class ids, same length as ``y_true``.
    verbose : bool, default False
        When True, print the accuracy and the precision/recall of each
        mode.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)``. Precision and recall are arrays
        with one entry per class id that occurs in ``y_true`` or
        ``y_pred``, in ascending class-id order.
    '''
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 keeps the score sklearn already assigns to a class that
    # is never predicted (or never occurs), but without emitting an
    # UndefinedMetricWarning for every fold.
    prec, rec, _, _ = precision_recall_fscore_support(
        y_true, y_pred, zero_division=0)

    if verbose:
        # The score arrays only cover the class ids that actually occur, so
        # look each mode name up by class id instead of by array position;
        # otherwise the names shift whenever a class is absent.
        present = np.unique(np.concatenate(
            [np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()]))
        print(f'Accuracy: \n    {acc*100:.3f}%')
        scrs = {'Precision': prec, 'Recall': rec}
        for k, v in scrs.items():
            str_ = '%;\n    '.join(
                f'{TRANSPORT_MODES[int(label)]} - {100*s:.3f}'
                for label, s in zip(present, v)
            )
            print(f"{k}: \n    {str_}%")

    return acc, prec, rec


In [46]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ConfusionMatrixDisplay, log_loss


def k_fold_cross_validation_classification(X, y, group, model, fold=5):
    '''Evaluate ``model`` with group-aware k-fold cross-validation.

    The model is refit on the training part of every fold; accuracy and
    cross entropy (log loss) are recorded on both the training and the
    validation part.

    Parameters
    ----------
    X : array supporting integer-array row indexing (e.g. ``numpy.ndarray``)
        Feature matrix.
    y : array-like
        Class label of each row of ``X``.
    group : array-like
        Group id of each row; rows sharing a group id never end up on both
        sides of a split.
    model : classifier
        Object providing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place, so after the call it holds the last fold's fit.
    fold : int, default 5
        Number of folds.

    Returns
    -------
    tuple of list
        ``(train_metrics, val_metrics)``; each list holds one
        ``(accuracy, cross_entropy)`` pair per fold.
    '''
    # Honour the requested number of folds (previously fixed at 5).
    k_fold = GroupKFold(n_splits=fold)

    # Work on a plain array so the integer fold indices always select rows by
    # position, even if y arrives as a pandas Series with a non-default index.
    y = np.asarray(y)

    train_metrics = []  # [(accuracy, cross entropy) for each fold]
    val_metrics = []

    for train_idx, validate_idx in k_fold.split(X, y, groups=group):
        X_train, X_val = X[train_idx], X[validate_idx]
        y_train, y_val = y[train_idx], y[validate_idx]

        model.fit(X_train, y_train)

        y_train_pred = model.predict(X_train)
        y_train_prob = model.predict_proba(X_train)

        y_val_pred = model.predict(X_val)
        y_val_prob = model.predict_proba(X_val)

        # training metrics
        acc, _, _ = acc_pre_rec(y_train, y_train_pred)
        cross_entropy = log_loss(y_train, y_train_prob)
        train_metrics.append((acc, cross_entropy))

        # validation metrics
        acc, _, _ = acc_pre_rec(y_val, y_val_pred)
        cross_entropy = log_loss(y_val, y_val_prob)
        val_metrics.append((acc, cross_entropy))

    return train_metrics, val_metrics


In [47]:
# Multi-class XGBoost classifier that outputs one probability per class.
xgb_params = dict(
    objective='multi:softprob',
    n_estimators=10,
    n_jobs=-1,
    num_class=7,
    random_state=42,
    eval_metric=['mlogloss', 'merror'],
    use_label_encoder=False,
)
model = xgb.XGBClassifier(**xgb_params)

# Grouped cross-validation on the training data (default number of folds).
train_metrics, val_metrics = k_fold_cross_validation_classification(
    X_train_val, y_train_val, group, model)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
# Show the per-fold (accuracy, cross entropy) pairs measured on the training folds.
train_metrics

[(0.5906512438599271, 1.1234847123938154),
 (0.5945175091110759, 1.1148023083572285),
 (0.5933766439549992, 1.1119159501014657),
 (0.598985897639043, 1.0947881090241516),
 (0.5976993281784764, 1.1128541707996331)]

In [53]:
# Put each fold's train and validation scores side by side. After stacking,
# the columns are (acc-train, cross entropy-train, acc-val, cross entropy-val);
# the reorder places the two accuracies first and the two cross entropies last.
metrics = np.concatenate((train_metrics, val_metrics), axis=1)
metrics = metrics[:,[0, 2, 1, 3]]

# Column label spelling fixed: 'cross entropy-val' (was 'cros entropy-val').
df_metrics = pd.DataFrame(metrics, columns=['acc-train', 'acc-val', 'cross entropy-train', 'cross entropy-val'])

In [54]:
# Show the per-fold train/validation metrics table.
df_metrics

Unnamed: 0,acc-train,acc-val,cross entropy-train,cros entropy-val
0,0.590651,0.585879,1.123485,1.134877
1,0.594518,0.569527,1.114802,1.181012
2,0.593377,0.5874,1.111916,1.158207
3,0.598986,0.55457,1.094788,1.239469
4,0.597699,0.56288,1.112854,1.174227


In [55]:
# Train on the whole training-validation dataset.
# The notebook-level arrays are X_train_val / y_train_val; bare `X` and `y`
# exist only as parameters of k_fold_cross_validation_classification, so
# referring to them here raises NameError on a fresh kernel.
model.fit(X_train_val, y_train_val)

In [59]:
# Predict class probabilities for the test rows.
y_test_prob = model.predict_proba(X_test)

In [60]:
# Display the predicted test probabilities.
y_test_prob

array([[0.06583673, 0.02161659, 0.08684087, ..., 0.03143208, 0.678451  ,
        0.0257199 ],
       [0.06583673, 0.02161659, 0.08684087, ..., 0.03143208, 0.678451  ,
        0.0257199 ],
       [0.08562235, 0.02623584, 0.10539789, ..., 0.0381488 , 0.545269  ,
        0.03881695],
       ...,
       [0.03779454, 0.04302442, 0.02779315, ..., 0.01823059, 0.80556256,
        0.02602088],
       [0.03779454, 0.04302442, 0.02779315, ..., 0.01823059, 0.80556256,
        0.02602088],
       [0.03695714, 0.03290609, 0.03161744, ..., 0.02090146, 0.7957875 ,
        0.02542345]], dtype=float32)

In [74]:
# Prefix the probability matrix with the test-row ids, then name the
# probability columns after the modes in class-id order.
df_test_prob = pd.concat(
    [df_test['id'], pd.DataFrame(y_test_prob)],
    axis=1,
    ignore_index=True,
)
df_test_prob.columns = ['id', *TRANSPORT_MODES]

In [75]:
# Display the prediction table (id plus one probability column per mode).
df_test_prob

Unnamed: 0,id,drive,passenger,bus,subway,bike,walk,other
0,39498,0.065837,0.021617,0.086841,0.090103,0.031432,0.678451,0.025720
1,39499,0.065837,0.021617,0.086841,0.090103,0.031432,0.678451,0.025720
2,39500,0.085622,0.026236,0.105398,0.160509,0.038149,0.545269,0.038817
3,39501,0.069929,0.022960,0.092239,0.101232,0.041656,0.643923,0.028060
4,39502,0.058295,0.018841,0.058601,0.092070,0.034182,0.708742,0.029269
...,...,...,...,...,...,...,...,...
26289,65787,0.037795,0.043024,0.027793,0.041574,0.018231,0.805563,0.026021
26290,65788,0.037795,0.043024,0.027793,0.041574,0.018231,0.805563,0.026021
26291,65789,0.037795,0.043024,0.027793,0.041574,0.018231,0.805563,0.026021
26292,65790,0.037795,0.043024,0.027793,0.041574,0.018231,0.805563,0.026021


In [77]:
# Write the per-mode probabilities to disk. The output folder is created
# first because to_csv does not create missing parent directories.
output_path = Path('prediction/xgboost.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_test_prob.to_csv(output_path, index=False)