In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
df_test = pd.read_csv('data/xgboost_distance_regression_merged_test')

In [3]:
df_train = pd.read_csv('data/xgboost_distance_regression_merged_train')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [25]:
# covert categorial features into one-hot encoding
selected_features = [
#      'travel_date_dow',
     'travel_date_dow_merged',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
     'age_merged',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
#      'industry',
#      'gender',
#      'education',
#      'education_merged',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
#      'res_type',
     'res_type_merged',
#      'rent_own',
#      'income_aggregate',
     'income_aggregate_merged',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
#      'disability',
     'trip_distance',
#      'trip_distance_category',
]

df_selected = df_train[selected_features]
df_test_selected = df_test[selected_features]
#df_selected.loc[:,'trip_distance_category'] = df_selected['trip_distance_category'].replace({"short":0, "medium":1, "long":2})

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(pd.concat([df_selected, df_test_selected],axis=0), columns=[x for x in categorial_columns if x in selected_features])
onehot_train = onehot.iloc[:len(df_selected),:]
onehot_test = onehot.iloc[len(df_selected):,:]

In [33]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df_train['mode'].replace(str_to_val).to_numpy()
X = onehot_train.to_numpy()

X_test = onehot_test.to_numpy()

## Model selection

In [6]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df_train['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [7]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

### performance

In [8]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

Define performance evaluation function for later use

In [9]:
def classifier_performance(best_estimator, X, y):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [10]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.268}

In [31]:
classifier_performance(best_estimator, X, y)

training loss:0.889409889702791	 validating loss:1.0892860158129967
training accuracy:0.6621454603073997	 validating accuracy:0.592977563696286
Confusion matrix:


array([[1884,   47,    1,   56,    1,  409,    1],
       [ 433,  114,    0,   10,    0,  179,    2],
       [  45,    9,    7,   62,    4,  303,    0],
       [ 110,   13,   16,  276,    7,  559,    1],
       [   8,    7,    0,   14,    3,   96,    0],
       [ 462,   54,   22,  127,    6, 2393,    2],
       [  34,    4,    2,   19,    1,   85,    1]], dtype=int64)

training loss:0.8890692837254917	 validating loss:1.1056082071531041
training accuracy:0.6591665346220884	 validating accuracy:0.5886677652427431
Confusion matrix:


array([[1776,   67,    2,   35,    1,  282,    0],
       [ 229,  106,    6,    8,    0,  164,    0],
       [  81,   21,   12,   70,    0,  263,    1],
       [  99,   14,   25,  360,    2,  540,    5],
       [  10,    0,    0,   21,   16,  101,    0],
       [ 642,   91,   25,  185,    3, 2374,   15],
       [ 122,   22,    1,   25,    0,   67,    0]], dtype=int64)

training loss:0.8849203613692828	 validating loss:1.0982034208342983
training accuracy:0.6614165742354619	 validating accuracy:0.5937381163645582
Confusion matrix:


array([[1883,   84,    2,   44,    2,  405,    4],
       [ 231,  123,    8,    6,    2,  238,    1],
       [  68,   24,   11,   94,    0,  289,    3],
       [  97,   28,   15,  326,    4,  539,    2],
       [  13,    1,    0,    2,    1,   60,    0],
       [ 469,   77,   28,  177,    9, 2339,    4],
       [  46,   15,    4,   35,    0,   75,    1]], dtype=int64)

training loss:0.8757894377637789	 validating loss:1.201708002061989
training accuracy:0.6644905720171129	 validating accuracy:0.5572315882874889
Confusion matrix:


array([[1759,  104,    8,   42,    0,  380,    4],
       [ 202,  110,    7,    8,    0,  210,    3],
       [  92,   10,   24,   83,    0,  312,    0],
       [ 125,   15,   24,  321,    5,  546,    2],
       [  46,    5,    0,   25,    3,  139,    0],
       [ 657,   47,   15,  124,    6, 2177,    5],
       [  75,    6,    3,   34,    0,  124,    2]], dtype=int64)

training loss:0.8874080404544211	 validating loss:1.1017415460608107
training accuracy:0.6600646469768031	 validating accuracy:0.5774594320486816
Confusion matrix:


array([[1886,   85,    1,   42,    0,  371,    1],
       [ 261,  123,    0,    5,    0,  215,    3],
       [  65,   20,   32,   65,    0,  266,    0],
       [ 146,   17,   34,  276,    6,  471,    1],
       [  31,    0,    0,   27,    6,   67,    0],
       [ 675,   41,   24,  178,   12, 2232,    2],
       [  55,   15,    2,   10,    2,  117,    0]], dtype=int64)

loss:1.1017415460608107	 accuracy:0.5774594320486816


## Random forest

### hyperparameters search

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [27]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [28]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 14,
 'n_estimators': 30}

In [29]:
classifier_performance(best_estimator, X, y)

training loss:0.9425008477544735	 validating loss:1.079533129711213
training accuracy:0.62757090793852	 validating accuracy:0.6047661300545063
Confusion matrix:


array([[2052,    2,    0,   26,    0,  319,    0],
       [ 522,   20,    0,    8,    0,  188,    0],
       [  51,    2,    0,   42,    0,  335,    0],
       [ 113,    0,    5,  235,    0,  629,    0],
       [  10,    0,    0,   14,    0,  104,    0],
       [ 511,   12,    4,   75,    0, 2464,    0],
       [  38,    1,    0,   15,    0,   92,    0]], dtype=int64)

training loss:0.9334661227049481	 validating loss:1.121217456093517
training accuracy:0.6314054824908889	 validating accuracy:0.5871466599061985
Confusion matrix:


array([[1823,   13,    0,   31,    0,  290,    6],
       [ 275,   32,    0,    6,    0,  200,    0],
       [  84,    4,    0,   56,    0,  304,    0],
       [ 111,    3,    0,  312,    0,  619,    0],
       [  13,    1,    0,   29,    0,  105,    0],
       [ 683,   62,    4,  121,    0, 2465,    0],
       [ 133,    4,    1,   22,    0,   77,    0]], dtype=int64)

training loss:0.9313396246915374	 validating loss:1.109603245110732
training accuracy:0.6345745523688797	 validating accuracy:0.592977563696286
Confusion matrix:


array([[1987,   24,    0,   21,    0,  391,    1],
       [ 293,   29,    1,    3,    0,  283,    0],
       [  75,    3,    0,   78,    0,  333,    0],
       [ 127,    1,    0,  231,    0,  652,    0],
       [  13,    0,    0,    1,    0,   63,    0],
       [ 561,    4,    0,  107,    0, 2431,    0],
       [  56,    6,    0,   30,    0,   84,    0]], dtype=int64)

training loss:0.9163489016362202	 validating loss:1.2133832687155743
training accuracy:0.6424972270638568	 validating accuracy:0.5596400050703512
Confusion matrix:


array([[1830,   17,    0,   24,    0,  426,    0],
       [ 241,   26,    0,    5,    0,  268,    0],
       [  90,    7,    0,   79,    0,  345,    0],
       [ 137,    1,    0,  288,    0,  612,    0],
       [  45,    1,    0,   18,    0,  154,    0],
       [ 657,    6,    0,   97,    0, 2271,    0],
       [  83,    0,    0,   30,    0,  131,    0]], dtype=int64)

training loss:0.9323643342424917	 validating loss:1.1276751852781723
training accuracy:0.6327481303080238	 validating accuracy:0.5704868154158215
Confusion matrix:


array([[1959,   31,    0,   39,    0,  357,    0],
       [ 321,   27,    0,    4,    0,  255,    0],
       [  70,    9,    5,   47,    0,  317,    0],
       [ 162,    4,    1,  221,    0,  563,    0],
       [  26,    0,    0,   21,    0,   84,    0],
       [ 726,   12,    5,  133,    0, 2288,    0],
       [  66,    4,    0,    9,    0,  122,    0]], dtype=int64)

loss:1.1276751852781723	 accuracy:0.5704868154158215


## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [34]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

classifier_performance(gnb, X, y)

training loss:9.660625358034274	 validating loss:10.117222675862747
training accuracy:0.17055934083346538	 validating accuracy:0.15781467866649765
Confusion matrix:


array([[ 527,   42,    4,   23, 1132,   26,  645],
       [ 154,   37,    4,   14,  324,   19,  186],
       [  24,    5,   40,   48,  208,   16,   89],
       [  13,    8,   36,  211,  524,   30,  160],
       [   0,    0,    0,    9,  108,    7,    4],
       [ 145,   42,   89,  192, 1944,  268,  386],
       [  15,    6,    1,   12,   54,    4,   54]], dtype=int64)

training loss:12.918177933585676	 validating loss:13.382202406166483
training accuracy:0.12913959752812548	 validating accuracy:0.12663201926733425
Confusion matrix:


array([[  97,  321,   43,   32, 1052,   51,  567],
       [   7,  174,   12,   11,  232,   12,   65],
       [   4,   28,   48,   51,  213,   16,   88],
       [   4,   31,   52,  169,  561,   31,  197],
       [   2,    5,    1,    5,  108,   10,   17],
       [  12,  227,   98,  203, 2097,  249,  449],
       [   3,    7,    3,   13,   53,    4,  154]], dtype=int64)

training loss:10.017706600449246	 validating loss:11.093700743446883
training accuracy:0.16884804309935034	 validating accuracy:0.13144885283305868
Confusion matrix:


array([[ 343,  114,   23,   54, 1181,   88,  621],
       [  62,   46,   19,   12,  317,   23,  130],
       [   3,    8,   71,   64,  224,   24,   95],
       [   7,    3,   91,  178,  551,   37,  144],
       [   3,    0,    0,    3,   59,    6,    6],
       [  85,   56,  153,  202, 1972,  280,  355],
       [   7,    8,    6,   14,   77,    4,   60]], dtype=int64)

training loss:12.065742239223294	 validating loss:13.062874120110456
training accuracy:0.13170654412929805	 validating accuracy:0.13525161617442008
Confusion matrix:


array([[ 276,  325,   29,   55, 1354,   86,  172],
       [  53,  129,    8,   13,  259,   11,   67],
       [  16,   10,   61,   43,  272,    8,  111],
       [  14,   13,   69,  176,  624,   33,  109],
       [   7,   13,    8,   10,  164,    5,   11],
       [  50,  182,  110,  165, 2147,  181,  196],
       [   4,    9,   14,    7,  129,    1,   80]], dtype=int64)

training loss:10.554333319913015	 validating loss:10.499130838152825
training accuracy:0.14044872607428063	 validating accuracy:0.12677484787018256
Confusion matrix:


array([[ 287,   96,  302,   39,  897,   80,  685],
       [  27,   67,   78,    9,  231,   35,  160],
       [   7,   12,  154,   17,  170,   24,   64],
       [   5,    4,  236,  135,  370,   14,  187],
       [   0,    0,    6,   16,   76,    2,   31],
       [  42,   36,  422,  166, 1804,  212,  482],
       [   4,    3,   25,    7,   89,    4,   69]], dtype=int64)

loss:10.499130838152825	 accuracy:0.12677484787018256


## Neural network

We don't use cross validation for neural network

In [35]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit

# train-validation split

gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=42)
for train_idx, test_idx in gss.split(X, y, groups):
    X_train, X_validate = X[train_idx,:], X[test_idx,:]
    y_train, y_validate = y[train_idx], y[test_idx]

parameters = {'learning_rate_init':np.arange(1e-4, 1e-2, 1e-3),
              'alpha': np.arange(1e-5,1e-3,1e-4),
              'learning_rate':['constant', 'invscaling', 'adaptive']}

ann = MLPClassifier(hidden_layer_sizes=(25,), activation = 'relu', solver = 'adam', max_iter=500, random_state=42)

lr_ann = RandomizedSearchCV(ann, parameters, random_state=0,  scoring = "neg_log_loss", n_iter = 5)
search_ann = lr_ann.fit(X_train, y_train)

In [37]:
best_estimator = search_ann.best_estimator_
best_score = search_ann.best_score_
best_param = search_ann.best_params_

In [38]:
best_param

{'learning_rate_init': 0.0011, 'learning_rate': 'invscaling', 'alpha': 0.00071}

In [76]:
proba_train = best_estimator.predict_proba(X_train)
proba_val = best_estimator.predict_proba(X_validate)

loss_train = log_loss(y_train, proba_train)
loss_val = log_loss(y_validate, proba_val)

acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))

print(f"training loss:{loss_train}\t validating loss:{loss_val}")

print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

# Confusion matrix
print('Confusion matrix:')
display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))

training loss:0.7768802163017411	 validating loss:1.0472762670038298
training accuracy:0.7273501942872724	 validating accuracy:0.6436770235503662
Confusion matrix:


array([[1970,  197,   20,  128,    4,  381,   27],
       [ 389,  246,   15,   46,    0,  118,    2],
       [  83,   42,   61,  179,    2,  239,    4],
       [ 220,   54,   52,  730,   24,  231,   14],
       [  22,   11,    4,   47,   20,  106,    0],
       [ 411,   53,   37,  115,   16, 3472,    7],
       [ 131,   22,    8,   57,    0,   83,    6]], dtype=int64)

### Train the best model with the whole dataset

In [29]:
best_param = {'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.268}
best_estimator = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False, **best_param)
best_estimator.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.268, max_delta_step=0,
              max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=40, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [30]:
proba = best_estimator.predict_proba(X_test)

In [24]:
result = pd.DataFrame({'id':df_test['id'].values, 'drive':proba[:,0], 'passenger':proba[:,1], 'bus':proba[:,2],'subway':proba[:,3],\
                      'bike':proba[:,4],'walk':proba[:,5],'other':proba[:,1],})
result.to_csv('xgboost_xgboost_test.csv',index=False)