In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
df_test = pd.read_csv('data/xgboost_distance_regression_merged_test')

In [3]:
df_train = pd.read_csv('data/data_processed.csv')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [4]:
# covert categorial features into one-hot encoding
selected_features = [
     'travel_date_dow',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
     'industry',
#      'gender',
#      'education',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
     'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
#      'disability',
     'trip_distance',
#      'trip_distance_category',
]

df_selected = df_train[selected_features]
df_test_selected = df_test[selected_features]
#df_selected.loc[:,'trip_distance_category'] = df_selected['trip_distance_category'].replace({"short":0, "medium":1, "long":2})

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(pd.concat([df_selected, df_test_selected],axis=0), columns=[x for x in categorial_columns if x in selected_features])
onehot_train = onehot.iloc[:len(df_selected),:]
onehot_test = onehot.iloc[len(df_selected):,:]

In [5]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df_train['mode'].replace(str_to_val).to_numpy()
X = onehot_train.to_numpy()

X_test = onehot_test.to_numpy()

## Model selection

In [10]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df_train['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [6]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

### performance

In [7]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

Define performance evaluation function for later use

In [8]:
def classifier_performance(best_estimator, X, y):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [9]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}

In [11]:
import xgboost as xgb

best_param = {'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}
best_estimator = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False, **best_param)
classifier_performance(best_estimator, X, y)

training loss:0.7738068046769907	 validating loss:0.878634037049283
training accuracy:0.7241958485184599	 validating accuracy:0.6780327037647357
Confusion matrix:


array([[1903,   24,    8,  106,    1,  356,    1],
       [ 498,  117,    1,   16,    0,  105,    1],
       [  56,   22,   36,  157,    2,  157,    0],
       [ 129,   50,   40,  595,    0,  167,    1],
       [  13,    8,    0,   67,    1,   39,    0],
       [ 218,   28,   18,  103,    3, 2695,    1],
       [  27,   22,    5,   37,    1,   52,    2]], dtype=int64)

training loss:0.7702628275667303	 validating loss:0.909441603161371
training accuracy:0.7239106322294406	 validating accuracy:0.6866523006718215
Confusion matrix:


array([[1672,   76,    5,   92,    0,  256,   62],
       [ 253,  168,    5,   15,    0,   70,    2],
       [  73,   33,   33,  132,    1,  175,    1],
       [ 113,   60,   35,  644,    3,  190,    0],
       [  19,    5,    1,   53,    8,   62,    0],
       [ 303,   41,   18,   86,    0, 2887,    0],
       [ 117,   30,    3,   37,    1,   44,    5]], dtype=int64)

training loss:0.765197689219013	 validating loss:0.8931613523970653
training accuracy:0.7252099508794169	 validating accuracy:0.6923564456838636
Confusion matrix:


array([[1857,   62,    3,  108,    5,  378,   11],
       [ 263,  192,    7,   20,    5,  122,    0],
       [  75,   34,   34,  173,    3,  170,    0],
       [ 121,   49,   29,  630,    5,  177,    0],
       [  14,    1,    0,    8,    4,   50,    0],
       [ 213,   27,   17,   99,    2, 2744,    1],
       [  51,   25,    3,   60,    0,   36,    1]], dtype=int64)

training loss:0.7475979090115311	 validating loss:1.0108504049385048
training accuracy:0.7343051814292505	 validating accuracy:0.6591456458359741
Confusion matrix:


array([[1691,   99,    8,  108,    2,  383,    6],
       [ 225,  194,    7,   23,    1,   90,    0],
       [  89,   34,   62,  168,    0,  163,    5],
       [ 138,   36,   41,  608,    3,  208,    4],
       [  38,    2,    3,   54,    3,  118,    0],
       [ 216,   45,   24,  103,    0, 2641,    2],
       [  89,   22,    1,   61,    0,   70,    1]], dtype=int64)

training loss:0.7660555516687558	 validating loss:0.9164678141385937
training accuracy:0.7266763848396501	 validating accuracy:0.681921906693712
Confusion matrix:


array([[1863,   77,    7,  146,    1,  290,    2],
       [ 292,  197,    3,   12,    0,  100,    3],
       [  63,   31,   44,  133,    0,  177,    0],
       [ 170,   50,   37,  531,    6,  155,    2],
       [  22,    9,    0,   49,   13,   38,    0],
       [ 277,   34,   16,   95,    9, 2731,    2],
       [  59,   19,    4,   31,    0,   88,    0]], dtype=int64)

loss:0.9164678141385937	 accuracy:0.681921906693712


## Random forest

### hyperparameters search

In [25]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [12]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [13]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 14,
 'n_estimators': 30}

In [81]:
best_param = {'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 14,
 'n_estimators': 30}
best_estimator = RandomForestClassifier(random_state=42, **best_param)
classifier_performance(best_estimator, X, y)

training loss:0.8005732503523793	 validating loss:0.9247948698210914
training accuracy:0.7189035018222152	 validating accuracy:0.6791735327671441
Confusion matrix:


array([[1913,    9,    2,  103,    0,  372,    0],
       [ 502,   96,    0,   19,    0,  121,    0],
       [  77,   21,    5,  160,    0,  167,    0],
       [ 144,   39,    5,  615,    0,  179,    0],
       [  12,    7,    0,   70,    0,   39,    0],
       [ 234,   18,    1,   84,    0, 2729,    0],
       [  33,   17,    0,   43,    0,   53,    0]], dtype=int64)

training loss:0.7935573251068186	 validating loss:0.9388636086035871
training accuracy:0.7186499762319759	 validating accuracy:0.6928634807960451
Confusion matrix:


array([[1761,   45,    0,   82,    0,  272,    3],
       [ 264,  139,    0,   16,    0,   94,    0],
       [  79,   28,    6,  141,    0,  194,    0],
       [ 115,   59,    3,  657,    0,  211,    0],
       [  23,    6,    0,   55,    0,   64,    0],
       [ 322,   24,    0,   85,    0, 2903,    1],
       [ 136,   15,    0,   44,    0,   42,    0]], dtype=int64)

training loss:0.794330642066712	 validating loss:0.9382094653129953
training accuracy:0.7244176834099192	 validating accuracy:0.691469134237546
Confusion matrix:


array([[1893,   35,    0,  107,    0,  386,    3],
       [ 277,  168,    0,   23,    0,  141,    0],
       [  70,   29,    2,  190,    0,  198,    0],
       [ 129,   33,    0,  636,    1,  212,    0],
       [  13,    1,    0,   12,    0,   51,    0],
       [ 232,   16,    1,   98,    0, 2756,    0],
       [  53,   18,    0,   64,    0,   41,    0]], dtype=int64)

training loss:0.7764196203219393	 validating loss:1.0478734028369634
training accuracy:0.7248930438916178	 validating accuracy:0.6595259221701103
Confusion matrix:


array([[1728,   52,    3,  104,    0,  410,    0],
       [ 239,  180,    0,   23,    0,   98,    0],
       [  88,   32,    0,  212,    0,  189,    0],
       [ 149,   33,    0,  626,    0,  230,    0],
       [  47,    0,    0,   49,    0,  122,    0],
       [ 230,   37,    0,   95,    0, 2669,    0],
       [  87,   19,    1,   59,    0,   78,    0]], dtype=int64)

training loss:0.7967294558884229	 validating loss:0.9691317024858053
training accuracy:0.7211623779946761	 validating accuracy:0.6828093306288032
Confusion matrix:


array([[1908,   34,    3,  123,    0,  318,    0],
       [ 315,  143,    0,    9,    0,  140,    0],
       [  70,   21,    4,  139,    0,  214,    0],
       [ 172,   34,    0,  563,    0,  182,    0],
       [  25,    7,    0,   48,    3,   48,    0],
       [ 289,   20,    0,   86,    4, 2765,    0],
       [  64,    9,    0,   34,    0,   94,    0]], dtype=int64)

loss:0.9691317024858053	 accuracy:0.6828093306288032


## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [34]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

classifier_performance(gnb, X, y)

training loss:9.660625358034274	 validating loss:10.117222675862747
training accuracy:0.17055934083346538	 validating accuracy:0.15781467866649765
Confusion matrix:


array([[ 527,   42,    4,   23, 1132,   26,  645],
       [ 154,   37,    4,   14,  324,   19,  186],
       [  24,    5,   40,   48,  208,   16,   89],
       [  13,    8,   36,  211,  524,   30,  160],
       [   0,    0,    0,    9,  108,    7,    4],
       [ 145,   42,   89,  192, 1944,  268,  386],
       [  15,    6,    1,   12,   54,    4,   54]], dtype=int64)

training loss:12.918177933585676	 validating loss:13.382202406166483
training accuracy:0.12913959752812548	 validating accuracy:0.12663201926733425
Confusion matrix:


array([[  97,  321,   43,   32, 1052,   51,  567],
       [   7,  174,   12,   11,  232,   12,   65],
       [   4,   28,   48,   51,  213,   16,   88],
       [   4,   31,   52,  169,  561,   31,  197],
       [   2,    5,    1,    5,  108,   10,   17],
       [  12,  227,   98,  203, 2097,  249,  449],
       [   3,    7,    3,   13,   53,    4,  154]], dtype=int64)

training loss:10.017706600449246	 validating loss:11.093700743446883
training accuracy:0.16884804309935034	 validating accuracy:0.13144885283305868
Confusion matrix:


array([[ 343,  114,   23,   54, 1181,   88,  621],
       [  62,   46,   19,   12,  317,   23,  130],
       [   3,    8,   71,   64,  224,   24,   95],
       [   7,    3,   91,  178,  551,   37,  144],
       [   3,    0,    0,    3,   59,    6,    6],
       [  85,   56,  153,  202, 1972,  280,  355],
       [   7,    8,    6,   14,   77,    4,   60]], dtype=int64)

training loss:12.065742239223294	 validating loss:13.062874120110456
training accuracy:0.13170654412929805	 validating accuracy:0.13525161617442008
Confusion matrix:


array([[ 276,  325,   29,   55, 1354,   86,  172],
       [  53,  129,    8,   13,  259,   11,   67],
       [  16,   10,   61,   43,  272,    8,  111],
       [  14,   13,   69,  176,  624,   33,  109],
       [   7,   13,    8,   10,  164,    5,   11],
       [  50,  182,  110,  165, 2147,  181,  196],
       [   4,    9,   14,    7,  129,    1,   80]], dtype=int64)

training loss:10.554333319913015	 validating loss:10.499130838152825
training accuracy:0.14044872607428063	 validating accuracy:0.12677484787018256
Confusion matrix:


array([[ 287,   96,  302,   39,  897,   80,  685],
       [  27,   67,   78,    9,  231,   35,  160],
       [   7,   12,  154,   17,  170,   24,   64],
       [   5,    4,  236,  135,  370,   14,  187],
       [   0,    0,    6,   16,   76,    2,   31],
       [  42,   36,  422,  166, 1804,  212,  482],
       [   4,    3,   25,    7,   89,    4,   69]], dtype=int64)

loss:10.499130838152825	 accuracy:0.12677484787018256


## Neural network

We don't use cross validation for neural network

In [35]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit

# train-validation split

gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=42)
for train_idx, test_idx in gss.split(X, y, groups):
    X_train, X_validate = X[train_idx,:], X[test_idx,:]
    y_train, y_validate = y[train_idx], y[test_idx]

parameters = {'learning_rate_init':np.arange(1e-4, 1e-2, 1e-3),
              'alpha': np.arange(1e-5,1e-3,1e-4),
              'learning_rate':['constant', 'invscaling', 'adaptive']}

ann = MLPClassifier(hidden_layer_sizes=(25,), activation = 'relu', solver = 'adam', max_iter=500, random_state=42)

lr_ann = RandomizedSearchCV(ann, parameters, random_state=0,  scoring = "neg_log_loss", n_iter = 5)
search_ann = lr_ann.fit(X_train, y_train)

In [37]:
best_estimator = search_ann.best_estimator_
best_score = search_ann.best_score_
best_param = search_ann.best_params_

In [38]:
best_param

{'learning_rate_init': 0.0011, 'learning_rate': 'invscaling', 'alpha': 0.00071}

In [76]:
proba_train = best_estimator.predict_proba(X_train)
proba_val = best_estimator.predict_proba(X_validate)

loss_train = log_loss(y_train, proba_train)
loss_val = log_loss(y_validate, proba_val)

acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))

print(f"training loss:{loss_train}\t validating loss:{loss_val}")

print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

# Confusion matrix
print('Confusion matrix:')
display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))

training loss:0.7768802163017411	 validating loss:1.0472762670038298
training accuracy:0.7273501942872724	 validating accuracy:0.6436770235503662
Confusion matrix:


array([[1970,  197,   20,  128,    4,  381,   27],
       [ 389,  246,   15,   46,    0,  118,    2],
       [  83,   42,   61,  179,    2,  239,    4],
       [ 220,   54,   52,  730,   24,  231,   14],
       [  22,   11,    4,   47,   20,  106,    0],
       [ 411,   53,   37,  115,   16, 3472,    7],
       [ 131,   22,    8,   57,    0,   83,    6]], dtype=int64)

### Train the best model with the whole dataset

In [12]:
best_param = {'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}
best_estimator = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False, **best_param)
best_estimator.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.1775,
              max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=40, n_jobs=-1,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...)

In [13]:
proba = best_estimator.predict_proba(X_test)

In [14]:
result = pd.DataFrame({'id':df_test['id'].values, 'drive':proba[:,0], 'passenger':proba[:,1], 'bus':proba[:,2],'subway':proba[:,3],\
                      'bike':proba[:,4],'walk':proba[:,5],'other':proba[:,1],})
result.to_csv('xgboost_xgboost_test.csv',index=False)