In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
# Load the preprocessed dataset; later cells take features, labels and groups from `df`.
df = pd.read_csv('data/data_processed.csv')

## Feature selection and encoding

The features are chosen based on correlation analysis and our understanding of the problem.

In [3]:
# Convert the categorical features into a one-hot encoding.

# Columns currently fed to the models.
selected_features = [
    'travel_date_dow',
    'o_purpose_category',
    'd_purpose_category',
    'num_hh_travelers',
    'num_travelers',
    'o_congestion',
    'd_congestion',
    'license',
    'industry',
    'num_bicycles',
    'num_vehicles',
    'res_type',
    'income_aggregate',
    'disability',
    'trip_distance',
]
# Candidate columns that are currently disabled:
#   num_non_hh_travelers, o_location_type, d_location_type, age, employment,
#   student, planning_apps, gender, education, survey_language, rent_own,
#   num_people, num_adults, num_kids, num_workers, num_students

df_selected = df[selected_features]

# Every column that is treated as categorical, whether or not it is selected.
categorial_columns = [
    'travel_date_dow',
    'o_purpose_category',
    'd_purpose_category',
    'o_location_type',
    'd_location_type',
    'age',
    'employment',
    'license',
    'planning_apps',
    'industry',
    'gender',
    'survey_language',
    'res_type',
    'rent_own',
    'disability',
]

# Only the categorical columns that were actually selected are encoded; they
# keep the order in which they appear in `categorial_columns`.
columns_to_encode = [col for col in categorial_columns if col in selected_features]
onehot = pd.get_dummies(df_selected, columns=columns_to_encode)

In [13]:
# Travel-mode labels; a label's position in this array is its integer class id.
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])

# Label -> class id lookup. It is derived from `classes` so the two can never
# drift apart (drive=0, passenger=1, ..., other=6).
str_to_val = {label: code for code, label in enumerate(classes.tolist())}

# Encode the target with `map` rather than `replace`: `replace` silently keeps
# unknown labels as strings (yielding an object array), whereas `map` turns
# them into NaN so they can be detected and reported here.
y_codes = df['mode'].map(str_to_val)
if y_codes.isna().any():
    unknown = sorted(df.loc[y_codes.isna(), 'mode'].astype(str).unique())
    raise ValueError(f"unmapped values in df['mode']: {unknown}")
y = y_codes.astype('int64').to_numpy()

# Request a float matrix explicitly: recent pandas versions emit boolean dummy
# columns, and mixing those with numeric columns would otherwise produce an
# object-dtype array.
X = onehot.to_numpy(dtype=float)

## Model selection

In [None]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

# Group rows by `person_id` so that GroupKFold never places rows of the same
# person in both the training and the validation part of a split.
groups = df['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGBoost

The hyperparameter ranges tried are partly taken from [this XGBoost tuning guide](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html).

### cross validation

In [None]:
import xgboost as xgb

# Base classifier; the random search below varies the tuned hyperparameters.
xgbo = xgb.XGBClassifier(
    n_jobs=-1,
    random_state=42,
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
)

# Candidate values the random search samples from.
distributions = {
    'n_estimators': np.arange(10, 50, 10),
    'max_depth': np.arange(5, 20, 1),
    'learning_rate': np.arange(0.0005, 0.3, 0.0005),
}

# Ten random candidates, scored by negative log loss on the grouped folds.
lr_xgbo = RandomizedSearchCV(
    xgbo,
    distributions,
    random_state=0,
    scoring="neg_log_loss",
    n_iter=10,
    cv=group_kfold,
)

# `groups` is forwarded to the GroupKFold splitter.
search_xgbo = lr_xgbo.fit(X, y, groups=groups)

In [26]:
import xgboost as xgb
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

# XGBoost model with a fixed configuration, evaluated fold by fold on the
# grouped split.
clf = xgb.XGBClassifier(
    n_estimators=30,
    max_depth=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
    objective="multi:softprob",
    eval_metric="mlogloss",
    use_label_encoder=False,
)

for train_index, validate_index in group_kfold.split(X, y, groups):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]

    clf.fit(X_train, y_train)
    proba = clf.predict_proba(X_validate)
    # Predict once and reuse the result for both the accuracy and the
    # confusion matrix instead of running inference twice per fold.
    y_pred = clf.predict(X_validate)
    loss = log_loss(y_validate, proba)

    print(f"cross entropy loss:{loss}")

    print(f"accuracy:{accuracy_score(y_validate, y_pred)}")
    # Confusion matrix (rows: true class, columns: predicted class)
    print('Confusion matrix:')
    display(confusion_matrix(y_validate, y_pred))

cross entropy loss:0.9412916558006671
accuracy:0.6673849664089239
Confusion matrix:


array([[1873,   21,   18,  103,    3,  346,   35],
       [ 458,  107,    4,   21,    1,   97,   50],
       [  58,   21,   32,  162,    2,  154,    1],
       [ 139,   36,   55,  567,    6,  178,    1],
       [  14,    6,    0,   64,    1,   43,    0],
       [ 217,   25,   31,  105,    3, 2682,    3],
       [  27,   12,    6,   42,    1,   55,    3]], dtype=int64)

cross entropy loss:0.9616968244548412
accuracy:0.6776524274305996
Confusion matrix:


array([[1634,   75,   11,   94,    3,  255,   91],
       [ 244,  167,    8,   16,    0,   77,    1],
       [  69,   28,   46,  126,    4,  172,    3],
       [ 111,   54,   42,  634,    6,  193,    5],
       [  22,    5,    3,   45,   17,   56,    0],
       [ 314,   37,   49,   84,    1, 2840,   10],
       [ 109,   37,    5,   35,    2,   41,    8]], dtype=int64)

cross entropy loss:0.9500715818630605
accuracy:0.6865255418937761
Confusion matrix:


array([[1856,   73,   13,  106,    8,  357,   11],
       [ 271,  180,   10,   26,    1,  119,    2],
       [  79,   33,   53,  158,    1,  163,    2],
       [ 133,   32,   42,  612,    8,  181,    3],
       [  12,    1,    5,    8,    5,   45,    1],
       [ 222,   34,   28,  105,    4, 2707,    3],
       [  46,   24,    5,   59,    0,   39,    3]], dtype=int64)

cross entropy loss:1.0468883413709544
accuracy:0.6559766763848397
Confusion matrix:


array([[1691,   93,   22,  107,    4,  372,    8],
       [ 214,  193,   15,   25,    0,   91,    2],
       [  98,   27,   69,  161,    0,  162,    4],
       [ 147,   29,   57,  601,    4,  194,    6],
       [  40,    2,   10,   48,    1,  116,    1],
       [ 240,   33,   23,  110,    2, 2619,    4],
       [  90,   16,    8,   57,    0,   72,    1]], dtype=int64)

cross entropy loss:0.9635069462268577
accuracy:0.672920892494929
Confusion matrix:


array([[1858,   78,   17,  142,    6,  282,    3],
       [ 293,  189,    2,   17,    0,  104,    2],
       [  70,   16,   47,  140,    0,  172,    3],
       [ 178,   37,   53,  501,   15,  164,    3],
       [  30,    9,    0,   39,   17,   32,    4],
       [ 297,   29,   29,   94,   16, 2694,    5],
       [  58,   16,    6,   30,    1,   88,    2]], dtype=int64)

### performance

In [None]:
# Pull the winning configuration out of the finished random search.
best_estimator = search_xgbo.best_estimator_  # model with the best-scoring hyperparameters
best_score = search_xgbo.best_score_  # mean CV score; negative log loss, so closer to 0 is better
best_param = search_xgbo.best_params_  # hyperparameter values of the best candidate

Define a performance-evaluation function for later use.

In [None]:
def classifier_performance(best_estimator):
    """Print grouped cross-validation performance of a classifier.

    For every split produced by the module-level ``group_kfold`` over ``X``,
    ``y`` and ``groups``, the estimator is refit on the training part. The
    training/validation log loss and accuracy are printed, followed by the
    validation confusion matrix. After the last fold the mean validation loss
    and mean validation accuracy over all folds are printed.

    Parameters
    ----------
    best_estimator : object
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refit in place on each fold, so when the function returns it holds
        the model trained on the last fold's training data.

    Returns
    -------
    None
    """
    # Per-fold validation metrics. These must be created before the loop;
    # creating them inside it would discard every fold but the last.
    loss = []
    acc = []

    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]

        best_estimator.fit(X_train, y_train)

        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)

        # Predict once per subset and reuse the labels below.
        pred_train = best_estimator.predict(X_train)
        pred_val = best_estimator.predict(X_validate)

        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)

        acc_train = accuracy_score(y_train, pred_train)
        acc_val = accuracy_score(y_validate, pred_val)
        acc.append(acc_val)

        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

        # Confusion matrix (rows: true class, columns: predicted class)
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, pred_val))

    # Plain lists have no .mean(); average with numpy. Both values are
    # interpolated so the numbers, not the expression text, are printed.
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [None]:
# Report the fold-by-fold performance of the tuned XGBoost model.
classifier_performance(best_estimator)

## Naive Bayes

Gaussian naive Bayes has almost no hyperparameters to tune (only `var_smoothing`), so we simply evaluate it with cross validation.

### Cross validation

In [None]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

# Per-fold validation metrics. They are created once, before the loop, so
# that every fold contributes to the averages printed at the end.
loss = []
acc = []

for train_index, validate_index in group_kfold.split(X, y, groups):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]

    gnb.fit(X_train, y_train)

    proba_train = gnb.predict_proba(X_train)
    proba_val = gnb.predict_proba(X_validate)

    # Predict once per subset and reuse the labels below.
    pred_train = gnb.predict(X_train)
    pred_val = gnb.predict(X_validate)

    loss_train = log_loss(y_train, proba_train)
    loss_val = log_loss(y_validate, proba_val)
    loss.append(loss_val)

    acc_train = accuracy_score(y_train, pred_train)
    acc_val = accuracy_score(y_validate, pred_val)
    acc.append(acc_val)

    print(f"training loss:{loss_train}\t validating loss:{loss_val}")

    print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

    # Confusion matrix of the naive Bayes predictions (rows: true class,
    # columns: predicted class).
    print('Confusion matrix:')
    display(confusion_matrix(y_validate, pred_val))

# Plain lists have no .mean(); average with numpy and interpolate both values.
print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

### Train the best model with the whole dataset

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

# Refit the selected estimator on all rows of X and y (no validation split here).
best_estimator.fit(X, y)