In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
df = pd.read_csv('data/data_processed.csv')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [3]:
# covert categorial features into one-hot encoding
selected_features = [
     'travel_date_dow',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
     'industry',
#      'gender',
#      'education',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
     'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
     'disability',
     'trip_distance'
]

df_selected = df[selected_features]

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(df_selected, columns=[x for x in categorial_columns if x in selected_features])

In [4]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df['mode'].replace(str_to_val).to_numpy()
X = onehot.to_numpy()

## Model selection

In [5]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [31]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

### performance

In [33]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

NameError: name 'search_xgbo' is not defined

Define performance evaluation function for later use

In [45]:
def classifier_performance(best_estimator):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [40]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}

In [46]:
classifier_performance(best_estimator)

training loss:0.7684913363882653	 validating loss:0.8743614894932351
training accuracy:0.7263825067342735	 validating accuracy:0.678793256433008
Confusion matrix:


array([[1896,   26,    7,  108,    2,  359,    1],
       [ 485,  135,    4,   12,    0,  101,    1],
       [  57,   23,   30,  158,    1,  161,    0],
       [ 126,   52,   27,  604,    0,  173,    0],
       [  14,    7,    0,   67,    1,   39,    0],
       [ 227,   30,   16,  104,    2, 2687,    0],
       [  29,   19,    1,   42,    1,   52,    2]], dtype=int64)

training loss:0.7674487371250309	 validating loss:0.9062359865510708
training accuracy:0.7247662810964982	 validating accuracy:0.6886804411205476
Confusion matrix:


array([[1675,   77,    3,   89,    2,  254,   63],
       [ 246,  178,    5,   14,    0,   69,    1],
       [  74,   36,   34,  129,    0,  174,    1],
       [ 112,   60,   27,  647,    2,  197,    0],
       [  14,    7,    1,   55,    3,   68,    0],
       [ 304,   41,   22,   80,    0, 2888,    0],
       [ 114,   30,    4,   35,    1,   45,    8]], dtype=int64)

training loss:0.7612042850712493	 validating loss:0.8916342873520566
training accuracy:0.7263825067342735	 validating accuracy:0.691469134237546
Confusion matrix:


array([[1851,   62,   14,  102,    5,  379,   11],
       [ 260,  198,    8,   20,    1,  121,    1],
       [  74,   33,   42,  169,    1,  170,    0],
       [ 123,   53,   34,  620,    5,  176,    0],
       [  14,    1,    0,   10,    3,   49,    0],
       [ 223,   23,   17,   98,    1, 2740,    1],
       [  51,   23,    6,   60,    0,   35,    1]], dtype=int64)

training loss:0.7419027612509917	 validating loss:1.0116730029822825
training accuracy:0.7361432419584852	 validating accuracy:0.6571175053872481
Confusion matrix:


array([[1689,   94,    5,  110,    3,  389,    7],
       [ 221,  190,    4,   24,    1,   98,    2],
       [  90,   39,   49,  176,    0,  161,    6],
       [ 137,   36,   36,  615,    2,  208,    4],
       [  42,    2,    3,   51,    0,  120,    0],
       [ 220,   48,   26,   98,    0, 2638,    1],
       [  92,   20,    2,   59,    0,   68,    3]], dtype=int64)

training loss:0.7609750109368534	 validating loss:0.9189487922696763
training accuracy:0.7298136645962733	 validating accuracy:0.6806541582150102
Confusion matrix:


array([[1840,   90,    8,  145,    2,  299,    2],
       [ 287,  194,    2,   15,    0,  107,    2],
       [  66,   34,   45,  128,    0,  175,    0],
       [ 164,   46,   37,  538,    8,  157,    1],
       [  23,    8,    0,   41,   21,   38,    0],
       [ 281,   33,   15,   93,    9, 2731,    2],
       [  57,   21,    5,   30,    0,   88,    0]], dtype=int64)

loss:0.9189487922696763	 accuracy:0.6806541582150102


## Random forest

### hyperparameters search

In [55]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 50, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [56]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [57]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 16,
 'n_estimators': 55}

In [58]:
classifier_performance(best_estimator)

training loss:0.8248800571157937	 validating loss:0.9332547615909293
training accuracy:0.7190302646173348	 validating accuracy:0.679807326657371
Confusion matrix:


array([[1913,    7,    1,  119,    0,  359,    0],
       [ 509,   91,    0,   19,    0,  119,    0],
       [  74,   17,    3,  167,    0,  169,    0],
       [ 141,   33,    4,  621,    0,  183,    0],
       [  14,    6,    0,   63,    0,   45,    0],
       [ 226,   16,    1,   88,    0, 2735,    0],
       [  32,   11,    1,   45,    0,   57,    0]], dtype=int64)

training loss:0.8185719295592064	 validating loss:0.9435579512450608
training accuracy:0.7133893202345112	 validating accuracy:0.6959056914691343
Confusion matrix:


array([[1767,   42,    0,   80,    0,  274,    0],
       [ 264,  144,    0,   13,    0,   92,    0],
       [  81,   27,    4,  140,    0,  196,    0],
       [ 132,   49,    3,  644,    0,  217,    0],
       [  19,    6,    0,   57,    0,   66,    0],
       [ 303,   30,    0,   71,    0, 2931,    0],
       [ 136,   17,    0,   36,    0,   48,    0]], dtype=int64)

training loss:0.8144111872542065	 validating loss:0.9419571537957047
training accuracy:0.7190619553161147	 validating accuracy:0.6915958930155913
Confusion matrix:


array([[1880,   29,    0,  104,    0,  411,    0],
       [ 295,  158,    0,   22,    0,  134,    0],
       [  80,   29,    2,  175,    0,  203,    0],
       [ 134,   34,    2,  623,    0,  218,    0],
       [  14,    1,    0,   10,    0,   52,    0],
       [ 209,   12,    0,   89,    0, 2793,    0],
       [  54,   11,    0,   64,    0,   47,    0]], dtype=int64)

training loss:0.7988670513301406	 validating loss:1.0520815516178068
training accuracy:0.7244176834099192	 validating accuracy:0.659779439726201
Confusion matrix:


array([[1732,   52,    0,  105,    0,  408,    0],
       [ 239,  162,    0,   25,    0,  114,    0],
       [  95,   29,    3,  199,    0,  195,    0],
       [ 143,   27,    0,  631,    0,  237,    0],
       [  48,    0,    0,   53,    0,  117,    0],
       [ 233,   31,    0,   90,    0, 2677,    0],
       [  94,   15,    0,   53,    0,   82,    0]], dtype=int64)

training loss:0.8130402369709366	 validating loss:0.9650992631159193
training accuracy:0.7143174039802256	 validating accuracy:0.6838235294117647
Confusion matrix:


array([[1897,   36,    2,  121,    0,  330,    0],
       [ 311,  161,    0,   10,    0,  125,    0],
       [  67,   26,    3,  140,    0,  212,    0],
       [ 179,   39,    2,  550,    0,  181,    0],
       [  24,    6,    0,   54,    0,   47,    0],
       [ 266,   20,    0,   95,    0, 2783,    0],
       [  66,    9,    0,   36,    0,   90,    0]], dtype=int64)

loss:0.9650992631159193	 accuracy:0.6838235294117647


## Bagging

Try decisiontree and svm as base classifiers

### hyperparameters search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

bag = BaggingClassifier(random_state=42)

distributions = {
    "base_estimator": [DecisionTreeClassifier(),SVC()],
    "n_estimators": np.arange(10,100,10),
    "max_samples": np.arange(0.1,1.0,0.1),
    "max_features": np.arange(5,15)
}

lr_bag = RandomizedSearchCV(bag, distributions, random_state=0,  scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_bag = lr_bag.fit(X, y, groups = groups)

### performance

In [None]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [None]:
best_param

In [None]:
classifier_performance(best_estimator)

## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [49]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

for train_index, validate_index in group_kfold.split(X, y, groups):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]

    # loss and accuracy
    loss = []
    acc = []
    
    gnb.fit(X_train, y_train)

    proba_train = gnb.predict_proba(X_train)
    proba_val = gnb.predict_proba(X_validate)

    loss_train = log_loss(y_train, proba_train)
    loss_val = log_loss(y_validate, proba_val)
    loss.append(loss_val)

    acc_train = accuracy_score(y_train, gnb.predict(X_train))
    acc_val = accuracy_score(y_validate, gnb.predict(X_validate))
    acc.append(acc_val)

    print(f"training loss:{loss_train}\t validating loss:{loss_val}")

    print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
    # Confusion matrix
    print('Confusion matrix:')
    display(confusion_matrix(y_validate, gnb.predict(X_validate)))

print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

training loss:17.485942507541232	 validating loss:18.1379970980222
training accuracy:0.12492473459039771	 validating accuracy:0.10229433388262137
Confusion matrix:


array([[ 363,   36,   23,    5, 1570,   18,  384],
       [  75,   19,    5,    6,  505,   11,  117],
       [  31,    3,   18,   28,  300,    9,   41],
       [  12,    3,   14,   80,  764,   21,   88],
       [   0,    0,    0,   10,  111,    6,    1],
       [  80,   23,   48,  107, 2429,  185,  194],
       [  12,    3,    1,    6,   87,    6,   31]], dtype=int64)

training loss:19.850054294354518	 validating loss:20.513092865587502
training accuracy:0.08128664237046426	 validating accuracy:0.0759285080491824
Confusion matrix:


array([[  90,  148,   22,   23, 1243,   27,  610],
       [   8,   79,   17,    6,  290,   16,   97],
       [   3,   11,   14,   24,  279,    5,  112],
       [   4,    8,   19,   59,  758,    5,  192],
       [   2,    0,    0,    5,  118,    0,   23],
       [  22,   79,   41,   89, 2576,  106,  422],
       [   0,    2,    3,   10,   87,    2,  133]], dtype=int64)

training loss:17.904354892957322	 validating loss:19.496134453009095
training accuracy:0.11608302963080336	 validating accuracy:0.0872100392952212
Confusion matrix:


array([[ 326,   91,   71,   28, 1523,  103,  282],
       [  42,   31,   49,    4,  414,   15,   54],
       [   3,    4,   66,   22,  342,   13,   39],
       [   9,    2,   74,   50,  796,   17,   63],
       [   4,    0,    0,    1,   53,   16,    3],
       [  85,   40,  196,   45, 2473,  140,  124],
       [   6,    8,   12,    5,  122,    1,   22]], dtype=int64)

training loss:19.477417773644916	 validating loss:19.603181415183315
training accuracy:0.10591031532245286	 validating accuracy:0.11243503612625175
Confusion matrix:


array([[ 306,  234,   57,   53, 1385,   54,  208],
       [  49,   81,    8,   19,  335,    8,   40],
       [  19,    6,   27,   17,  348,    8,   96],
       [  15,    8,   48,  119,  764,   19,   65],
       [   7,   10,    4,   11,  175,    6,    5],
       [  63,  110,   92,  116, 2258,  159,  233],
       [   5,    5,    7,    5,  201,    1,   20]], dtype=int64)

training loss:18.536038087528713	 validating loss:18.847581331060834
training accuracy:0.0765623019394093	 validating accuracy:0.057682555780933065
Confusion matrix:


array([[ 163,   47,   32,   12, 1558,    8,  566],
       [  16,   19,   28,    1,  386,    1,  156],
       [   7,    4,   39,    8,  253,   10,  127],
       [   2,    1,   49,   46,  586,    3,  264],
       [   1,    0,    2,    0,  100,    0,   28],
       [  23,   16,  128,   61, 2338,   30,  568],
       [   1,    2,   21,    3,  114,    2,   58]], dtype=int64)

loss:18.847581331060834	 accuracy:0.057682555780933065


### Train the best model with the whole dataset

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

best_estimator.fit(X, y)