In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [15]:
df = pd.read_csv('data/data_processed.csv')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [16]:
# covert categorial features into one-hot encoding
selected_features = [
     'travel_date_dow',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
     'industry',
#      'gender',
#      'education',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
     'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
#      'disability',
#      'trip_distance',
     'trip_distance_category',
]

df_selected = df[selected_features]
df_selected.loc[:,'trip_distance_category'] = df_selected['trip_distance_category'].replace({"short":0, "medium":1, "long":2})

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(df_selected, columns=[x for x in categorial_columns if x in selected_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [17]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df['mode'].replace(str_to_val).to_numpy()
X = onehot.to_numpy()

## Model selection

In [18]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [6]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

KeyboardInterrupt: 

### performance

In [16]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

Define performance evaluation function for later use

In [8]:
def classifier_performance(best_estimator):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [18]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}

In [9]:
classifier_performance(best_estimator)

training loss:0.8078737196742879	 validating loss:0.9120875205960333
training accuracy:0.7119315480906354	 validating accuracy:0.6599061985042464
Confusion matrix:


array([[1879,   24,    8,   85,    1,  401,    1],
       [ 489,  111,    1,   13,    0,  123,    1],
       [  53,   19,   27,  143,    1,  187,    0],
       [ 125,   49,   34,  529,    1,  244,    0],
       [  12,    8,    0,   47,    2,   59,    0],
       [ 250,   49,   15,   91,    2, 2658,    1],
       [  19,   18,    1,   42,    0,   66,    0]], dtype=int64)

training loss:0.8066418215780601	 validating loss:0.9400890618869637
training accuracy:0.7085723340199651	 validating accuracy:0.6670046900747877
Confusion matrix:


array([[1663,   80,    5,   75,    1,  278,   61],
       [ 240,  152,    6,   12,    0,  102,    1],
       [  66,   29,   41,  117,    0,  195,    0],
       [ 114,   49,   47,  566,    0,  269,    0],
       [  21,    6,    1,   44,    4,   72,    0],
       [ 338,   49,   20,   99,    0, 2829,    0],
       [ 119,   22,    2,   38,    0,   49,    7]], dtype=int64)

training loss:0.7980519462778817	 validating loss:0.9296593008401661
training accuracy:0.7158295040405641	 validating accuracy:0.6771453923184181
Confusion matrix:


array([[1832,   61,    2,   90,    4,  424,   11],
       [ 259,  184,    5,   19,    6,  136,    0],
       [  69,   31,   32,  157,    7,  193,    0],
       [ 122,   49,   31,  554,    0,  255,    0],
       [  12,    0,    0,    6,    2,   56,    1],
       [ 216,   31,   12,  101,    5, 2737,    1],
       [  50,   25,    5,   61,    0,   34,    1]], dtype=int64)

training loss:0.7839982146570482	 validating loss:1.0368645929002214
training accuracy:0.7196323878941531	 validating accuracy:0.6491317023703892
Confusion matrix:


array([[1693,   84,    4,   88,    3,  424,    1],
       [ 224,  184,    7,   17,    0,  108,    0],
       [  77,   35,   61,  151,    0,  196,    1],
       [ 135,   33,   42,  559,    3,  263,    3],
       [  34,    0,    2,   36,    3,  143,    0],
       [ 243,   56,   17,   95,    1, 2617,    2],
       [  97,   24,    4,   53,    0,   62,    4]], dtype=int64)

training loss:0.8032316415924718	 validating loss:0.9420689532351935
training accuracy:0.7139371276460895	 validating accuracy:0.6744421906693712
Confusion matrix:


array([[1853,   65,    6,  101,    1,  358,    2],
       [ 300,  174,    1,   12,    0,  119,    1],
       [  62,   24,   56,  122,    0,  183,    1],
       [ 164,   38,   49,  499,    7,  193,    1],
       [  22,    8,    0,   56,    3,   42,    0],
       [ 279,   35,   14,   90,    9, 2735,    2],
       [  62,   18,    6,   28,    0,   87,    0]], dtype=int64)

loss:0.9420689532351935	 accuracy:0.6744421906693712


## Random forest

### hyperparameters search

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 50, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [21]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [22]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 6,
 'n_estimators': 75}

In [11]:
classifier_performance(best_estimator)

training loss:0.7285994731800532	 validating loss:0.9426608635083578
training accuracy:0.7412454444620504	 validating accuracy:0.6632019267334263
Confusion matrix:


array([[1904,   12,    3,   87,    0,  393,    0],
       [ 504,   81,    0,   18,    0,  135,    0],
       [  66,   18,   14,  160,    0,  172,    0],
       [ 133,   30,   13,  574,    0,  232,    0],
       [  14,    4,    0,   48,    0,   62,    0],
       [ 259,   26,    5,  116,    1, 2659,    0],
       [  28,   10,    0,   44,    0,   64,    0]], dtype=int64)

training loss:0.7222845895396695	 validating loss:0.957966634313351
training accuracy:0.7409919188718112	 validating accuracy:0.6847509190011408
Confusion matrix:


array([[1784,   38,    3,   77,    0,  258,    3],
       [ 259,  133,    0,   16,    0,  105,    0],
       [  72,   25,   17,  136,    1,  197,    0],
       [ 115,   39,   14,  615,    0,  262,    0],
       [  25,    6,    0,   39,    4,   74,    0],
       [ 343,   31,    5,  106,    0, 2849,    1],
       [ 127,   11,    0,   42,    0,   57,    0]], dtype=int64)

training loss:0.7218147641860361	 validating loss:0.9539374794895784
training accuracy:0.7448581841229599	 validating accuracy:0.6766383572062366
Confusion matrix:


array([[1871,   31,    0,   81,    1,  436,    4],
       [ 290,  145,    2,   23,    0,  149,    0],
       [  70,   26,   11,  174,    0,  208,    0],
       [ 133,   29,   10,  581,    0,  258,    0],
       [  12,    0,    0,    6,    0,   59,    0],
       [ 244,   16,    5,  108,    0, 2730,    0],
       [  55,   19,    0,   62,    0,   40,    0]], dtype=int64)

training loss:0.705390526114108	 validating loss:1.057668971774758
training accuracy:0.7477103470131516	 validating accuracy:0.6443148688046647
Confusion matrix:


array([[1720,   55,    2,   92,    0,  428,    0],
       [ 240,  161,    3,   23,    0,  113,    0],
       [  90,   22,   15,  186,    0,  208,    0],
       [ 143,   24,   19,  581,    0,  271,    0],
       [  45,    0,    1,   34,    1,  137,    0],
       [ 284,   27,    1,  114,    0, 2605,    0],
       [  95,   12,    1,   63,    0,   73,    0]], dtype=int64)

training loss:0.7229635221974636	 validating loss:0.9727892802016374
training accuracy:0.7405247813411079	 validating accuracy:0.6730476673427992
Confusion matrix:


array([[1887,   42,    5,   89,    1,  362,    0],
       [ 309,  143,    0,    8,    0,  147,    0],
       [  68,   21,   10,  149,    0,  200,    0],
       [ 167,   32,   11,  537,    3,  201,    0],
       [  26,    7,    0,   38,    5,   55,    0],
       [ 287,   23,    6,  114,    7, 2727,    0],
       [  65,    7,    1,   34,    0,   94,    0]], dtype=int64)

loss:0.9727892802016374	 accuracy:0.6730476673427992


## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [12]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

classifier_performance(gnb)

training loss:10.859964709685467	 validating loss:11.376391676883683
training accuracy:0.15116463318016163	 validating accuracy:0.14082900240841678
Confusion matrix:


array([[ 461,   38,    5,   14, 1252,   19,  610],
       [ 132,   35,    4,   11,  355,   20,  181],
       [  20,    5,   33,   46,  225,   10,   91],
       [   9,    6,   33,  196,  553,   22,  163],
       [   0,    0,    0,   11,  108,    5,    4],
       [ 123,   35,   90,  159, 2020,  226,  413],
       [  13,    4,    1,   13,   60,    3,   52]], dtype=int64)

training loss:14.823649358042017	 validating loss:15.219015566267347
training accuracy:0.1095864363809222	 validating accuracy:0.10698440867030042
Confusion matrix:


array([[  40,  333,   53,   33, 1111,   35,  558],
       [   5,  156,   10,   10,  254,   12,   66],
       [   1,   27,   32,   39,  224,   13,  112],
       [   2,   27,   36,  148,  606,   16,  210],
       [   2,    3,    2,    5,  114,    2,   20],
       [   6,  201,   95,  129, 2204,  204,  496],
       [   0,    6,    3,   12,   63,    3,  150]], dtype=int64)

training loss:11.379321738368045	 validating loss:12.503909552297303
training accuracy:0.14973855173506576	 validating accuracy:0.11573076435543161
Confusion matrix:


array([[ 296,  112,   24,   51, 1270,   76,  595],
       [  43,   39,   15,   12,  335,   28,  137],
       [   3,    8,   63,   55,  241,   18,  101],
       [   5,    4,   87,  164,  575,   29,  147],
       [   3,    0,    0,    2,   62,    4,    6],
       [  71,   47,  150,  150, 2064,  234,  387],
       [   2,   10,    6,   12,   86,    5,   55]], dtype=int64)

training loss:13.798238684968434	 validating loss:14.65201213332427
training accuracy:0.11177309459673586	 validating accuracy:0.11699835213588541
Confusion matrix:


array([[ 221,  338,   26,   54, 1424,   71,  163],
       [  43,  114,    7,   12,  294,    5,   65],
       [  15,    6,   46,   29,  295,    5,  125],
       [  12,    9,   62,  150,  671,   26,  108],
       [   7,   11,    8,   10,  164,    6,   12],
       [  39,  166,  103,  121, 2228,  153,  221],
       [   4,    8,   11,    6,  139,    1,   75]], dtype=int64)

training loss:11.981576611542009	 validating loss:11.95016127863398
training accuracy:0.12038914944859931	 validating accuracy:0.10927991886409737
Confusion matrix:


array([[ 235,   92,  298,   37, 1001,   43,  680],
       [  23,   53,   74,    7,  273,   23,  154],
       [   6,    7,  155,   15,  184,   19,   62],
       [   5,    2,  233,  111,  414,    8,  178],
       [   0,    0,    3,   13,   82,    0,   33],
       [  35,   35,  407,  114, 1917,  162,  494],
       [   2,    2,   24,    6,   98,    5,   64]], dtype=int64)

loss:11.95016127863398	 accuracy:0.10927991886409737


## Neural network

We don't use cross validation for neural network

In [13]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit

# train-validation split

gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=42)
for train_idx, test_idx in gss.split(X, y, groups):
    X_train, X_validate = X[train_idx,:], X[test_idx,:]
    y_train, y_validate = y[train_idx], y[test_idx]

parameters = {'learning_rate_init':np.arange(1e-4, 1e-2, 1e-3),
              'alpha': np.arange(1e-5,1e-3,1e-4),
              'learning_rate':['constant', 'invscaling', 'adaptive']}

ann = MLPClassifier(hidden_layer_sizes=(20,), activation = 'relu', solver = 'adam', max_iter=500, random_state=42)

lr_ann = RandomizedSearchCV(ann, parameters, random_state=0,  scoring = "neg_log_loss", n_iter = 5)
search_ann = lr_ann.fit(X_train, y_train)

In [11]:
best_estimator = search_ann.best_estimator_
best_score = search_ann.best_score_
best_param = search_ann.best_params_

In [12]:
best_param

{'learning_rate_init': 0.0011, 'learning_rate': 'invscaling', 'alpha': 0.00071}

In [24]:
proba_train = best_estimator.predict_proba(X_train)
proba_val = best_estimator.predict_proba(X_validate)

loss_train = log_loss(y_train, proba_train)
loss_val = log_loss(y_validate, proba_val)

acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))

print(f"training loss:{loss_train}\t validating loss:{loss_val}")

print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

# Confusion matrix
print('Confusion matrix:')
display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))

training loss:0.802232187215104	 validating loss:1.0583596271213607
training accuracy:0.7119435544345217	 validating accuracy:0.6289333069463685
Confusion matrix:


array([[1936,  187,   25,  143,    5,  423,    8],
       [ 360,  244,   24,   27,    0,  147,   14],
       [  69,   28,   64,  186,    1,  251,   11],
       [ 214,   42,   70,  668,   26,  295,   10],
       [  18,    7,    1,   44,   20,  120,    0],
       [ 390,   69,   42,  171,   20, 3411,    8],
       [ 118,   10,    5,   64,    1,   96,   13]], dtype=int64)

### Train the best model with the whole dataset

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

best_estimator.fit(X, y)