In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
df = pd.read_csv('data/data_processed.csv')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [3]:
# covert categorial features into one-hot encoding
selected_features = [
     'travel_date_dow',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
     'industry',
#      'gender',
#      'education',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
     'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
     'disability',
     'trip_distance',
#      'trip_distance_category',
]

df_selected = df[selected_features]
#df_selected.loc[:,'trip_distance_category'] = df_selected['trip_distance_category'].replace({"short":0, "medium":1, "long":2})

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(df_selected, columns=[x for x in categorial_columns if x in selected_features])

In [4]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df['mode'].replace(str_to_val).to_numpy()
X = onehot.to_numpy()

## Model selection

In [5]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [6]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

### performance

In [7]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

Define performance evaluation function for later use

In [8]:
def classifier_performance(best_estimator):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [9]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}

In [10]:
classifier_performance(best_estimator)

training loss:0.7684913363882653	 validating loss:0.8743614894932351
training accuracy:0.7263825067342735	 validating accuracy:0.678793256433008
Confusion matrix:


array([[1896,   26,    7,  108,    2,  359,    1],
       [ 485,  135,    4,   12,    0,  101,    1],
       [  57,   23,   30,  158,    1,  161,    0],
       [ 126,   52,   27,  604,    0,  173,    0],
       [  14,    7,    0,   67,    1,   39,    0],
       [ 227,   30,   16,  104,    2, 2687,    0],
       [  29,   19,    1,   42,    1,   52,    2]], dtype=int64)

training loss:0.7674487371250309	 validating loss:0.9062359865510708
training accuracy:0.7247662810964982	 validating accuracy:0.6886804411205476
Confusion matrix:


array([[1675,   77,    3,   89,    2,  254,   63],
       [ 246,  178,    5,   14,    0,   69,    1],
       [  74,   36,   34,  129,    0,  174,    1],
       [ 112,   60,   27,  647,    2,  197,    0],
       [  14,    7,    1,   55,    3,   68,    0],
       [ 304,   41,   22,   80,    0, 2888,    0],
       [ 114,   30,    4,   35,    1,   45,    8]], dtype=int64)

training loss:0.7612042850712493	 validating loss:0.8916342873520566
training accuracy:0.7263825067342735	 validating accuracy:0.691469134237546
Confusion matrix:


array([[1851,   62,   14,  102,    5,  379,   11],
       [ 260,  198,    8,   20,    1,  121,    1],
       [  74,   33,   42,  169,    1,  170,    0],
       [ 123,   53,   34,  620,    5,  176,    0],
       [  14,    1,    0,   10,    3,   49,    0],
       [ 223,   23,   17,   98,    1, 2740,    1],
       [  51,   23,    6,   60,    0,   35,    1]], dtype=int64)

training loss:0.7419027612509917	 validating loss:1.0116730029822825
training accuracy:0.7361432419584852	 validating accuracy:0.6571175053872481
Confusion matrix:


array([[1689,   94,    5,  110,    3,  389,    7],
       [ 221,  190,    4,   24,    1,   98,    2],
       [  90,   39,   49,  176,    0,  161,    6],
       [ 137,   36,   36,  615,    2,  208,    4],
       [  42,    2,    3,   51,    0,  120,    0],
       [ 220,   48,   26,   98,    0, 2638,    1],
       [  92,   20,    2,   59,    0,   68,    3]], dtype=int64)

training loss:0.7609750109368534	 validating loss:0.9189487922696763
training accuracy:0.7298136645962733	 validating accuracy:0.6806541582150102
Confusion matrix:


array([[1840,   90,    8,  145,    2,  299,    2],
       [ 287,  194,    2,   15,    0,  107,    2],
       [  66,   34,   45,  128,    0,  175,    0],
       [ 164,   46,   37,  538,    8,  157,    1],
       [  23,    8,    0,   41,   21,   38,    0],
       [ 281,   33,   15,   93,    9, 2731,    2],
       [  57,   21,    5,   30,    0,   88,    0]], dtype=int64)

loss:0.9189487922696763	 accuracy:0.6806541582150102


## Random forest

### hyperparameters search

In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [12]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [13]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 14,
 'n_estimators': 30}

In [14]:
classifier_performance(best_estimator)

training loss:0.8069434604662493	 validating loss:0.9295177752840384
training accuracy:0.7200126762795119	 validating accuracy:0.6765115984281912
Confusion matrix:


array([[1926,    9,    0,  117,    0,  347,    0],
       [ 509,   91,    0,   24,    0,  114,    0],
       [  74,   18,    3,  163,    0,  172,    0],
       [ 138,   37,    1,  614,    0,  192,    0],
       [  11,    5,    0,   63,    0,   49,    0],
       [ 244,   16,    1,  102,    0, 2703,    0],
       [  32,   15,    1,   44,    0,   54,    0]], dtype=int64)

training loss:0.7988278367176339	 validating loss:0.9400654330948389
training accuracy:0.7183647599429568	 validating accuracy:0.6967930029154519
Confusion matrix:


array([[1749,   45,    0,   86,    0,  283,    0],
       [ 256,  148,    0,   13,    0,   96,    0],
       [  84,   31,    6,  140,    1,  186,    0],
       [ 114,   59,    5,  654,    0,  213,    0],
       [  19,    8,    0,   55,    4,   62,    0],
       [ 292,   27,    2,   77,    0, 2936,    1],
       [ 131,   16,    0,   41,    0,   49,    0]], dtype=int64)

training loss:0.8002450324573407	 validating loss:0.9381064382821448
training accuracy:0.7203929646648709	 validating accuracy:0.6857649892255039
Confusion matrix:


array([[1858,   35,    1,  106,    1,  423,    0],
       [ 291,  155,    0,   23,    0,  140,    0],
       [  79,   26,    0,  188,    0,  196,    0],
       [ 134,   37,    1,  633,    0,  206,    0],
       [  12,    1,    0,   13,    0,   51,    0],
       [ 224,   18,    1,   96,    0, 2764,    0],
       [  55,   14,    0,   66,    0,   41,    0]], dtype=int64)

training loss:0.7827780297626349	 validating loss:1.0521699035707388
training accuracy:0.7279670416732689	 validating accuracy:0.6580048168335657
Confusion matrix:


array([[1742,   53,    1,  107,    0,  394,    0],
       [ 245,  168,    0,   26,    0,  101,    0],
       [  92,   28,    4,  208,    0,  189,    0],
       [ 144,   28,    3,  625,    0,  238,    0],
       [  46,    1,    4,   57,    0,  110,    0],
       [ 243,   28,    1,  107,    0, 2652,    0],
       [  86,   13,    0,   69,    0,   76,    0]], dtype=int64)

training loss:0.7990194625909764	 validating loss:0.9713459713711736
training accuracy:0.7200532386867791	 validating accuracy:0.6807809330628803
Confusion matrix:


array([[1881,   43,    3,  130,    0,  329,    0],
       [ 314,  165,    0,   10,    0,  118,    0],
       [  69,   24,    6,  139,    0,  210,    0],
       [ 186,   36,    2,  547,    0,  180,    0],
       [  25,    7,    0,   58,    0,   41,    0],
       [ 276,   23,    0,   93,    1, 2771,    0],
       [  68,    9,    0,   32,    0,   92,    0]], dtype=int64)

loss:0.9713459713711736	 accuracy:0.6807809330628803


## Bagging

Try decisiontree and svm as base classifiers

### hyperparameters search

In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

bag = BaggingClassifier(random_state=42)

distributions = {
    "base_estimator": [DecisionTreeClassifier(),SVC()],
    "n_estimators": np.arange(10,50,10),
    "max_samples": np.arange(0.1,1.0,0.1),
    "max_features": np.arange(5,15)
}

lr_bag = RandomizedSearchCV(bag, distributions, random_state=0,  scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_bag = lr_bag.fit(X, y, groups = groups) 

KeyboardInterrupt: 

### performance

In [None]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [None]:
best_param

In [None]:
classifier_performance(best_estimator)

## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [17]:
classifier_performance(gnb)

training loss:17.485942507541232	 validating loss:18.1379970980222
training accuracy:0.12492473459039771	 validating accuracy:0.10229433388262137
Confusion matrix:


array([[ 363,   36,   23,    5, 1570,   18,  384],
       [  75,   19,    5,    6,  505,   11,  117],
       [  31,    3,   18,   28,  300,    9,   41],
       [  12,    3,   14,   80,  764,   21,   88],
       [   0,    0,    0,   10,  111,    6,    1],
       [  80,   23,   48,  107, 2429,  185,  194],
       [  12,    3,    1,    6,   87,    6,   31]], dtype=int64)

training loss:19.850054294354518	 validating loss:20.513092865587502
training accuracy:0.08128664237046426	 validating accuracy:0.0759285080491824
Confusion matrix:


array([[  90,  148,   22,   23, 1243,   27,  610],
       [   8,   79,   17,    6,  290,   16,   97],
       [   3,   11,   14,   24,  279,    5,  112],
       [   4,    8,   19,   59,  758,    5,  192],
       [   2,    0,    0,    5,  118,    0,   23],
       [  22,   79,   41,   89, 2576,  106,  422],
       [   0,    2,    3,   10,   87,    2,  133]], dtype=int64)

training loss:17.904354892957322	 validating loss:19.496134453009095
training accuracy:0.11608302963080336	 validating accuracy:0.0872100392952212
Confusion matrix:


array([[ 326,   91,   71,   28, 1523,  103,  282],
       [  42,   31,   49,    4,  414,   15,   54],
       [   3,    4,   66,   22,  342,   13,   39],
       [   9,    2,   74,   50,  796,   17,   63],
       [   4,    0,    0,    1,   53,   16,    3],
       [  85,   40,  196,   45, 2473,  140,  124],
       [   6,    8,   12,    5,  122,    1,   22]], dtype=int64)

training loss:19.477417773644916	 validating loss:19.603181415183315
training accuracy:0.10591031532245286	 validating accuracy:0.11243503612625175
Confusion matrix:


array([[ 306,  234,   57,   53, 1385,   54,  208],
       [  49,   81,    8,   19,  335,    8,   40],
       [  19,    6,   27,   17,  348,    8,   96],
       [  15,    8,   48,  119,  764,   19,   65],
       [   7,   10,    4,   11,  175,    6,    5],
       [  63,  110,   92,  116, 2258,  159,  233],
       [   5,    5,    7,    5,  201,    1,   20]], dtype=int64)

training loss:18.536038087528713	 validating loss:18.847581331060834
training accuracy:0.0765623019394093	 validating accuracy:0.057682555780933065
Confusion matrix:


array([[ 163,   47,   32,   12, 1558,    8,  566],
       [  16,   19,   28,    1,  386,    1,  156],
       [   7,    4,   39,    8,  253,   10,  127],
       [   2,    1,   49,   46,  586,    3,  264],
       [   1,    0,    2,    0,  100,    0,   28],
       [  23,   16,  128,   61, 2338,   30,  568],
       [   1,    2,   21,    3,  114,    2,   58]], dtype=int64)

loss:18.847581331060834	 accuracy:0.057682555780933065


## Neural network

We don't use cross validation for neural network

In [18]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit

# train-validation split

gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=42)
for train_idx, test_idx in gss.split(X, y, groups):
    X_train, X_validate = X[train_idx,:], X[test_idx,:]
    y_train, y_validate = y[train_idx], y[test_idx]

parameters = {'learning_rate_init':np.arange(1e-4, 1e-2, 1e-3),
              'alpha': np.arange(1e-5,1e-3,1e-4),
              'learning_rate':['constant', 'invscaling', 'adaptive']}

ann = MLPClassifier(hidden_layer_sizes=(20,), activation = 'relu', solver = 'adam', max_iter=500, random_state=42)

lr_ann = RandomizedSearchCV(ann, parameters, random_state=0,  scoring = "neg_log_loss", n_iter = 5)
search_ann = lr_ann.fit(X_train, y_train)

In [20]:
best_estimator = search_ann.best_estimator_
best_score = search_ann.best_score_
best_param = search_ann.best_params_

In [25]:
best_param

{'learning_rate_init': 0.0081,
 'learning_rate': 'adaptive',
 'alpha': 0.0006100000000000001}

In [26]:
proba_train = best_estimator.predict_proba(X_train)
proba_val = best_estimator.predict_proba(X_validate)

loss_train = log_loss(y_train, proba_train)
loss_val = log_loss(y_validate, proba_val)

acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))

print(f"training loss:{loss_train}\t validating loss:{loss_val}")

print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

# Confusion matrix
print('Confusion matrix:')
display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))

training loss:0.8361108601167359	 validating loss:0.8832892817248519
training accuracy:0.7067625605017384	 validating accuracy:0.6946368493963981
Confusion matrix:


array([[1971,  182,   29,  105,    9,  424,    7],
       [ 246,  421,   15,   22,    1,  111,    0],
       [  66,   37,   94,  153,    4,  247,    9],
       [ 197,   54,   33,  771,   19,  243,    8],
       [  14,    9,    1,   33,   46,  107,    0],
       [ 263,   47,   34,   95,   17, 3645,   10],
       [  66,   24,    8,   46,    4,   87,   72]], dtype=int64)

### Train the best model with the whole dataset

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

best_estimator.fit(X, y)