In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
df = pd.read_csv('data/data_processed.csv')

## Select feature and encoding

Choice of features are chosen based on correlation and our understanding of the problem

In [3]:
# covert categorial features into one-hot encoding
selected_features = [
     'travel_date_dow',
     'o_purpose_category',
     'd_purpose_category',
#      'num_non_hh_travelers',
     'num_hh_travelers',
     'num_travelers',
#      'o_location_type',
#      'd_location_type',
     'o_congestion',
     'd_congestion',
#      'age',
#      'employment',
#      'student',
     'license',
#      'planning_apps',
     'industry',
#      'gender',
#      'education',
#      'survey_language',
     'num_bicycles',
     'num_vehicles',
     'res_type',
#      'rent_own',
     'income_aggregate',
#      'num_people',
#      'num_adults',
#      'num_kids',
#      'num_workers',
#      'num_students',
     'disability',
#      'trip_distance',
     'trip_distance_category',
]

df_selected = df[selected_features]
df_selected.loc[:,'trip_distance_category'] = df_selected['trip_distance_category'].replace({"short":0, "medium":1, "long":2})

categorial_columns = ['travel_date_dow',
       'o_purpose_category', 'd_purpose_category', 'o_location_type',
       'd_location_type', 'age', 'employment', 'license', 'planning_apps', 'industry', 'gender'
                    , 'survey_language',
       'res_type', 'rent_own',  'disability']

onehot = pd.get_dummies(df_selected, columns=[x for x in categorial_columns if x in selected_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [4]:
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])


# Transfer string
str_to_val = {
    'drive': 0,
    'passenger': 1,
    'bus': 2,
    'subway': 3,
    'bike': 4,
    'walk': 5,
    'other': 6,
}

y = df['mode'].replace(str_to_val).to_numpy()
X = onehot.to_numpy()

## Model selection

In [5]:
# train-validation split
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint

from sklearn.metrics import log_loss
from sklearn.metrics import log_loss,confusion_matrix, accuracy_score
from sklearn.metrics import confusion_matrix

from scipy.stats import uniform
from scipy.stats import randint

groups = df['person_id']
group_kfold = GroupKFold(n_splits=5)

## XGboost

The range of parameters tried is partially taken from [this link](https://kevinvecmanis.io/machine%20learning/hyperparameter%20tuning/dataviz/python/2019/05/11/XGBoost-Tuning-Visual-Guide.html)

### hyperparameters search

In [6]:
import xgboost as xgb

xgbo = xgb.XGBClassifier(n_jobs=-1, random_state=42, objective="multi:softprob", eval_metric="mlogloss", use_label_encoder=False)
distributions = {'n_estimators': np.arange(10,50,10), 
                 'max_depth': np.arange(5,20,1),
                 'learning_rate': np.arange(0.0005,0.3,0.0005)}

lr_xgbo = RandomizedSearchCV(xgbo, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_xgbo = lr_xgbo.fit(X, y, groups = groups)

KeyboardInterrupt: 

### performance

In [16]:
best_estimator = search_xgbo.best_estimator_
best_score = search_xgbo.best_score_
best_param = search_xgbo.best_params_

Define performance evaluation function for later use

In [8]:
def classifier_performance(best_estimator):
    for train_index, validate_index in group_kfold.split(X, y, groups):
        X_train, X_validate = X[train_index], X[validate_index]
        y_train, y_validate = y[train_index], y[validate_index]
        
        # loss and accuracy
        loss = []
        acc = []

        best_estimator.fit(X_train, y_train)
        
        proba_train = best_estimator.predict_proba(X_train)
        proba_val = best_estimator.predict_proba(X_validate)
        
        loss_train = log_loss(y_train, proba_train)
        loss_val = log_loss(y_validate, proba_val)
        loss.append(loss_val)
        
        acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
        acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))
        acc.append(acc_val)
        
        print(f"training loss:{loss_train}\t validating loss:{loss_val}")

        print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")
        
        # Confusion matrix
        print('Confusion matrix:')
        display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))
    
    print(f"loss:{np.mean(loss)}\t accuracy:{np.mean(acc)}")

In [18]:
best_param

{'n_estimators': 40, 'max_depth': 5, 'learning_rate': 0.1775}

In [19]:
classifier_performance(best_estimator)

training loss:0.8005575378485186	 validating loss:0.9092358116035468
training accuracy:0.714847092378387	 validating accuracy:0.6613005450627456
Confusion matrix:


array([[1873,   21,    9,   86,    1,  408,    1],
       [ 488,  111,    2,   13,    0,  123,    1],
       [  52,   20,   32,  141,    1,  184,    0],
       [ 132,   49,   27,  531,    1,  242,    0],
       [  12,   10,    0,   47,    2,   57,    0],
       [ 236,   47,   15,   97,    3, 2667,    1],
       [  18,   19,    1,   42,    0,   65,    1]], dtype=int64)

training loss:0.8029525680026839	 validating loss:0.9357357819966375
training accuracy:0.710695610838219	 validating accuracy:0.6723285587526936
Confusion matrix:


array([[1683,   77,    4,   72,    1,  298,   28],
       [ 235,  163,    6,   12,    0,   97,    0],
       [  65,   38,   39,  118,    0,  188,    0],
       [ 109,   49,   37,  583,    0,  267,    0],
       [  20,    7,    1,   43,    2,   75,    0],
       [ 340,   42,   20,  101,    0, 2831,    1],
       [ 122,   26,    2,   38,    0,   46,    3]], dtype=int64)

training loss:0.7930538069004657	 validating loss:0.9265771156578542
training accuracy:0.7171288226905403	 validating accuracy:0.6779059449866903
Confusion matrix:


array([[1834,   56,    6,   86,    1,  430,   11],
       [ 261,  183,    6,   19,    0,  139,    1],
       [  64,   38,   39,  151,    1,  196,    0],
       [ 125,   47,   31,  559,    0,  249,    0],
       [  12,    0,    0,    6,    1,   57,    1],
       [ 216,   38,    9,  106,    2, 2731,    1],
       [  47,   27,    6,   60,    0,   35,    1]], dtype=int64)

training loss:0.778716409909066	 validating loss:1.0358064133900096
training accuracy:0.7206147995563302	 validating accuracy:0.6463430092533908
Confusion matrix:


array([[1691,   85,    5,   88,    3,  423,    2],
       [ 224,  186,    5,   18,    0,  107,    0],
       [  79,   27,   49,  154,    0,  211,    1],
       [ 130,   32,   43,  557,    3,  271,    2],
       [  34,    0,    2,   35,    0,  147,    0],
       [ 245,   57,   14,   98,    1, 2616,    0],
       [  91,   20,    4,   57,    0,   72,    0]], dtype=int64)

training loss:0.797030219192868	 validating loss:0.9418681305984288
training accuracy:0.7151730257320319	 validating accuracy:0.6724137931034483
Confusion matrix:


array([[1822,   74,    6,  127,    1,  354,    2],
       [ 290,  179,    1,   12,    0,  125,    0],
       [  61,   26,   58,  123,    0,  179,    1],
       [ 160,   39,   45,  507,    7,  192,    1],
       [  24,    7,    0,   53,    5,   42,    0],
       [ 279,   32,   15,   93,   10, 2733,    2],
       [  61,   19,    4,   30,    1,   86,    0]], dtype=int64)

loss:0.9418681305984288	 accuracy:0.6724137931034483


## Random forest

### hyperparameters search

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)

distributions = {"n_estimators": randint(low=10,high=100),
                 "criterion": ["gini", "entropy"],
                "max_depth": randint(low=10,high=20),
                "min_samples_leaf": randint(low=5,high=100)}


lr_rf = RandomizedSearchCV(rf, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 50, cv=group_kfold)

search_rf = lr_rf.fit(X, y, groups = groups)

### performance

In [21]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [22]:
best_param

{'criterion': 'entropy',
 'max_depth': 18,
 'min_samples_leaf': 6,
 'n_estimators': 75}

In [23]:
classifier_performance(best_estimator)

training loss:0.7321596664889587	 validating loss:0.9445053341850436
training accuracy:0.7382982094755189	 validating accuracy:0.6647230320699709
Confusion matrix:


array([[1889,   10,    0,   87,    0,  413,    0],
       [ 500,   87,    0,   20,    0,  131,    0],
       [  68,   16,   12,  153,    0,  181,    0],
       [ 139,   25,   10,  569,    0,  239,    0],
       [  14,    6,    0,   46,    0,   62,    0],
       [ 250,   22,    4,  103,    0, 2687,    0],
       [  30,   10,    1,   39,    0,   66,    0]], dtype=int64)

training loss:0.7281890941282304	 validating loss:0.9557127195165052
training accuracy:0.7389954048486769	 validating accuracy:0.6791735327671441
Confusion matrix:


array([[1755,   43,    3,   74,    0,  285,    3],
       [ 257,  128,    1,   18,    0,  109,    0],
       [  72,   24,   18,  132,    0,  202,    0],
       [ 115,   39,   10,  602,    0,  279,    0],
       [  26,    7,    0,   40,    2,   73,    0],
       [ 331,   30,    4,  116,    0, 2853,    1],
       [ 136,    9,    0,   42,    0,   50,    0]], dtype=int64)

training loss:0.7247329477902607	 validating loss:0.9500746423658423
training accuracy:0.7415623514498495	 validating accuracy:0.6756242869818735
Confusion matrix:


array([[1875,   21,    0,   79,    0,  445,    4],
       [ 290,  123,    1,   27,    0,  168,    0],
       [  74,   21,   10,  175,    0,  209,    0],
       [ 132,   19,   12,  583,    0,  265,    0],
       [  12,    0,    0,    6,    0,   59,    0],
       [ 233,   14,    3,  114,    0, 2739,    0],
       [  55,   12,    1,   63,    0,   45,    0]], dtype=int64)

training loss:0.7134106104666947	 validating loss:1.0584109383469869
training accuracy:0.7466011725558549	 validating accuracy:0.6505260489288883
Confusion matrix:


array([[1736,   59,    1,   93,    0,  408,    0],
       [ 240,  153,    2,   27,    0,  118,    0],
       [  95,   24,    8,  194,    0,  200,    0],
       [ 135,   21,    5,  615,    1,  261,    0],
       [  39,    0,    0,   35,    0,  144,    0],
       [ 262,   30,    2,  117,    0, 2620,    0],
       [  93,   16,    2,   58,    0,   75,    0]], dtype=int64)

training loss:0.7275434453560324	 validating loss:0.9685817928134446
training accuracy:0.739890987450881	 validating accuracy:0.678498985801217
Confusion matrix:


array([[1905,   32,    2,  110,    0,  337,    0],
       [ 307,  144,    0,    8,    0,  148,    0],
       [  66,   15,   11,  153,    0,  203,    0],
       [ 174,   30,   11,  536,    4,  196,    0],
       [  25,    6,    0,   36,    9,   54,    1],
       [ 280,   19,    5,  111,    2, 2747,    0],
       [  70,    6,    0,   34,    0,   91,    0]], dtype=int64)

loss:0.9685817928134446	 accuracy:0.678498985801217


## Bagging

Try decisiontree and svm as base classifiers

### hyperparameters search

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

bag = BaggingClassifier(random_state=42)

distributions = {
    "base_estimator": [DecisionTreeClassifier(),SVC()],
    "n_estimators": np.arange(10,50,10),
    "max_samples": np.arange(0.1,1.0,0.1),
    "max_features": np.arange(5,15)
}

lr_bag = RandomizedSearchCV(bag, distributions, random_state=0,  scoring = "neg_log_loss", n_iter = 10, cv=group_kfold)

search_bag = lr_bag.fit(X, y, groups = groups)

### performance

In [None]:
best_estimator = search_rf.best_estimator_
best_score = search_rf.best_score_
best_param = search_rf.best_params_

In [None]:
best_param

In [None]:
classifier_performance(best_estimator)

## Naive bayes

Naive bayes has no parameters to tune, so we just try it with cross validation

### Cross validation

In [9]:
classifier_performance(gnb)

training loss:19.887780103044047	 validating loss:20.636529365739793
training accuracy:0.10850895262240533	 validating accuracy:0.0902522499683103
Confusion matrix:


array([[ 301,   31,   25,    5, 1679,   13,  345],
       [  50,   17,    8,    5,  541,    9,  108],
       [  26,    2,   15,   24,  315,    6,   42],
       [   7,    1,   10,   64,  812,   20,   68],
       [   0,    0,    0,   11,  113,    4,    0],
       [  61,   19,   36,   78, 2498,  170,  204],
       [   8,    2,    3,    2,   92,    7,   32]], dtype=int64)

training loss:22.544061979544832	 validating loss:23.05684066911958
training accuracy:0.06220884170495959	 validating accuracy:0.059703384459373815
Confusion matrix:


array([[  52,  129,   19,   16, 1332,   22,  593],
       [   4,   65,   17,    6,  311,   11,   99],
       [   3,   10,   11,   15,  311,    3,   95],
       [   1,    5,   14,   34,  813,    4,  174],
       [   2,    0,    0,    5,  118,    0,   23],
       [   7,   67,   27,   62, 2674,   77,  421],
       [   0,    2,    3,    9,  107,    2,  114]], dtype=int64)

training loss:20.358133907281662	 validating loss:21.947092445838145
training accuracy:0.0994454127713516	 validating accuracy:0.07567499049309165
Confusion matrix:


array([[ 278,   82,   64,   20, 1602,  113,  265],
       [  32,   28,   44,    4,  431,   11,   59],
       [   3,    4,   62,   19,  355,   12,   34],
       [   8,    2,   64,   33,  829,   21,   54],
       [   2,    0,    1,    1,   54,   15,    4],
       [  70,   35,  193,   29, 2516,  128,  132],
       [   3,    9,   13,    2,  133,    2,   14]], dtype=int64)

training loss:21.968633654291697	 validating loss:21.841619871363694
training accuracy:0.09161781017271431	 validating accuracy:0.09937888198757763
Confusion matrix:


array([[ 231,  231,   57,   49, 1460,   47,  222],
       [  42,   83,    7,   17,  349,    6,   36],
       [  15,    5,   28,   14,  369,    4,   86],
       [  10,    7,   43,  111,  796,   11,   60],
       [   7,    6,    3,   12,  181,    7,    2],
       [  36,   97,   91,   87, 2351,  133,  236],
       [   4,    5,    6,    6,  205,    1,   17]], dtype=int64)

training loss:21.221948321337052	 validating loss:21.623841082484738
training accuracy:0.06467866649765497	 validating accuracy:0.047160243407707914
Confusion matrix:


array([[ 113,   31,   24,    9, 1677,    5,  527],
       [  11,   12,   23,    1,  409,    2,  149],
       [   6,    2,   32,    5,  274,   12,  117],
       [   1,    1,   34,   38,  631,    3,  243],
       [   1,    0,    2,    0,  104,    0,   24],
       [  13,   12,  108,   27, 2434,   17,  553],
       [   1,    0,   20,    1,  121,    2,   56]], dtype=int64)

loss:21.623841082484738	 accuracy:0.047160243407707914


## Neural network

We don't use cross validation for neural network

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import GroupShuffleSplit

# train-validation split

gss = GroupShuffleSplit(n_splits=2, train_size=.75, random_state=42)
for train_idx, test_idx in gss.split(X, y, groups):
    X_train, X_validate = X[train_idx,:], X[test_idx,:]
    y_train, y_validate = y[train_idx], y[test_idx]

parameters = {'learning_rate_init':np.arange(1e-4, 1e-2, 1e-3),
              'alpha': np.arange(1e-5,1e-3,1e-4),
              'learning_rate':['constant', 'invscaling', 'adaptive']}

ann = MLPClassifier(hidden_layer_sizes=(20,), activation = 'relu', solver = 'adam', max_iter=500, random_state=42)

lr_ann = RandomizedSearchCV(ann, parameters, random_state=0,  scoring = "neg_log_loss", n_iter = 5)
search_ann = lr_ann.fit(X_train, y_train)

In [11]:
best_estimator = search_ann.best_estimator_
best_score = search_ann.best_score_
best_param = search_ann.best_params_

In [12]:
best_param

{'learning_rate_init': 0.0011, 'learning_rate': 'invscaling', 'alpha': 0.00071}

In [13]:
proba_train = best_estimator.predict_proba(X_train)
proba_val = best_estimator.predict_proba(X_validate)

loss_train = log_loss(y_train, proba_train)
loss_val = log_loss(y_validate, proba_val)

acc_train = accuracy_score(y_train, best_estimator.predict(X_train))
acc_val = accuracy_score(y_validate, best_estimator.predict(X_validate))

print(f"training loss:{loss_train}\t validating loss:{loss_val}")

print(f"training accuracy:{acc_train}\t validating accuracy:{acc_val}")

# Confusion matrix
print('Confusion matrix:')
display(confusion_matrix(y_validate, best_estimator.predict(X_validate)))

training loss:0.7912387898137064	 validating loss:1.087770511732495
training accuracy:0.7169541209353057	 validating accuracy:0.6302196714822877
Confusion matrix:


array([[1952,  190,   14,  119,    5,  439,    8],
       [ 360,  232,   17,   27,    4,  149,   27],
       [  79,   42,   64,  180,    1,  239,    5],
       [ 211,   38,   41,  697,   14,  315,    9],
       [  18,    3,    4,   46,   21,  118,    0],
       [ 406,   57,   86,  149,    9, 3395,    9],
       [ 116,   14,    6,   64,    2,   97,    8]], dtype=int64)

### Train the best model with the whole dataset

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import matplotlib.pyplot as plt

best_estimator.fit(X, y)