In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, normalize

In [2]:
# Read the processed dataset from disk; the feature, label and group cells below all read from `df`.
DATA_PATH = 'data/data_processed.csv'
df = pd.read_csv(DATA_PATH)

## Select features and encode them

In [3]:
# Convert the categorical features into one-hot encoded columns.
#
# `selected_features` lists the columns fed to the model. Names that are
# commented out are not part of the current selection.
selected_features = [
    'travel_date_dow',
    'o_purpose_category',
    'd_purpose_category',
    # 'num_non_hh_travelers',
    'num_hh_travelers',
    'num_travelers',
    # 'o_location_type',
    # 'd_location_type',
    'o_congestion',
    'd_congestion',
    # 'age',
    # 'employment',
    # 'student',
    'license',
    # 'planning_apps',
    'industry',
    # 'gender',
    # 'education',
    # 'survey_language',
    'num_bicycles',
    'num_vehicles',
    'res_type',
    # 'rent_own',
    'income_aggregate',
    # 'num_people',
    # 'num_adults',
    # 'num_kids',
    # 'num_workers',
    # 'num_students',
    'disability',
    'trip_distance',
]

# Columns to be treated as categorical. The list may name columns that are
# not currently selected; those are filtered out below.
categorial_columns = [
    'travel_date_dow',
    'o_purpose_category',
    'd_purpose_category',
    'o_location_type',
    'd_location_type',
    'age',
    'employment',
    'license',
    'planning_apps',
    'industry',
    'gender',
    'survey_language',
    'res_type',
    'rent_own',
    'disability',
]

# Only categorical columns that were actually selected can be encoded.
columns_to_encode = [
    name for name in categorial_columns if name in selected_features
]

df_selected = df[selected_features]
onehot = pd.get_dummies(df_selected, columns=columns_to_encode)

In [16]:
# Travel modes in label order: the position of a name in `classes` is the
# integer label used as the classification target.
classes = np.array(['drive', 'passenger', 'bus', 'subway', 'bike', 'walk', 'other'])

# Mode string -> integer label. Derived from `classes` so the two can never
# drift apart (same mapping as before: 'drive' -> 0 ... 'other' -> 6).
str_to_val = {name: code for code, name in enumerate(classes.tolist())}

# `.map` turns every value that is not a key of `str_to_val` into NaN, which
# lets us fail loudly here. `.replace` would silently leave unknown strings
# (or missing values) in the target and hand an object array to the model.
mode_codes = df['mode'].map(str_to_val)
unmapped = mode_codes.isna()
if unmapped.any():
    unknown = sorted(df.loc[unmapped, 'mode'].astype(str).unique())
    raise ValueError(f"unmapped values in 'mode' column: {unknown}")

y = mode_codes.astype('int64').to_numpy()

# Request a float matrix explicitly: recent pandas returns boolean dummy
# columns, and mixing bool with numeric columns would otherwise yield an
# object-dtype array.
X = onehot.to_numpy(dtype=float)

## Tune parameters

### Cross-validation

In [19]:
from sklearn.model_selection import GroupKFold

# Five-fold splitter that keeps all rows of one group (here: one person_id)
# in the same fold.
N_SPLITS = 5
group_kfold = GroupKFold(n_splits=N_SPLITS)

# One group label per row of `df`.
groups = df['person_id']

# Show the grouping column as the cell output.
groups


0           0
1           0
2           0
3           0
4           0
         ... 
39439    1794
39440    1794
39441    1794
39442    1794
39443    1794
Name: person_id, Length: 39444, dtype: int64

## RandomForestClassifier:

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest (and therefore the search scores) are reproducible.
RF = RandomForestClassifier(random_state=0)

# Search space for the hyperparameter search.
# `max_depth` covers the same values as randint(low=10, high=20), i.e. 10..19,
# but is given as a plain list: when every entry is a list, RandomizedSearchCV
# samples WITHOUT replacement, so the 10 iterations below try each depth
# exactly once instead of possibly repeating some and skipping others.
param_distribution = {
    #"n_estimators": randint(low=1,high=10),
    #"criterion": ["gini", "entropy"],
    "max_depth": list(range(10, 20)),
    #"min_samples_split": randint(low=1,high=10),
    #"min_samples_leaf": randint(low=1,high=10)
}

# Scored by negative log loss; folds come from `group_kfold` so that the
# groups passed to `fit` are kept together.
paramSearch = RandomizedSearchCV(
    RF,
    param_distributions=param_distribution,
    scoring="neg_log_loss",
    n_iter=10,
    cv=group_kfold,
    random_state=0,
)

# Earlier logistic-regression experiment, kept for reference:
#logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,random_state=0)
#distributions = dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'])
#lr = RandomizedSearchCV(logistic, distributions, random_state=0, scoring = "neg_log_loss", n_iter = 10)

search = paramSearch.fit(X, y, groups=groups)

### Performance

In [64]:
# Pull the winning model, its cross-validated score (in the search's scoring
# metric) and its hyperparameters out of the fitted search object.
best_estimator, best_score, best_param = (
    search.best_estimator_,
    search.best_score_,
    search.best_params_,
)

In [65]:
from sklearn.base import clone
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score

# Every label present in the data, so each fold prints a confusion matrix
# with the same rows/columns even if a class is absent from that fold.
all_labels = np.unique(y)

# Re-run the grouped cross-validation with the best configuration and report
# per-fold accuracy, log loss and confusion matrix.
for train_index, validate_index in group_kfold.split(X, y, groups):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]

    # Fit an unfitted copy per fold so the model stored on the search object
    # (`best_estimator`) is not overwritten by a model trained on one fold.
    fold_model = clone(best_estimator)
    fold_model.fit(X_train, y_train)

    proba = fold_model.predict_proba(X_validate)
    # Predict once and reuse for both the accuracy and the confusion matrix.
    pred_validate = fold_model.predict(X_validate)

    # Columns of `proba` follow `fold_model.classes_`; passing them as labels
    # keeps log_loss well-defined when the validation fold lacks a class.
    loss = log_loss(y_validate, proba, labels=fold_model.classes_)
    acc_validate = accuracy_score(y_validate, pred_validate)
    acc_train = accuracy_score(y_train, fold_model.predict(X_train))

    #Accuracy score:
    print(f"accuracy on validate set:{acc_validate}")
    print(f"accuracy on train set:{acc_train}")

    #Entropy loss score:
    print(f"cross entropy loss:{loss}")

    # Confusion matrix
    print('Confusion matrix:')
    display(confusion_matrix(y_validate, pred_validate, labels=all_labels))

    print(best_param)

accuracy on validate set:0.6763848396501457
accuracy on train set:0.8619236254159405
cross entropy loss:0.9172347639555577
Confusion matrix:


array([[1908,   12,    3,  113,    0,  363,    0],
       [ 510,  103,    0,   22,    0,  103,    0],
       [  69,   19,   22,  159,    0,  161,    0],
       [ 134,   33,   21,  602,    0,  191,    1],
       [  13,    7,    0,   63,    0,   45,    0],
       [ 243,   16,   11,   94,    1, 2701,    0],
       [  30,   11,    6,   44,    0,   55,    0]])

{'max_depth': 17}
accuracy on validate set:0.6959056914691343
accuracy on train set:0.8631595626683568
cross entropy loss:0.9354572770821021
Confusion matrix:


array([[1761,   47,    5,   85,    0,  259,    6],
       [ 262,  149,    2,   15,    0,   85,    0],
       [  75,   28,   19,  138,    0,  188,    0],
       [ 112,   51,   23,  658,    0,  201,    0],
       [  21,    7,    0,   48,    9,   63,    0],
       [ 309,   29,    9,   92,    1, 2894,    1],
       [ 136,   13,    1,   39,    0,   48,    0]])

{'max_depth': 17}
accuracy on validate set:0.691469134237546
accuracy on train set:0.8651560766914911
cross entropy loss:0.9321803402627333
Confusion matrix:


array([[1878,   45,    6,  111,    1,  379,    4],
       [ 290,  155,    2,   26,    0,  136,    0],
       [  74,   23,   26,  183,    0,  183,    0],
       [ 136,   33,   13,  628,    2,  199,    0],
       [  14,    1,    0,   11,    1,   50,    0],
       [ 218,   12,    9,   97,    0, 2767,    0],
       [  52,   15,    1,   64,    0,   44,    0]])

{'max_depth': 17}
accuracy on validate set:0.661427303840791
accuracy on train set:0.8707970210743147
cross entropy loss:1.046976609803078
Confusion matrix:


array([[1724,   55,   10,  114,    0,  394,    0],
       [ 242,  173,    3,   23,    1,   98,    0],
       [  89,   31,   22,  203,    0,  176,    0],
       [ 141,   29,   17,  632,    1,  218,    0],
       [  48,    0,    7,   50,    0,  113,    0],
       [ 223,   30,    7,  104,    0, 2667,    0],
       [  94,   15,    1,   60,    0,   74,    0]])

{'max_depth': 17}
accuracy on validate set:0.6847109533468559
accuracy on train set:0.865952592217011
cross entropy loss:0.9508344421820869
Confusion matrix:


array([[1889,   43,    3,  125,    1,  324,    1],
       [ 307,  165,    0,   15,    0,  120,    0],
       [  65,   22,   20,  140,    0,  201,    0],
       [ 166,   35,   14,  552,    4,  180,    0],
       [  26,    6,    0,   42,   15,   42,    0],
       [ 279,   21,    9,   96,    6, 2753,    0],
       [  64,   10,    3,   35,    0,   82,    7]])

{'max_depth': 17}


In [40]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Provide a
    # drop-in replacement on top of ConfusionMatrixDisplay so this cell (and
    # any later call to plot_confusion_matrix) keeps working on newer versions.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of `estimator` on (X, y_true).

        Keyword arguments are forwarded unchanged to
        ConfusionMatrixDisplay.from_estimator; the resulting display object
        is returned.
        """
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Refit the selected model on the full data set; as the last expression of
# the cell, the fitted estimator is also shown as the cell output.
best_estimator.fit(X, y)

RandomForestClassifier(criterion='entropy', max_depth=8, n_estimators=7)

## BaggingClassifier:

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier


# Fixed seed so the bootstrap samples (and therefore the scores) are reproducible.
BAG = BaggingClassifier(random_state=0)

# scikit-learn 1.2 renamed BaggingClassifier's `base_estimator` parameter to
# `estimator` (the old name was later removed). Pick whichever name the
# installed version exposes so the search works on both.
base_param = "estimator" if "estimator" in BAG.get_params(deep=False) else "base_estimator"

#dictionary of parameters to perform hyperparameters search:
# NOTE(review): SVC() is built without probability=True, so it has no
# predict_proba; confirm that scoring it with neg_log_loss is intended.
param_distribution = {
    base_param: [DecisionTreeClassifier(), SVC()],
    #"n_estimators": [5,10],
    #"max_samples": [1,5],
    #"max_features": [5,15],
    #"warm_start": [True, False]
}

# n_iter is an upper bound: with only list-valued parameters the search is
# capped at the number of distinct candidates.
paramSearch = RandomizedSearchCV(
    BAG,
    param_distributions=param_distribution,
    scoring="neg_log_loss",
    n_iter=10,
    cv=group_kfold,
    random_state=0,
)

search = paramSearch.fit(X, y, groups=groups)



### Performance:


In [None]:
# Pull the winning model, its cross-validated score (in the search's scoring
# metric) and its hyperparameters out of the fitted search object.
best_estimator, best_score, best_param = (
    search.best_estimator_,
    search.best_score_,
    search.best_params_,
)

In [None]:
from sklearn.base import clone
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score

# Every label present in the data, so each fold prints a confusion matrix
# with the same rows/columns even if a class is absent from that fold.
all_labels = np.unique(y)

# Re-run the grouped cross-validation with the best configuration and report
# per-fold accuracy, log loss and confusion matrix.
for train_index, validate_index in group_kfold.split(X, y, groups):
    X_train, X_validate = X[train_index], X[validate_index]
    y_train, y_validate = y[train_index], y[validate_index]

    # Fit an unfitted copy per fold so the model stored on the search object
    # (`best_estimator`) is not overwritten by a model trained on one fold.
    fold_model = clone(best_estimator)
    fold_model.fit(X_train, y_train)

    proba = fold_model.predict_proba(X_validate)
    # Predict once and reuse for both the accuracy and the confusion matrix.
    pred_validate = fold_model.predict(X_validate)

    # Columns of `proba` follow `fold_model.classes_`; passing them as labels
    # keeps log_loss well-defined when the validation fold lacks a class.
    loss = log_loss(y_validate, proba, labels=fold_model.classes_)
    acc_validate = accuracy_score(y_validate, pred_validate)
    acc_train = accuracy_score(y_train, fold_model.predict(X_train))

    #Accuracy score:
    print(f"accuracy on validate set:{acc_validate}")
    print(f"accuracy on train set:{acc_train}")

    #Entropy loss score:
    print(f"cross entropy loss:{loss}")

    # Confusion matrix
    print('Confusion matrix:')
    display(confusion_matrix(y_validate, pred_validate, labels=all_labels))

    print(best_param)