In [1]:
from functools import partial
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE
import optuna

import warnings

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


## Get Data

In [2]:
jams = pd.read_csv('./../dataset/jams_bogor.csv')
jams.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102322 entries, 0 to 102321
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level             102322 non-null  int64  
 1   median_length     102322 non-null  float64
 2   median_delay      102322 non-null  float64
 3   median_speed_kmh  102322 non-null  float64
 4   total_records     102322 non-null  int64  
 5   isWeekend         102322 non-null  int64  
 6   longitude         102322 non-null  float64
 7   latitude          102322 non-null  float64
 8   isRushHour        102322 non-null  int64  
 9   hour_sin          102322 non-null  float64
 10  hour_cos          102322 non-null  float64
 11  day_sin           102322 non-null  float64
 12  day_cos           102322 non-null  float64
 13  week_sin          102322 non-null  float64
 14  week_cos          102322 non-null  float64
 15  month_sin         102322 non-null  float64
 16  month_cos         10

In [3]:
# Sorting Columns
col_feature = list(set(jams.columns) - set(['level']))
col_feature.sort() 

features = jams[col_feature].copy()
labels = jams['level'].copy()

In [4]:
X_train = features.loc[jams.is_train==1]
y_train = labels.loc[jams.is_train==1]
X_test = features.loc[jams.is_train==0]
y_test = labels.loc[jams.is_train==0]

y_test.value_counts()  / y_test.shape[0] * 100

1    54.132326
2    30.335212
3    15.532462
Name: level, dtype: float64

## Hyperparameter Tuning with Optuna

In [5]:

def objective(trial, X, y):

    # parameter space
    sm_neighbors = trial.suggest_int('sm_neighbors', 3, 9)
    c = trial.suggest_float('C', 0.0, 5.0)
    penalty = trial.suggest_categorical('penalty', [None, 'l2'])
    
    
    sm = SMOTE(k_neighbors = sm_neighbors, random_state = 123)
    X_sm, y_sm = sm.fit_resample(X, y)
    model = LogisticRegression(C = c, penalty = penalty)
    
    
    f1_scores = []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
    for train_idx, val_idx in skf.split(X, y):
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        X_sm, y_sm = sm.fit_resample(X_train, y_train)
        model.fit(X_sm, y_sm)

        y_pred = model.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='macro')
        f1_scores.append(f1)


    return np.mean(f1_scores)

In [6]:
optimize_func = partial(objective, X=X_train, y=y_train)

study = optuna.create_study(direction='maximize')
study.optimize(optimize_func, n_trials=100)

[32m[I 2023-03-01 18:53:29,928][0m A new study created in memory with name: no-name-692f3ca7-435d-415e-a632-89815b560133[0m
[32m[I 2023-03-01 18:53:44,946][0m Trial 0 finished with value: 0.7862172307162365 and parameters: {'sm_ratio': 0.5550215255398996, 'sm_neighbors': 5, 'C': 1.8199131863016738, 'penalty': None}. Best is trial 0 with value: 0.7862172307162365.[0m
[32m[I 2023-03-01 18:54:01,493][0m Trial 1 finished with value: 0.7862272662035752 and parameters: {'sm_ratio': 0.6784196827802418, 'sm_neighbors': 6, 'C': 3.1440006543627526, 'penalty': None}. Best is trial 1 with value: 0.7862272662035752.[0m
[32m[I 2023-03-01 18:54:15,198][0m Trial 2 finished with value: 0.786030036960118 and parameters: {'sm_ratio': 0.5499466216820574, 'sm_neighbors': 8, 'C': 4.945569955667482, 'penalty': None}. Best is trial 1 with value: 0.7862272662035752.[0m
[32m[I 2023-03-01 18:54:32,270][0m Trial 3 finished with value: 0.7866921079092093 and parameters: {'sm_ratio': 0.953301443824764

In [7]:
trial = study.best_trial
best_param = trial.params

print('F1-Macro: {}'.format(trial.value))
print("Best hyperparameters: {}".format(best_param))

F1-Macro: 0.7869496858112687
Best hyperparameters: {'sm_ratio': 0.9327821000982957, 'sm_neighbors': 3, 'C': 4.805794046023033, 'penalty': 'l2'}


## Training Model & Evaluation

In [8]:
def calculate_metrics_score(y_true, y_pred):
    print('Confusion Matrix:\n', confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred))

In [9]:
sm = SMOTE(k_neighbors=best_param['sm_neighbors'], random_state = 123)
model = LogisticRegression(C=best_param['C'], penalty=best_param['penalty'])

X_sm, y_sm = sm.fit_resample(X_train, y_train)
model.fit(X_sm, y_sm)

calculate_metrics_score(y_sm, model.predict(X_sm))

Confusion Matrix:
 [[33250  4929   595]
 [ 5209 26979  6586]
 [   16  4976 33782]]
              precision    recall  f1-score   support

           1       0.86      0.86      0.86     38774
           2       0.73      0.70      0.71     38774
           3       0.82      0.87      0.85     38774

    accuracy                           0.81    116322
   macro avg       0.81      0.81      0.81    116322
weighted avg       0.81      0.81      0.81    116322



In [10]:
y_pred = model.predict(X_test)
calculate_metrics_score(y_test, y_pred)

Confusion Matrix:
 [[14229  2137   251]
 [ 1245  6499  1568]
 [    3   577  4188]]
              precision    recall  f1-score   support

           1       0.92      0.86      0.89     16617
           2       0.71      0.70      0.70      9312
           3       0.70      0.88      0.78      4768

    accuracy                           0.81     30697
   macro avg       0.77      0.81      0.79     30697
weighted avg       0.82      0.81      0.81     30697



In [11]:
model_file = '../model/model_1.pkl'

# joblib.dump(model, model_file)
model_load = joblib.load(model_file)