In [74]:
!pip install scikit-optimize

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [75]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from skopt import BayesSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
import joblib

In [76]:
def _drive_download_url(share_url):
    """Turn a Google Drive sharing link into a direct-download URL.

    The file id is taken as the second-to-last '/'-separated segment of the
    sharing link (``.../file/d/<id>/view?...``).
    """
    file_id = share_url.split('/')[-2]
    return 'https://drive.google.com/uc?export=download&id=' + file_id


# Training set, MinMax-scaled variant (per the variable name).
URL_data_smt_minmax_be = 'https://drive.google.com/file/d/1meYhVZS95gm0Y7e8kUMEV3nPUymnIvWU/view?usp=share'
path_data_smt_minmax_be = _drive_download_url(URL_data_smt_minmax_be)
data_smt_minmax_be = pd.read_csv(path_data_smt_minmax_be)

# Training set, standard-scaled variant (per the variable name).
URL_data_smt_standard_be = 'https://drive.google.com/file/d/12FdtMnw73CpuUvzhmFYrPd5x6QzCId6z/view?usp=share'
path_data_smt_standard_be = _drive_download_url(URL_data_smt_standard_be)
data_smt_standard_be = pd.read_csv(path_data_smt_standard_be)

# Hold-out test set, MinMax-scaled variant.
URL_data_test_minmax_be = 'https://drive.google.com/file/d/1xzEL_joTrtUhRnUfu9x_5j7bgZJNlxqD/view?usp=share'
path_data_test_minmax_be = _drive_download_url(URL_data_test_minmax_be)
data_test_minmax_be = pd.read_csv(path_data_test_minmax_be)

# Hold-out test set, standard-scaled variant.
URL_data_test_standard_be = 'https://drive.google.com/file/d/19rJwtDQincWlBWw4pTS6r8Cvf11aKDfW/view?usp=share'
path_data_test_standard_be = _drive_download_url(URL_data_test_standard_be)
data_test_standard_be = pd.read_csv(path_data_test_standard_be)

In [77]:
# Split each test frame into the 'level' target and the remaining feature columns.
y_test_minmax_be = data_test_minmax_be['level']
x_test_minmax_be = data_test_minmax_be.drop(columns='level')

y_test_standard_be = data_test_standard_be['level']
x_test_standard_be = data_test_standard_be.drop(columns='level')

In [109]:
def _report_metrics(title, y_true, y_pred):
    """Print macro-averaged F1, recall, precision and plain accuracy under ``title``."""
    print(title)
    print("Macro-Averaged F1 Score: ", f1_score(y_true, y_pred, average='macro'))
    print("Macro-Averaged Recall: ", recall_score(y_true, y_pred, average='macro'))
    print("Macro-Averaged Precision: ", precision_score(y_true, y_pred, average='macro'))
    print("Accuracy: ", accuracy_score(y_true, y_pred))


def DecisionTree(x_train, y_train, x_test, y_test, model_path='model_dt2.joblib'):
    """Tune, fit, evaluate and persist a decision-tree classifier.

    Runs a Bayesian hyperparameter search (100 iterations, 5-fold CV,
    ``random_state=42``) over ``max_depth``, ``min_samples_split``,
    ``min_samples_leaf`` and ``criterion``, then reports macro-averaged
    F1 / recall / precision and accuracy on both the training and test data.

    Parameters
    ----------
    x_train, y_train : training features and labels passed to the search.
    x_test, y_test : hold-out features and labels used only for reporting.
    model_path : file the fitted model is written to with ``joblib.dump``.
        Defaults to ``'model_dt2.joblib'``; every call using the default
        overwrites the same file.

    Returns
    -------
    None. Results are printed and the model is written to ``model_path``.
    """
    # Hyperparameter search space: tuples are (low, high) bounds, the list is categorical.
    search_space = {
        'max_depth': (10, 100),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 4),
        'criterion': ['gini', 'entropy'],
    }

    dt_search = BayesSearchCV(
        DecisionTreeClassifier(), search_space, n_iter=100, cv=5, n_jobs=-1, random_state=42
    )
    dt_search.fit(x_train, y_train)

    best_params = dt_search.best_params_
    print("Best parameters: ", best_params)

    # The search refits the best configuration on the full training data by
    # default, so reuse that estimator instead of constructing and fitting a
    # second tree with the same parameters.
    dt = dt_search.best_estimator_

    _report_metrics("\nEvaluation data training: ", y_train, dt.predict(x_train))
    _report_metrics("\nEvaluation data testing: ", y_test, dt.predict(x_test))

    joblib.dump(dt, model_path)

## Data dengan kombinasi SMOTE-TOMEK

In [110]:
# Feature matrix for the MinMax-scaled training data: every column except the 'level' target.
x_train_smt_minmax_be = data_smt_minmax_be.drop(columns='level')
# Show a random handful of rows as a sanity check.
x_train_smt_minmax_be.sample(6)

Unnamed: 0,precip,Libur,ROAD_CLOSED,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,...,street_1,street_2,street_3,street_4,street_5,street_6,street_7,street_8,street_9,street_10
148197,0.008185,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.507042,0.0,0.0,1.0,0.0,0.0,1.0,1.0
12348,0.050318,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
64927,0.000428,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
111327,0.035981,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
20742,0.060046,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0
65162,0.070001,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.571919,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.571919


In [111]:
# Target labels ('level' column) matching x_train_smt_minmax_be.
y_train_smt_minmax_be = data_smt_minmax_be['level']

In [112]:
# Tune, fit and evaluate on the MinMax-scaled data; the fitted model is written to 'model_dt2.joblib'.
DecisionTree(x_train_smt_minmax_be, y_train_smt_minmax_be, x_test_minmax_be, y_test_minmax_be)



Best parameters:  OrderedDict([('criterion', 'entropy'), ('max_depth', 96), ('min_samples_leaf', 1), ('min_samples_split', 2)])

Evaluation data training: 
Macro-Averaged Recall:  0.9986721154848656
Macro-Averaged Precision:  0.9986626921983927
Macro-Averaged Precision:  0.998684575675493
Accuracy:  0.998697787724545

Evaluation data training: 
Macro-Averaged F1 Score:  0.5754025541561091
Macro-Averaged Recall:  0.5839899150845491
Macro-Averaged Precision:  0.5679004147057906
Accuracy:  0.7525862461832931


In [113]:
# Reload the model written by the preceding DecisionTree() call; that function
# dumps to 'model_dt2.joblib' on every call, so the file holds only the latest run.
model_smt_minmax = joblib.load('model_dt2.joblib')

In [114]:
# Save a copy under a run-specific name so a later DecisionTree() call cannot overwrite it.
joblib.dump(model_smt_minmax, 'model_smt_minmax.joblib')

['model_smt_minmax.joblib']

## Data dengan kombinasi SMOTE-ENN

In [115]:
# Feature matrix for the standard-scaled training data: every column except the 'level' target.
x_train_smt_standard_be = data_smt_standard_be.drop(columns='level')
# Show a random handful of rows as a sanity check.
x_train_smt_standard_be.sample(6)

Unnamed: 0,precip,Libur,ROAD_CLOSED,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,...,street_1,street_2,street_3,street_4,street_5,street_6,street_7,street_8,street_9,street_10
176508,0.673418,-0.223265,14.95197,-0.40902,-0.405794,-0.40954,-0.409085,-0.407522,2.447789,-0.408239,...,-0.890382,-0.886265,-0.90457,-0.992528,1.004424,-0.99311,-0.995868,-0.998166,0.996724,-1.003757
14336,-0.275802,-0.223265,-0.066881,-0.40902,-0.405794,-0.40954,-0.409085,2.453853,-0.408532,-0.408239,...,-0.890382,-0.886265,-0.90457,1.007528,-0.995596,-0.99311,1.004149,-0.998166,-1.003287,0.996257
128814,-0.275299,-0.223265,-0.066881,-0.40902,-0.405794,-0.40954,-0.409085,2.453853,-0.408532,-0.408239,...,-0.890382,1.128331,-0.90457,-0.992528,-0.995596,1.006938,-0.995868,-0.998166,0.996724,0.996257
123461,-0.256815,-0.223265,-0.066881,-0.40902,-0.405794,-0.40954,-0.409085,-0.407522,-0.408532,2.449546,...,-0.890382,-0.886265,-0.90457,-0.992528,-0.995596,-0.99311,1.004149,-0.399748,0.996724,-1.003757
120421,-0.197284,-0.223265,-0.066881,-0.40902,-0.405794,-0.40954,2.444479,-0.407522,-0.408532,-0.408239,...,-0.890382,-0.886265,1.105498,1.007528,1.004424,-0.99311,1.004149,1.001838,-1.003287,-1.003757
136821,0.062311,-0.223265,-0.066881,-0.40902,-0.405794,-0.40954,2.444479,-0.407522,-0.408532,-0.408239,...,-0.890382,1.128331,-0.90457,-0.992528,1.004424,-0.244965,-0.995868,-0.998166,0.996724,0.996257


In [116]:
# Target labels ('level' column) matching x_train_smt_standard_be.
y_train_smt_standard_be = data_smt_standard_be['level']

In [117]:
# Tune, fit and evaluate on the standard-scaled data; this overwrites 'model_dt2.joblib'.
DecisionTree(x_train_smt_standard_be, y_train_smt_standard_be, x_test_standard_be, y_test_standard_be)



Best parameters:  OrderedDict([('criterion', 'entropy'), ('max_depth', 36), ('min_samples_leaf', 1), ('min_samples_split', 2)])

Evaluation data training: 
Macro-Averaged Recall:  0.9986568085470843
Macro-Averaged Precision:  0.9986502381321744
Macro-Averaged Precision:  0.998666381848881
Accuracy:  0.998680089044739

Evaluation data training: 
Macro-Averaged F1 Score:  0.5776669930553794
Macro-Averaged Recall:  0.5818536619232264
Macro-Averaged Precision:  0.5740320031627757
Accuracy:  0.7562320557808868


In [118]:
# Reload the model written by the preceding DecisionTree() call; that function
# dumps to 'model_dt2.joblib' on every call, so the file holds only the latest run.
model_smt_standard = joblib.load('model_dt2.joblib')

In [119]:
# Save a copy under a run-specific name so a later DecisionTree() call cannot overwrite it.
joblib.dump(model_smt_standard, 'model_smt_standard.joblib')

['model_smt_standard.joblib']