# Thailand Crash 1%

# Library

In [1]:
# Import standard data processing libraries
import pandas as pd
import numpy as np
import random
import os

# Disable GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# Set log level
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

import tempfile
import warnings
from sklearn.exceptions import ConvergenceWarning
import pickle
import re

# Filter warnings
warnings.simplefilter('ignore', ConvergenceWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='xgboost')

# Import visualization library
import matplotlib.pyplot as plt

# Import Machine Learning and Data Preprocessing libraries
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import (precision_recall_curve, average_precision_score, 
                             confusion_matrix, accuracy_score, precision_score, 
                             recall_score, f1_score, balanced_accuracy_score)
from imblearn.combine import SMOTEENN
from sklearn.utils.class_weight import compute_class_weight

# Import Deep Learning libraries - Keras & TensorFlow
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import (Dense, Input, GlobalMaxPooling1D, LSTM, GRU, 
                                     Bidirectional, Dropout, BatchNormalization, SimpleRNN)
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.metrics import BinaryAccuracy, AUC
from tensorflow.keras.initializers import glorot_uniform
from tensorflow.keras.layers import Activation

# Function

In [2]:
# Fungsi Setiap Arsiterktur
# Model GRU
def make_model_GRU(look_back, n_features, units, learning_rate, num_layers, dropout_rate=0.2):
    model = Sequential()
    # Menambahkan layer GRU
    for i in range(num_layers):
        return_sequences = i < num_layers - 1  # True untuk layer bukan terakhir
        model.add(GRU(
            units=units,
            activation='relu',
            input_shape=(look_back, n_features),
            return_sequences=return_sequences,
            kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            recurrent_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            bias_regularizer=l1_l2(l1=1e-5, l2=1e-4)
        ))
        model.add(Dropout(dropout_rate))

    # Menambahkan layer Dense
    model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)))
    model.add(Dense(1, activation='sigmoid'))

    # Mengonfigurasi optimizer
    adam_optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=METRICS)
    return model

# Model LSTM
def make_model_LSTM(look_back, n_features, units, learning_rate, num_layers, dropout_rate=0.2):
    model = Sequential()
    # Menambahkan layer
    for i in range(num_layers):
        return_sequences = i < num_layers - 1  # True untuk layer bukan terakhir
        model.add(LSTM(
            units=units,
            activation='relu',
            input_shape=(look_back, n_features),
            return_sequences=return_sequences,
            kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            recurrent_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            bias_regularizer=l1_l2(l1=1e-5, l2=1e-4)
        ))
        model.add(Dropout(dropout_rate))

    # Menambahkan layer Dense
    model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)))
    model.add(Dense(1, activation='sigmoid'))

    # Mengonfigurasi optimizer
    adam_optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=METRICS)
    return model

# Model RNN
def make_model_RNN(look_back, n_features, units, learning_rate, num_layers, dropout_rate=0.2):
    model = Sequential()
    # Menambahkan layer
    for i in range(num_layers):
        return_sequences = i < num_layers - 1  # True untuk layer bukan terakhir
        model.add(SimpleRNN(
            units=units,
            activation='relu',
            input_shape=(look_back, n_features),
            return_sequences=return_sequences,
            kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            recurrent_regularizer=l1_l2(l1=1e-5, l2=1e-4),
            bias_regularizer=l1_l2(l1=1e-5, l2=1e-4)
        ))
        model.add(Dropout(dropout_rate))

    # Menambahkan layer Dense
    model.add(Dense(32, activation='relu', kernel_regularizer=l1_l2(l1=1e-5, l2=1e-4)))
    model.add(Dense(1, activation='sigmoid'))

    # Mengonfigurasi optimizer
    adam_optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam_optimizer, loss='binary_crossentropy', metrics=METRICS)
    return model

# Fungsi split data
def split_data(data, start, end):
    return data[(data.index >= start) & (data.index <= end)]

# Fungsi untuk mengubah fitur X menjadi format dengan lag untuk RNN, GRU, LSTM
def create_lagged_features_lstm(X, look_back):
    """
    Creates lagged features for LSTM, GRU, RNN
    
    Args:
    X (array-like): The input features.
    look_back (int): The number of lag days.

    Returns:
    numpy.array: Transformed features with lag.
    """
    X_lagged = []
    for i in range(look_back, len(X)):
        X_lagged.append(X[i-look_back:i, :])
    return np.array(X_lagged)

# Metrik PRC AUC
METRICS = [
    'accuracy',
    AUC(name='prc', curve='PR'),
]

# Time Steps
look_back = 5

In [6]:
# Fungsi untuk random forest dan xgboost
# Fungsi untuk mengubah fitur X menjadi format dengan lag untuk random forest dan xgboost
def create_lagged_features(X, look_back):
    X_lagged = []
    for i in range(look_back, len(X)):
        # Convert the DataFrame slice to a NumPy array before flattening
        X_lagged.append(X.iloc[i-look_back:i].to_numpy().flatten())
    return np.array(X_lagged)

# Fungsi untuk menjalankan cross-validation random forest baseline
def time_series_cv_rf(X, y, param_grid, periods):
    best_score = 0
    best_params = {}

    for params in ParameterGrid(param_grid):
        print(f"Testing parameters: {params}")
        scores = []

        for start_train, end_train, start_val, end_val in periods:
            X_train = split_data(X, start_train, end_train)
            y_train = split_data(y, start_train, end_train)
            X_val = split_data(X, start_val, end_val)
            y_val = split_data(y, start_val, end_val)

            # Buat fitur lagged 
            X_train_lagged = create_lagged_features(X_train, look_back)
            y_train_lagged = y_train[look_back:]
            X_val_lagged = create_lagged_features(X_val, look_back)
            y_val_lagged = y_val[look_back:]

            # Inisialisasi dan latih model
            try:
                model = RandomForestClassifier(**params, random_state=1501211036)
                model.fit(X_train_lagged, y_train_lagged)
            except ValueError as e:
                print(f"Error during model training: {e}")
                continue

            # Prediksi dan evaluasi
            try:
                predictions = model.predict(X_val_lagged)
                score = average_precision_score(y_val_lagged, predictions)
                scores.append(score)
            except ValueError as e:
                print(f"Error during prediction or evaluation: {e}")
                continue

        avg_score = np.mean(scores)
        print(f"Average Score for {params}: {avg_score}")

        if avg_score > best_score:
            best_score = avg_score
            best_params = params

    print(f"Best parameters: {best_params}")
    return best_params

# Fungsi untuk menjalankan cross-validation xgboost baseline
def time_series_cv_xgb(X, y, param_grid, periods):
    best_score = 0
    best_params = {}

    for params in ParameterGrid(param_grid):
        print(f"Testing parameters: {params}")
        scores = []

        for start_train, end_train, start_val, end_val in periods:
            X_train = split_data(X, start_train, end_train)
            y_train = split_data(y, start_train, end_train)
            X_val = split_data(X, start_val, end_val)
            y_val = split_data(y, start_val, end_val)

            # Buat fitur lagged 
            X_train_lagged = create_lagged_features(X_train, look_back)
            y_train_lagged = y_train[look_back:]
            X_val_lagged = create_lagged_features(X_val, look_back)
            y_val_lagged = y_val[look_back:]

            # Inisialisasi dan latih model
            try:
                model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=1501211036)
                # membuat list dataset untuk evaluasi
                eval_set = [(X_train_lagged, y_train_lagged), (X_val_lagged, y_val_lagged)]
                model.fit(X_train_lagged, y_train_lagged, eval_set=eval_set)
            except ValueError as e:
                print(f"Error during model training: {e}")
                continue

            # Prediksi dan evaluasi
            try:
                predictions = model.predict(X_val_lagged)
                score = average_precision_score(y_val_lagged, predictions)
                scores.append(score)
            except ValueError as e:
                print(f"Error during prediction or evaluation: {e}")
                continue

        avg_score = np.mean(scores)
        print(f"Average Score for {params}: {avg_score}")

        if avg_score > best_score:
            best_score = avg_score
            best_params = params

    print(f"Best parameters: {best_params}")
    return best_params

# Fungsi untuk menjalankan cross-validation random forest dengan SMOTE-ENN
def time_series_cv_rf_smote(X, y, param_grid, periods):
    best_score = 0
    best_params = {}

    for params in ParameterGrid(param_grid):
        print(f"Testing parameters: {params}")
        scores = []

        for start_train, end_train, start_val, end_val in periods:
            X_train = split_data(X, start_train, end_train)
            y_train = split_data(y, start_train, end_train)
            X_val = split_data(X, start_val, end_val)
            y_val = split_data(y, start_val, end_val)

            # resampling
            smote_enn = SMOTEENN(random_state=1501211036)
            X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)
            # Buat fitur lagged 
            X_train_lagged = create_lagged_features(X_train_resampled, look_back)
            y_train_lagged = y_train_resampled[look_back:]
            X_val_lagged = create_lagged_features(X_val, look_back)
            y_val_lagged = y_val[look_back:]

            # Inisialisasi dan latih model
            try:
                model = RandomForestClassifier(**params, random_state=1501211036)
                model.fit(X_train_lagged, y_train_lagged)
            except ValueError as e:
                print(f"Error during model training: {e}")
                continue

            # Prediksi dan evaluasi
            try:
                predictions = model.predict(X_val_lagged)
                score = average_precision_score(y_val_lagged, predictions)
                scores.append(score)
            except ValueError as e:
                print(f"Error during prediction or evaluation: {e}")
                continue

        avg_score = np.mean(scores)
        print(f"Average Score for {params}: {avg_score}")

        if avg_score > best_score:
            best_score = avg_score
            best_params = params

    print(f"Best parameters: {best_params}")
    return best_params
# Fungsi untuk menjalankan cross-validation xgboost dengan SMOTE-ENN
def time_series_cv_xgb_smote(X, y, param_grid, periods):
    best_score = 0
    best_params = {}

    for params in ParameterGrid(param_grid):
        print(f"Testing parameters: {params}")
        scores = []

        for start_train, end_train, start_val, end_val in periods:
            X_train = split_data(X, start_train, end_train)
            y_train = split_data(y, start_train, end_train)
            X_val = split_data(X, start_val, end_val)
            y_val = split_data(y, start_val, end_val)

            # resampling
            smote_enn = SMOTEENN(random_state=1501211036)
            X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

            # Buat fitur lagged 
            X_train_lagged = create_lagged_features(X_train_resampled, look_back)
            y_train_lagged = y_train_resampled[look_back:]
            X_val_lagged = create_lagged_features(X_val, look_back)
            y_val_lagged = y_val[look_back:]

            # Inisialisasi dan latih model
            try:
                model = XGBClassifier(**params, use_label_encoder=False, eval_metric='logloss', random_state=1501211036)
                # membuat list dataset untuk evaluasi
                eval_set = [(X_train_lagged, y_train_lagged), (X_val_lagged, y_val_lagged)]
                model.fit(X_train_lagged, y_train_lagged, eval_set=eval_set)
            except ValueError as e:
                print(f"Error during model training: {e}")
                continue

            # Prediksi dan evaluasi
            try:
                predictions = model.predict(X_val_lagged)
                score = average_precision_score(y_val_lagged, predictions)
                scores.append(score)
            except ValueError as e:
                print(f"Error during prediction or evaluation: {e}")
                continue

        avg_score = np.mean(scores)
        print(f"Average Score for {params}: {avg_score}")

        if avg_score > best_score:
            best_score = avg_score
            best_params = params

    print(f"Best parameters: {best_params}")
    return best_params

# Definisikan periode untuk cross-validation
periods = [
    (pd.Timestamp('2010-01-01'), pd.Timestamp('2014-12-31'), pd.Timestamp('2015-01-01'), pd.Timestamp('2019-12-31'))
]

# Data

In [7]:
# Data Loading
data = pd.read_excel('Data/df.thailand.1persen.xlsx', index_col=0)

# Data Normalization
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)

# Data Imputation with KNNImputer
imputer = KNNImputer(n_neighbors=5)  # You can change '5' to the desired number of neighbors
data_imputed = imputer.fit_transform(data_scaled)

# Converting imputed data back to DataFrame
data_imputed_df = pd.DataFrame(data_imputed, columns=data.columns, index=data.index)

# Binarize a specific column based on a threshold
# Here, replacing values in the second column based on a 0.5 threshold
data_imputed_df.iloc[:, 1:2] = (data_imputed_df.iloc[:, 1:2] >= 0.5).astype(int)

# Updating the scaled data
data_scaled = data_imputed_df

# Ensuring the index is of datetime type
data_scaled.index = pd.to_datetime(data_scaled.index)

# Extracting the year from the index and creating a 'year' column
data_scaled['year'] = data_scaled.index.year

# Calculating the binary classification proportion per year
proportion_per_year = data_scaled.groupby('year')['idn_crash'].value_counts(normalize=True).unstack()

# Separating the target variable (y) and features (X)
y = data_imputed_df['idn_crash']
X = data_imputed_df.drop('idn_crash', axis=1)

# Latih model pada periode train 
X_train_full = split_data(X, pd.Timestamp('2010-01-01'), pd.Timestamp('2014-12-31'))
y_train_full = split_data(y, pd.Timestamp('2010-01-01'), pd.Timestamp('2014-12-31'))

X_train_full_lagged = create_lagged_features_lstm(X_train_full.to_numpy(), look_back)
y_train_full_lagged = y_train_full[look_back:]

# Uji model pada periode test
X_test = split_data(X, pd.Timestamp('2015-01-01'), pd.Timestamp('2019-12-31'))
y_test = split_data(y, pd.Timestamp('2015-01-01'), pd.Timestamp('2019-12-31'))

X_test_lagged = create_lagged_features_lstm(X_test.to_numpy(), look_back)
y_test_lagged = y_test[look_back:]

# Menghitung jumlah fitur
n_features = X_train_full_lagged.shape[2]

proportion_per_year

idn_crash,0.0,1.0
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2010,1.0,
2011,0.975945,0.024055
2012,1.0,
2013,0.975945,0.024055
2014,0.996564,0.003436
2015,0.996564,0.003436
2016,0.993151,0.006849
2017,1.0,
2018,1.0,
2019,1.0,


# Baseline

## RNN

In [8]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_rnn_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_RNN(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)
 
            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_rnn_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 7.
Epoch 17: early s

## LSTM

In [9]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_lstm_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_LSTM(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)

            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_lstm_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 13.
Epoch 23: early stopping
Restoring model weights from the end of the best epoch: 5.
Epoch 15: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 15.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 7.
Epoch 17: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 7.
Epoch 17: early stopping
Restoring model weights from the end of the best epoch: 5.
Epoch 15: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 10.
Epoch 20: earl

## GRU

In [10]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_gru_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_GRU(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)

            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_gru_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 25.
Epoch 35: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 13.
Epoch 23: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 15.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 7.
Epoch 17: early stopping
Restoring model weights from the end of the best epoch: 23.
Epoch 33: early stopping
Restoring model weights from the end of the best epoch: 16.
Epoch 26: early stopping
Restoring model weights from the end of the best epoch: 3.
Epoch 13: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 7.
Epoch 17: ea

## Random Forest

In [11]:
# Mendefinisikan grid hyperparameter
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50]
}

# Jalankan cross-validation
best_params = time_series_cv_rf(X, y, param_grid, periods)

# Menyimpan best_params ke dalam file
with open('Model/tl1_rf.pkl', 'wb') as f:
    pickle.dump(best_params, f)

Testing parameters: {'max_depth': 10, 'n_estimators': 100}
Average Score for {'max_depth': 10, 'n_estimators': 100}: 0.002066115702479339
Testing parameters: {'max_depth': 10, 'n_estimators': 200}
Average Score for {'max_depth': 10, 'n_estimators': 200}: 0.002066115702479339
Testing parameters: {'max_depth': 10, 'n_estimators': 300}
Average Score for {'max_depth': 10, 'n_estimators': 300}: 0.002066115702479339
Testing parameters: {'max_depth': 10, 'n_estimators': 400}
Average Score for {'max_depth': 10, 'n_estimators': 400}: 0.002066115702479339
Testing parameters: {'max_depth': 10, 'n_estimators': 500}
Average Score for {'max_depth': 10, 'n_estimators': 500}: 0.002066115702479339
Testing parameters: {'max_depth': 20, 'n_estimators': 100}
Average Score for {'max_depth': 20, 'n_estimators': 100}: 0.002066115702479339
Testing parameters: {'max_depth': 20, 'n_estimators': 200}
Average Score for {'max_depth': 20, 'n_estimators': 200}: 0.002066115702479339
Testing parameters: {'max_depth': 

## XGBoost

In [12]:
# Mendefinisikan grid hyperparameter
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7]
}
                            
# Jalankan cross-validation
best_params = time_series_cv_xgb(X, y, param_grid, periods)

# Menyimpan best_params ke dalam file
with open('Model/tl1_xgb.pkl', 'wb') as f:
    pickle.dump(best_params, f)

Testing parameters: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100}
[0]	validation_0-logloss:0.69219	validation_1-logloss:0.69218
[1]	validation_0-logloss:0.69123	validation_1-logloss:0.69121
[2]	validation_0-logloss:0.69028	validation_1-logloss:0.69024
[3]	validation_0-logloss:0.68933	validation_1-logloss:0.68928
[4]	validation_0-logloss:0.68838	validation_1-logloss:0.68832
[5]	validation_0-logloss:0.68743	validation_1-logloss:0.68736
[6]	validation_0-logloss:0.68648	validation_1-logloss:0.68640
[7]	validation_0-logloss:0.68554	validation_1-logloss:0.68544
[8]	validation_0-logloss:0.68460	validation_1-logloss:0.68449
[9]	validation_0-logloss:0.68366	validation_1-logloss:0.68353
[10]	validation_0-logloss:0.68272	validation_1-logloss:0.68258
[11]	validation_0-logloss:0.68178	validation_1-logloss:0.68163
[12]	validation_0-logloss:0.68084	validation_1-logloss:0.68069
[13]	validation_0-logloss:0.67991	validation_1-logloss:0.67974
[14]	validation_0-logloss:0.67898	validation_1

# Imbalanced Handling With Smote

In [13]:
# Handling imbalanced data with SMOTE
smote_enn = SMOTEENN(random_state=1501211036)
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train_full, y_train_full)

X_train_full_lagged = create_lagged_features_lstm(X_train_resampled.to_numpy(), look_back)
y_train_full_lagged = y_train_resampled[look_back:]

# Menghitung jumlah fitur
n_features = X_train_full_lagged.shape[2]

## RNN

In [14]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_rnn_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_RNN(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)

            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_rnn_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 22.
Epoch 32: early stopping
Restoring model weights from the end of the best epoch: 8.
Epoch 18: early stopping
Restoring model weights from the end of the best epoch: 16.
Epoch 26: early stopping
Restoring model weights from the end of the best epoch: 20.
Epoch 30: early stopping
Restoring model weights from the end of the best epoch: 17.
Epoch 27: early stopping
Restoring model weights from the end of the best epoch: 11.
Epoch 21: early stopping
Restoring model weights from the end of the best epoch: 17.
Epoch 27: early stopping
Restoring model weights from the end of the best epoch: 20.
Epoch 30: early stopping
Restoring model weights from the end of the best epoch: 19.
Epoch 29: early stopping
Restoring model weights from the end of the best epoch: 15.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 8.
Epoch 18: early stopping
Restoring model weights from the end of the best epoch: 21.
Epoch 3

## LSTM

In [15]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_lstm_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_LSTM(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)

            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_lstm_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 14.
Epoch 24: early stopping
Restoring model weights from the end of the best epoch: 3.
Epoch 13: early stopping
Restoring model weights from the end of the best epoch: 38.
Epoch 48: early stopping
Restoring model weights from the end of the best epoch: 1.
Epoch 11: early stopping
Restoring model weights from the end of the best epoch: 22.
Epoch 32: early stopping
Restoring model weights from the end of the best epoch: 11.
Epoch 21: early stopping
Restoring model weights from the end of the best epoch: 4.
Epoch 14: early stopping
Restoring model weights from the end of the best epoch: 15.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 3.
Epoch 13: early stopping
Restoring model weights from the end of the best epoch: 14.
Epoch 24: early stopping
Restoring model weights from the end of the best epoch: 5.
Epoch 15: early stopping
Restoring model weights from the end of the best epoch: 2.
Epoch 12: e

## GRU

In [16]:
prc_scores = []
balanced_acc_scores = []
far_scores = []
hr_scores = []
histories = []
model_paths = []

import pickle
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import average_precision_score, balanced_accuracy_score, confusion_matrix

# Loop untuk melatih dan mengevaluasi model dengan variasi units dan num_layers
for units in [64, 128]:
    for dropout_rate in [0.1,0.2]:
        for i in range(10):  # 5 iterasi untuk setiap konfigurasi
            model_path = f'Model/tl1_gru_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}.h5'
            model_checkpoint = ModelCheckpoint(model_path, monitor='val_prc', mode='max', save_best_only=True, verbose=0)
            early_stopping = EarlyStopping(monitor='val_prc', patience=10, mode='max', restore_best_weights=True, verbose=3)

            # Buat dan latih model dengan konfigurasi tertentu
            model = make_model_GRU(look_back=look_back, n_features=n_features, units=units, 
                                   learning_rate=0.001, num_layers=1, dropout_rate=dropout_rate)
            history = model.fit(X_train_full_lagged, y_train_full_lagged, 
                                epochs=50, batch_size=32, 
                                validation_data=(X_test_lagged, y_test_lagged),
                                callbacks=[early_stopping, model_checkpoint],
                                verbose=0)
            histories.append(history)

            # Evaluasi model
            model.load_weights(model_path)  # Muat model terbaik dari iterasi ini
            predictions_test = model.predict(X_test_lagged)
            predictions_test = (predictions_test > 0.5).astype(int)

            prc_score = average_precision_score(y_test_lagged, predictions_test)
            balanced_acc_score = balanced_accuracy_score(y_test_lagged, predictions_test)
            cm = confusion_matrix(y_test_lagged, predictions_test)
            TP, FN, FP, TN = cm[1, 1], cm[1, 0], cm[0, 1], cm[0, 0]
            FAR = FP / (FP + TN)
            HR = TP / (TP + FN)

            prc_scores.append(prc_score)
            balanced_acc_scores.append(balanced_acc_score)
            far_scores.append(FAR)
            hr_scores.append(HR)

            # Simpan model path
            model_paths.append(model_path)
            with open(f'Model/tl1_gru_smote_units_{units}_dropout_{dropout_rate}_iteration_{i}_history.pkl', 'wb') as file:
                pickle.dump(history.history, file)
                
# Pilih model dengan PRC tertinggi
best_model_index = np.argmax(prc_scores)
best_model_path = model_paths[best_model_index]

Restoring model weights from the end of the best epoch: 22.
Epoch 32: early stopping
Restoring model weights from the end of the best epoch: 34.
Epoch 44: early stopping
Restoring model weights from the end of the best epoch: 2.
Epoch 12: early stopping
Restoring model weights from the end of the best epoch: 11.
Epoch 21: early stopping
Restoring model weights from the end of the best epoch: 17.
Epoch 27: early stopping
Restoring model weights from the end of the best epoch: 14.
Epoch 24: early stopping
Restoring model weights from the end of the best epoch: 23.
Epoch 33: early stopping
Restoring model weights from the end of the best epoch: 23.
Epoch 33: early stopping
Restoring model weights from the end of the best epoch: 15.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 13.
Epoch 23: early stopping
Restoring model weights from the end of the best epoch: 3.
Epoch 13: early stopping
Restoring model weights from the end of the best epoch: 21.
Epoch 3

## Random Forest

In [17]:

# Mendefinisikan grid hyperparameter
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [10, 20, 30, 40, 50]
}

# Jalankan cross-validation
best_params = time_series_cv_rf_smote(X, y, param_grid, periods)

# Menyimpan best_params ke dalam file
with open('Model/tl1_rf_smote.pkl', 'wb') as f:
    pickle.dump(best_params, f)

Testing parameters: {'max_depth': 10, 'n_estimators': 100}
Average Score for {'max_depth': 10, 'n_estimators': 100}: 0.0035747081201626656
Testing parameters: {'max_depth': 10, 'n_estimators': 200}
Average Score for {'max_depth': 10, 'n_estimators': 200}: 0.002715047686034146
Testing parameters: {'max_depth': 10, 'n_estimators': 300}
Average Score for {'max_depth': 10, 'n_estimators': 300}: 0.0028853005114799334
Testing parameters: {'max_depth': 10, 'n_estimators': 400}
Average Score for {'max_depth': 10, 'n_estimators': 400}: 0.0026817296487088578
Testing parameters: {'max_depth': 10, 'n_estimators': 500}
Average Score for {'max_depth': 10, 'n_estimators': 500}: 0.0026937177654881004
Testing parameters: {'max_depth': 20, 'n_estimators': 100}
Average Score for {'max_depth': 20, 'n_estimators': 100}: 0.0035747081201626656
Testing parameters: {'max_depth': 20, 'n_estimators': 200}
Average Score for {'max_depth': 20, 'n_estimators': 200}: 0.002715047686034146
Testing parameters: {'max_dep

## XGBoost

In [18]:
# Mendefinisikan grid hyperparameter
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6, 7]
}

# Jalankan cross-validation
best_params = time_series_cv_xgb_smote(X, y, param_grid, periods)

# Menyimpan best_params ke dalam file
with open('Model/tl1_xgb_smote.pkl', 'wb') as f:
    pickle.dump(best_params, f)


Testing parameters: {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 100}
[0]	validation_0-logloss:0.69215	validation_1-logloss:0.69258
[1]	validation_0-logloss:0.69116	validation_1-logloss:0.69202
[2]	validation_0-logloss:0.69017	validation_1-logloss:0.69104
[3]	validation_0-logloss:0.68918	validation_1-logloss:0.69005
[4]	validation_0-logloss:0.68819	validation_1-logloss:0.68907
[5]	validation_0-logloss:0.68721	validation_1-logloss:0.68809
[6]	validation_0-logloss:0.68623	validation_1-logloss:0.68712
[7]	validation_0-logloss:0.68524	validation_1-logloss:0.68659
[8]	validation_0-logloss:0.68427	validation_1-logloss:0.68614
[9]	validation_0-logloss:0.68329	validation_1-logloss:0.68557
[10]	validation_0-logloss:0.68231	validation_1-logloss:0.68508
[11]	validation_0-logloss:0.68134	validation_1-logloss:0.68464
[12]	validation_0-logloss:0.68037	validation_1-logloss:0.68420
[13]	validation_0-logloss:0.67940	validation_1-logloss:0.68365
[14]	validation_0-logloss:0.67843	validation_1