In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/clean_raw_data.csv
/kaggle/input/clean_data_mean.csv
/kaggle/input/raw_data.csv
/kaggle/input/clean_data_GCN.csv
/kaggle/input/FillZero_minmax_baseline/clean_data_minmax_fill-zero.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/val/val_week1_2.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/test/test_week2.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/clean_data_week2.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/5-folds/data_part_2.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/5-folds/data_part_3.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/5-folds/data_part_4.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/5-folds/data_part_1.csv
/kaggle/input/FillZero_minmax_baseline/clean_week2/train/5-folds/data_part_5.csv
/kaggle/input/FillZero_minmax_baseline/clean_week3/val/val_week1_2_3.csv
/kaggle/input/FillZero_minmax_baseline/clean_week3/test/test_week3.csv
/kaggle/input/FillZero_minmax_bas

In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from keras_tuner import RandomSearch
from sklearn.model_selection import StratifiedKFold
import time
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support, roc_auc_score

In [3]:
# Biến global cho base path
BASE_PATH = "/kaggle/input/GCN_minmax_baseline_version4"
# Tuần và số phần fold
weeks = ['week1', 'week2', 'week3', 'week4']
fold_parts = 5

# Tạo five_fold_files
five_fold_files = {
    week: [
        f"{BASE_PATH}/clean_{week}/train/5-folds/data_part_{i}.csv"
        for i in range(1, fold_parts + 1)
    ]
    for week in weeks
}

# Tạo file_validation
file_validation = {
    'week1': [f"{BASE_PATH}/clean_week1/val/val_week1.csv"],
    'week2': [f"{BASE_PATH}/clean_week2/val/val_week1_2.csv"],
    'week3': [f"{BASE_PATH}/clean_week3/val/val_week1_2_3.csv"],
    'week4': [f"{BASE_PATH}/clean_week4/val/val_week1_2_3_4.csv"]
}

# Tạo file_test
file_test = {
    week: [f"{BASE_PATH}/clean_{week}/test/test_{week}.csv"]
    for week in weeks
}


## Tìm siêu tham số tốt nhất cho từng tuần

In [4]:
# Định nghĩa Focal Loss
def focal_loss(gamma=2., alpha=0.25):
    def focal_loss_fixed(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        cross_entropy = -y_true * K.log(y_pred)
        loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy
        return K.sum(loss, axis=-1)
    return focal_loss_fixed

# Tạo hàm train cho từng tuần
def train_week_model(week_number, file_paths_train, file_validataion):
    # Đọc dữ liệu
    train_data = pd.read_csv(file_paths_train)
    val_data = pd.read_csv(file_validataion)
    
    # Tách đặc trưng và nhãn
    X_train = train_data.drop(columns=["classification_encoded", "user_id", "course_id", "school", "enroll_time", "classification"])
    y_train = train_data["classification_encoded"]

    X_val = val_data.drop(columns=["classification_encoded", "user_id", "course_id", "school", "enroll_time", "classification"])
    y_val = val_data["classification_encoded"]
    
    # Áp dụng Over-sampling cho dữ liệu huấn luyện bằng SMOTE
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_res, y_train_res = oversampler.fit_resample(X_train, y_train)
    
    # Reshape dữ liệu cho mô hình BiLSTM
    X_train_res = X_train_res.values.reshape(X_train_res.shape[0], X_train_res.shape[1], 1)
    X_val = X_val.values.reshape(X_val.shape[0], X_val.shape[1], 1)
    
    # One-hot encode nhãn
    y_train_res = tf.keras.utils.to_categorical(y_train_res, num_classes=5)
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=5)
    
    def build_model(hp):
        inputs = tf.keras.Input(shape=(X_train_res.shape[1], 1))  # Khởi tạo đầu vào
        
        # GRU layer 1
        x = layers.GRU(
            units=hp.Int('units_1', min_value=32, max_value=256, step=32),
            return_sequences=True
        )(inputs)
        x = layers.Dropout(rate=hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1))(x)
        
        # GRU layer 2
        x = layers.GRU(
            units=hp.Int('units_2', min_value=32, max_value=256, step=32),
            return_sequences=False
        )(x)
        x = layers.Dropout(rate=hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1))(x)
        
        # Lớp đầu ra
        outputs = layers.Dense(5, activation='softmax')(x)
        
        # Khởi tạo mô hình
        model = tf.keras.Model(inputs=inputs, outputs=outputs)
        
        # Compile với Focal Loss
        model.compile(optimizer=tf.keras.optimizers.Adam(
                          learning_rate=hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG')),
                      loss=focal_loss(gamma=2., alpha=0.25),
                      metrics=['accuracy'])
        
        return model

    
    # Khởi tạo RandomSearch tuner
    tuner = RandomSearch(
        build_model,
        objective='val_accuracy',
        max_trials=10,
        executions_per_trial=1,
        directory='my_dir',
        project_name=f'bilstm_tuning_week{week_number}'
    )
    
    # Tìm kiếm siêu tham số tốt nhất
    tuner.search(X_train_res, y_train_res,
                 epochs=20,
                 validation_data=(X_val, y_val),
                 batch_size=32)
    
    # Trả về kết quả tối ưu cho tuần
    best_params = tuner.get_best_hyperparameters(num_trials=1)[0]
    return best_params

In [5]:
# Định nghĩa đường dẫn đến dữ liệu cho từng tuần
file_paths_train = {
    week: f"{BASE_PATH}/clean_{week}/train/clean_data_{week}.csv"
    for week in weeks
}

# Định nghĩa file_validation theo quy luật riêng
file_validation = {
    f"week{idx + 1}": f"{BASE_PATH}/clean_week{idx + 1}/val/val_week{'_'.join(str(i) for i in range(1, idx + 2))}.csv"
    for idx in range(len(weeks))
}

In [6]:
# Tìm tham số tốt nhất cho từng tuần
best_params_week1 = train_week_model(1, file_paths_train["week1"], file_validation["week1"])
best_params_week2 = train_week_model(2, file_paths_train["week2"], file_validation["week2"])
best_params_week3 = train_week_model(3, file_paths_train["week3"], file_validation["week3"])
best_params_week4 = train_week_model(4, file_paths_train["week4"], file_validation["week4"])

# In thông tin chi tiết các tham số tối ưu
print("Best Parameters for Week 1:")
for param_name in best_params_week1.values.keys():
    print(f"{param_name}: {best_params_week1.get(param_name)}")

print("\nBest Parameters for Week 2:")
for param_name in best_params_week2.values.keys():
    print(f"{param_name}: {best_params_week2.get(param_name)}")

print("\nBest Parameters for Week 3:")
for param_name in best_params_week3.values.keys():
    print(f"{param_name}: {best_params_week3.get(param_name)}")

print("\nBest Parameters for Week 4:")
for param_name in best_params_week4.values.keys():
    print(f"{param_name}: {best_params_week4.get(param_name)}")


Trial 10 Complete [00h 02m 51s]
val_accuracy: 0.9676632285118103

Best val_accuracy So Far: 0.992068350315094
Total elapsed time: 00h 29m 31s
Best Parameters for Week 1:
units_1: 32
dropout_1: 0.1
units_2: 128
dropout_2: 0.2
learning_rate: 0.0009139492446116529

Best Parameters for Week 2:
units_1: 96
dropout_1: 0.1
units_2: 128
dropout_2: 0.30000000000000004
learning_rate: 0.0010163111487992763

Best Parameters for Week 3:
units_1: 64
dropout_1: 0.2
units_2: 256
dropout_2: 0.30000000000000004
learning_rate: 0.0014196495433366004

Best Parameters for Week 4:
units_1: 224
dropout_1: 0.1
units_2: 32
dropout_2: 0.5
learning_rate: 0.004147731932902771


## Danh sách tham số tốt nhất của từng tuần

In [7]:
# Danh sách tham số tốt nhất
best_params = {
    "week1": best_params_week1,
    "week2": best_params_week2,
    "week3": best_params_week3,
    "week4": best_params_week4
}

In [8]:
from tensorflow.keras import layers
import tensorflow as tf
from tensorflow.keras import backend as K

# Định nghĩa Focal Loss
def focal_loss(gamma=2., alpha=0.25):
    def focal_loss_fixed(y_true, y_pred):
        y_pred = K.clip(y_pred, K.epsilon(), 1. - K.epsilon())
        cross_entropy = -y_true * K.log(y_pred)
        loss = alpha * K.pow(1 - y_pred, gamma) * cross_entropy
        return K.sum(loss, axis=-1)
    
    return focal_loss_fixed

# Xây dựng mô hình BiLSTM
def build_GRU_model(params, input_shape):
    inputs = tf.keras.Input(shape=input_shape)  # Định nghĩa đầu vào
    
    # GRU layer 1
    x = layers.GRU(
        units=params.get('units_1'),
        return_sequences=True
    )(inputs)
    x = layers.Dropout(rate=params.get('dropout_1', 0.2))(x)
    
    # GRU layer 2
    x = layers.GRU(
        units=params.get('units_2', 32),
        return_sequences=False
    )(x)
    x = layers.Dropout(rate=params.get('dropout_2', 0.2))(x)
    
    # Lớp đầu ra
    outputs = layers.Dense(5, activation='softmax')(x)
    
    # Khởi tạo mô hình
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    # Compile với Focal Loss
    model.compile(optimizer=tf.keras.optimizers.Adam(
                      learning_rate=params['learning_rate']),
                  loss=focal_loss(gamma=params.get('gamma', 2.), alpha=params.get('alpha', 0.25)),
                  metrics=['accuracy'])
    
    return model


In [9]:
# Biến lưu kết quả tổng quát
overall_results_5folds = []

# Lặp qua từng tuần
for week, file_paths in five_fold_files.items():
    print(f"\nProcessing {week} with best parameters...")
    params = best_params[week].values
    print(f"best parameters for {week}: {params}")
    
    # Biến lưu kết quả cho từng tuần
    week_results = {
        "week": week,
        "accuracy_per_fold": [],
        "precision_per_label": [],
        "recall_per_label": [],
        "f1_score_per_label": [],
        "auc_roc_per_label": [],    # AUC từng lớp
        "auc_roc_macro": [],        # AUC macro
        "auc_roc_weighted": [],     # AUC weighted (tự tính)
        "precision_macro": [],
        "recall_macro": [],
        "f1_macro": [],
        "precision_weighted": [],
        "recall_weighted": [],
        "f1_weighted": [],
        "confusion_matrices": [],
        "train_times": [],
        "test_times": []
    }

    # Lặp qua từng fold
    for i in range(len(file_paths)):
        print(f"Fold {i+1}: Using file {file_paths[i]} as test set")
        
        # Tải dữ liệu
        test_data = pd.read_csv(file_paths[i])
        train_data = pd.concat([pd.read_csv(file_paths[j]) for j in range(len(file_paths)) if j != i])
        
        # Tách X và y
        X_train = train_data.drop(columns=["classification_encoded", "user_id",
                                           "course_id", "school", "enroll_time", "classification"])
        y_train = to_categorical(train_data['classification_encoded'], num_classes=5)
        
        X_test = test_data.drop(columns=["classification_encoded", "user_id",
                                         "course_id", "school", "enroll_time", "classification"])
        y_test = to_categorical(test_data['classification_encoded'], num_classes=5)

        # Reshape dữ liệu cho LSTM
        X_train = X_train.to_numpy().reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_test = X_test.to_numpy().reshape((X_test.shape[0], 1, X_test.shape[1]))

        # Xây dựng mô hình với tham số tốt nhất
        input_shape = (X_train.shape[1], X_train.shape[2])
        model = build_GRU_model(params, input_shape)
        
        # Bắt đầu tính thời gian huấn luyện
        start_train = time.time()
        model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), batch_size=32)
        end_train = time.time()
        
        # Bắt đầu tính thời gian kiểm thử
        start_test = time.time()
        y_pred = model.predict(X_test)
        end_test = time.time()
        
        # Tính thời gian và lưu lại
        train_time = end_train - start_train
        test_time = end_test - start_test
        week_results["train_times"].append(train_time)
        week_results["test_times"].append(test_time)

        # Đánh giá mô hình trên tập kiểm thử của fold hiện tại
        _, accuracy = model.evaluate(X_test, y_test, verbose=0)
        week_results["accuracy_per_fold"].append(accuracy)
        
        # Dự đoán
        y_pred_classes = y_pred.argmax(axis=1)
        y_test_classes = y_test.argmax(axis=1)
        
        # Tính các chỉ số cho mỗi fold
        precision, recall, f1, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average=None)
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average='macro')
        precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average='weighted')
        conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)
        
        # Tính AUC-ROC
        try:
            # Tính AUC macro và theo từng lớp với OvR
            auc_macro = roc_auc_score(y_test, y_pred, multi_class="ovr", average="macro")
            auc_per_class = roc_auc_score(y_test, y_pred, multi_class="ovr", average=None)
            # Tính AUC weighted: tính trọng số theo số mẫu của từng lớp
            supports = np.bincount(y_test_classes, minlength=5)
            auc_weighted = np.sum(auc_per_class * supports) / np.sum(supports)
        except Exception as e:
            print(f"Lỗi khi tính AUC: {e}")
            auc_macro = np.nan
            auc_per_class = [np.nan] * 5
            auc_weighted = np.nan
            
        # Lưu kết quả của fold hiện tại
        week_results["precision_per_label"].append(precision)
        week_results["recall_per_label"].append(recall)
        week_results["f1_score_per_label"].append(f1)
        week_results["auc_roc_per_label"].append(auc_per_class)  # AUC từng lớp
        week_results["auc_roc_macro"].append(auc_macro)          # AUC macro
        week_results["auc_roc_weighted"].append(auc_weighted)      # AUC weighted
        week_results["confusion_matrices"].append(conf_matrix)
        week_results["precision_macro"].append(precision_macro)
        week_results["recall_macro"].append(recall_macro)
        week_results["f1_macro"].append(f1_macro)
        week_results["precision_weighted"].append(precision_weighted)
        week_results["recall_weighted"].append(recall_weighted)
        week_results["f1_weighted"].append(f1_weighted)

    # Tính trung bình cho từng nhãn
    average_precision_per_label = np.mean(week_results["precision_per_label"], axis=0)
    average_recall_per_label = np.nanmean(week_results["recall_per_label"], axis=0)
    average_f1_per_label = np.nanmean(week_results["f1_score_per_label"], axis=0)
    average_auc_per_label = np.nanmean(week_results["auc_roc_per_label"], axis=0)
    average_confusion_matrix = np.nanmean(week_results["confusion_matrices"], axis=0)
    average_train_time = sum(week_results["train_times"]) / len(week_results["train_times"])
    average_test_time = sum(week_results["test_times"]) / len(week_results["test_times"])
    average_accuracy = np.nanmean(week_results["accuracy_per_fold"])
    average_precision_macro = np.nanmean(week_results["precision_macro"])
    average_recall_macro = np.nanmean(week_results["recall_macro"])
    average_f1_macro = np.nanmean(week_results["f1_macro"])
    average_auc_macro = np.nanmean(week_results["auc_roc_macro"])
    average_precision_weighted = np.nanmean(week_results["precision_weighted"])
    average_recall_weighted = np.nanmean(week_results["recall_weighted"])
    average_f1_weighted = np.nanmean(week_results["f1_weighted"])
    average_auc_weighted = np.nanmean(week_results["auc_roc_weighted"])


    # Tạo DataFrame cho precision, recall, f1-score
    labels = np.unique(y_test_classes)  # Lấy nhãn từ y_test_classes
    metrics_df = pd.DataFrame({
        "Label": labels,
        "Average Precision": average_precision_per_label,
        "Average Recall": average_recall_per_label,
        "Average F1-Score": average_f1_per_label,
        "Average AUC": average_auc_per_label
    })
    
    # Tạo DataFrame cho confusion matrix
    confusion_df = pd.DataFrame(average_confusion_matrix, index=labels, columns=labels)
    # In kết quả Accuracy và Macro metrics
    print("\n=== Average Accuracy ===")
    print(f"{average_accuracy:.4f}")
    print("\n=== Average Macro Metrics ===")
    print(f"Macro Precision: {average_precision_macro:.4f}")
    print(f"Macro Recall: {average_recall_macro:.4f}")
    print(f"Macro F1-Score: {average_f1_macro:.4f}")
    print(f"Macro AUC-ROC: {average_auc_macro:.4f}")
    print("\n=== Average Weighted Metrics ===")
    print(f"Weighted Precision: {average_precision_weighted:.4f}")
    print(f"Weighted Recall: {average_recall_weighted:.4f}")
    print(f"Weighted F1-Score: {average_f1_weighted:.4f}")
    print(f"Weighted AUC-ROC: {average_auc_weighted:.4f}")
    print("\n=== Average Metrics per Label ===")
    print(metrics_df)
    print("\n=== Average Confusion Matrix ===")
    print(confusion_df)
    
    # Cập nhật kết quả cho tuần hiện tại
    week_results.update({
        "average_accuracy": average_accuracy,
        "average_precision_macro": average_precision_macro,
        "average_recall_macro": average_recall_macro,
        "average_f1_macro": average_f1_macro,
        "average_auc_macro": average_auc_macro,
        "average_precision_weighted": average_precision_weighted,
        "average_recall_weighted": average_recall_weighted,
        "average_f1_weighted": average_f1_weighted,
        "average_auc_weighted": average_auc_weighted,
        "average_metrics_df": metrics_df,
        "average_confusion_matrix": confusion_df,
        "average_train_times": average_train_time,
        "average_test_times": average_test_time,
    })
    overall_results_5folds.append(week_results)


Processing week1 with best parameters...
best parameters for week1: {'units_1': 32, 'dropout_1': 0.1, 'units_2': 128, 'dropout_2': 0.2, 'learning_rate': 0.0009139492446116529}
Fold 1: Using file /kaggle/input/GCN_minmax_baseline_version4/clean_week1/train/5-folds/data_part_1.csv as test set
Epoch 1/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.5724 - loss: 0.1545 - val_accuracy: 0.6885 - val_loss: 0.1126
Epoch 2/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6770 - loss: 0.1182 - val_accuracy: 0.7000 - val_loss: 0.1085
Epoch 3/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7000 - loss: 0.1099 - val_accuracy: 0.7484 - val_loss: 0.0998
Epoch 4/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7243 - loss: 0.1049 - val_accuracy: 0.7205 - val_loss: 0.0996
Epoch 5/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 2: Using file /kaggle/input/GCN_minmax_baseline_version4/clean_week4/train/5-folds/data_part_2.csv as test set
Epoch 1/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6107 - loss: 0.1465 - val_accuracy: 0.6988 - val_loss: 0.1235
Epoch 2/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.6587 - loss: 0.1264 - val_accuracy: 0.7751 - val_loss: 0.0975
Epoch 3/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.7224 - loss: 0.1087 - val_accuracy: 0.8330 - val_loss: 0.0807
Epoch 4/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7720 - loss: 0.0990 - val_accuracy: 0.8425 - val_loss: 0.0749
Epoch 5/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7683 - loss: 0.0995 - val_accuracy: 0.8445 - val_loss: 0.0688
Epoch 6/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 4: Using file /kaggle/input/GCN_minmax_baseline_version4/clean_week4/train/5-folds/data_part_4.csv as test set
Epoch 1/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.6022 - loss: 0.1461 - val_accuracy: 0.6281 - val_loss: 0.1197
Epoch 2/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6524 - loss: 0.1271 - val_accuracy: 0.6705 - val_loss: 0.1298
Epoch 3/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6840 - loss: 0.1199 - val_accuracy: 0.8040 - val_loss: 0.0938
Epoch 4/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7374 - loss: 0.1072 - val_accuracy: 0.8280 - val_loss: 0.0780
Epoch 5/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7764 - loss: 0.0927 - val_accuracy: 0.7506 - val_loss: 0.1135
Epoch 6/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Fold 5: Using file /kaggle/input/GCN_minmax_baseline_version4/clean_week4/train/5-folds/data_part_5.csv as test set
Epoch 1/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.6260 - loss: 0.1398 - val_accuracy: 0.6678 - val_loss: 0.1191
Epoch 2/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6616 - loss: 0.1279 - val_accuracy: 0.6365 - val_loss: 0.1213
Epoch 3/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.6880 - loss: 0.1197 - val_accuracy: 0.7719 - val_loss: 0.0936
Epoch 4/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7539 - loss: 0.1032 - val_accuracy: 0.8158 - val_loss: 0.0764
Epoch 5/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7879 - loss: 0.0916 - val_accuracy: 0.8337 - val_loss: 0.0723
Epoch 6/50
[1m328/328[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[

## Kết quả cross validation trên 5-folds

In [10]:
# Duyệt qua các tuần trong overall_results
for week_result in overall_results_5folds:
    week = week_result["week"]
    average_train_time = np.mean(week_result["train_times"])
    average_test_time = np.mean(week_result["test_times"])
    average_metrics_df = week_result["average_metrics_df"]
    average_accuracy = np.mean(week_results["accuracy_per_fold"])
    average_confusion_matrix = week_result["average_confusion_matrix"]
    
    # In kết quả
    print(f"\n=== Results for {week} ===")
    print(f"Average Accurancy: {average_accuracy}")
    print(f"Average Train Time: {average_train_time:.4f} seconds")
    print(f"Average Test Time: {average_test_time:.4f} seconds")
    print(f"Average AUC Macro: {average_auc_macro}")
    print(f"Average AUC Weighted: {average_auc_weighted}")
    print("\nAverage Precision, Recall, F1-Score, AUC-ROC per Label:")
    print(average_metrics_df)
    print("\nAverage Confusion Matrix:")
    print(average_confusion_matrix)



=== Results for week1 ===
Average Accurancy: 0.9000896215438843
Average Train Time: 97.4689 seconds
Average Test Time: 0.5083 seconds
Average AUC Macro: 0.9608324136695947
Average AUC Weighted: 0.9850517354169097

Average Precision, Recall, F1-Score, AUC-ROC per Label:
   Label  Average Precision  Average Recall  Average F1-Score  Average AUC
0      0           0.716406        0.713667          0.713427     0.920054
1      1           0.841412        0.300731          0.399239     0.908544
2      2           0.741510        0.486571          0.580972     0.899078
3      3           0.691797        0.588024          0.631970     0.924109
4      4           0.875091        0.944487          0.907863     0.955513

Average Confusion Matrix:
       0     1     2     3       4
0  428.2   4.8  14.4  18.2   134.4
1   36.4  26.4   1.2   4.4    19.2
2   49.0   1.2  80.0   6.4    27.8
3   27.8   0.8   3.0  98.2    37.2
4   59.8   2.0  11.6  15.6  1514.4

=== Results for week2 ===
Average Accuran

## Kiểm tra trên tập test

In [11]:
# Mảng lưu dữ liệu của các tuần
results = []

def process_week(week_num, best_params, results):
    print(f"\n=== Processing Week {week_num} ===")
    params = best_params[f"week{week_num}"].values
    # Đường dẫn tới dữ liệu tuần tương ứng
    train_path = f"{BASE_PATH}/clean_week{week_num}/train/clean_data_week{week_num}.csv"
    test_path = f"{BASE_PATH}/clean_week{week_num}/test/test_week{week_num}.csv"
    
    # Load dữ liệu
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Tách X và y
    X_train = train_data.drop(columns=["classification_encoded", "user_id",
                                       "course_id", "school", "enroll_time", "classification"])
    y_train = train_data['classification_encoded']
    
    X_test = test_data.drop(columns=["classification_encoded", "user_id",
                                     "course_id", "school", "enroll_time", "classification"])
    y_test = test_data['classification_encoded']

    # Áp dụng SMOTE cho tập huấn luyện
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Chuyển đổi nhãn sang dạng one-hot
    y_train_resampled = to_categorical(y_train_resampled, num_classes=5)
    y_test = to_categorical(y_test, num_classes=5)

    # Reshape dữ liệu cho LSTM
    X_train_resampled = X_train_resampled.to_numpy().reshape((X_train_resampled.shape[0], 1, X_train_resampled.shape[1]))
    X_test = X_test.to_numpy().reshape((X_test.shape[0], 1, X_test.shape[1]))

    # Xây dựng mô hình với tham số tốt nhất
    input_shape = (X_train_resampled.shape[1], X_train_resampled.shape[2])
    model = build_GRU_model(params, input_shape)
    
    # Huấn luyện mô hình
    start_train = time.time()
    model.fit(X_train_resampled, y_train_resampled, epochs=50, validation_split=0.1, batch_size=32)
    end_train = time.time()
    
    # Kiểm thử mô hình
    start_test = time.time()
    y_pred = model.predict(X_test)
    end_test = time.time()
    
    # Tính thời gian huấn luyện và kiểm thử
    train_time = end_train - start_train
    test_time = end_test - start_test
    
    # Đánh giá mô hình
    y_pred_classes = y_pred.argmax(axis=1)
    y_test_classes = y_test.argmax(axis=1)
    
    # Tính các chỉ số Precision, Recall, F1 cho từng lớp và macro
    precision, recall, f1, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average=None)
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average='macro')
    precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(y_test_classes, y_pred_classes, average='weighted')
    conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)
    accuracy = accuracy_score(y_test_classes, y_pred_classes)
    
    # Tính AUC-ROC (với one-vs-rest)
    try:
        auc_macro = roc_auc_score(y_test, y_pred, multi_class="ovr", average="macro")
        auc_per_class = roc_auc_score(y_test, y_pred, multi_class="ovr", average=None)
        # Tính AUC weighted tự tính theo trọng số mẫu của từng lớp
        supports = np.bincount(y_test_classes, minlength=5)
        auc_weighted = np.sum(auc_per_class * supports) / np.sum(supports)
    except Exception as e:
        print(f"Lỗi khi tính AUC: {e}")
        auc_macro = np.nan
        auc_per_class = [np.nan] * 5
        auc_weighted = np.nan

    # Lưu kết quả vào mảng
    results.append({
        "week": week_num,
        "train_time": train_time,
        "test_time": test_time,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "accuracy": accuracy,
        "precision_macro": precision_macro,
        "recall_macro": recall_macro,
        "f1_macro": f1_macro,
        "precision_weighted": precision_weighted,
        "recall_weighted": recall_weighted,
        "f1_weighted": f1_weighted,
        "auc_macro": auc_macro,
        "auc_weighted": auc_weighted,
        "auc_per_class": auc_per_class,
        "confusion_matrix": conf_matrix
    })
    
    # In kết quả chi tiết
    print("\n=== Precision, Recall, F1-Score per Label ===")
    print(pd.DataFrame({
        "Label": np.unique(y_test_classes),
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }))

    print("\n=== Macro Averages & Accuracy ===")
    print(f"Macro Precision: {precision_macro:.4f}")
    print(f"Macro Recall: {recall_macro:.4f}")
    print(f"Macro F1-Score: {f1_macro:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    
    print("\n=== Weighted Averages ===")
    print(f"Weighted Precision: {precision_weighted:.4f}")
    print(f"Weighted Recall: {recall_weighted:.4f}")
    print(f"Weighted F1-Score: {f1_weighted:.4f}")
    
    print("\n=== AUC-ROC ===")
    print(f"AUC Macro: {auc_macro:.4f}")
    print(f"AUC Weighted: {auc_weighted:.4f}")
    print(f"AUC per Label: {auc_per_class}")
    
    print("\n=== Confusion Matrix ===")
    print(pd.DataFrame(conf_matrix, index=np.unique(y_test_classes), columns=np.unique(y_test_classes)))
    
    print(f"\nTrain Time: {train_time:.2f} seconds")
    print(f"Test Time: {test_time:.2f} seconds")

In [12]:
process_week(1, best_params, results)


=== Processing Week 1 ===
Epoch 1/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 5ms/step - accuracy: 0.4440 - loss: 0.2048 - val_accuracy: 0.4268 - val_loss: 0.3206
Epoch 2/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5448 - loss: 0.1737 - val_accuracy: 0.6169 - val_loss: 0.2365
Epoch 3/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.5991 - loss: 0.1561 - val_accuracy: 0.5944 - val_loss: 0.2097
Epoch 4/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6237 - loss: 0.1463 - val_accuracy: 0.5448 - val_loss: 0.2395
Epoch 5/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6361 - loss: 0.1405 - val_accuracy: 0.5455 - val_loss: 0.2183
Epoch 6/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6496 - loss: 0.1348 - val_accuracy: 0.5700 - val_lo

In [13]:
process_week(2, best_params, results)


=== Processing Week 2 ===
Epoch 1/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 6ms/step - accuracy: 0.4909 - loss: 0.1928 - val_accuracy: 0.4772 - val_loss: 0.2817
Epoch 2/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.6230 - loss: 0.1420 - val_accuracy: 0.6391 - val_loss: 0.1977
Epoch 3/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.7100 - loss: 0.1028 - val_accuracy: 0.6460 - val_loss: 0.1633
Epoch 4/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.7573 - loss: 0.0824 - val_accuracy: 0.7109 - val_loss: 0.1176
Epoch 5/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6ms/step - accuracy: 0.7865 - loss: 0.0727 - val_accuracy: 0.6116 - val_loss: 0.1448
Epoch 6/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7957 - loss: 0.0682 - val_accuracy: 0.6316 - val_lo

In [14]:
process_week(3, best_params, results)


=== Processing Week 3 ===
Epoch 1/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - accuracy: 0.4910 - loss: 0.1928 - val_accuracy: 0.4298 - val_loss: 0.2843
Epoch 2/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6061 - loss: 0.1471 - val_accuracy: 0.6685 - val_loss: 0.1806
Epoch 3/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.7052 - loss: 0.1088 - val_accuracy: 0.6979 - val_loss: 0.1171
Epoch 4/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.7768 - loss: 0.0805 - val_accuracy: 0.7790 - val_loss: 0.0929
Epoch 5/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8064 - loss: 0.0691 - val_accuracy: 0.6845 - val_loss: 0.1098
Epoch 6/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.8228 - loss: 0.0634 - val_accuracy: 0.7847 - val_lo

In [15]:
process_week(4, best_params, results)


=== Processing Week 4 ===
Epoch 1/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 0.4135 - loss: 0.2136 - val_accuracy: 0.6171 - val_loss: 0.2506
Epoch 2/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.4807 - loss: 0.1891 - val_accuracy: 0.4709 - val_loss: 0.2631
Epoch 3/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5296 - loss: 0.1692 - val_accuracy: 0.0706 - val_loss: 0.3387
Epoch 4/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5309 - loss: 0.1600 - val_accuracy: 0.5927 - val_loss: 0.2216
Epoch 5/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.5444 - loss: 0.1544 - val_accuracy: 0.6406 - val_loss: 0.1948
Epoch 6/50
[1m1128/1128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - accuracy: 0.5503 - loss: 0.1544 - val_accuracy: 0.5121 - val_lo

In [16]:
# Hiển thị dữ liệu của các tuần
print("\n=== Summary Results for All Weeks ===")
for result in results:
    print(f"Week {result['week']}:")
    print(f"  Train Time: {result['train_time']:.2f} seconds")
    print(f"  Test Time: {result['test_time']:.2f} seconds")
    print(f"  Accurancy: {result['accuracy']}")
    print(f"  Precision: {result['precision']}")
    print(f"  Recall: {result['recall']}")
    print(f"  F1-Score: {result['f1_score']}")
    print(f"  Macro Precision: {result['precision_macro']}")
    print(f"  Macro Recall: {result['recall_macro']}")
    print(f"  Macro F1-Score: {result['f1_macro']}")
    print(f"  Confusion Matrix:\n{result['confusion_matrix']}")
    print("\n=== AUC-ROC ===")
    print(f"AUC Macro: {auc_macro:.4f}")
    print(f"AUC Weighted: {auc_weighted:.4f}")
    print(f"AUC per Label: {auc_per_class}")


=== Summary Results for All Weeks ===
Week 1:
  Train Time: 316.54 seconds
  Test Time: 0.45 seconds
  Accurancy: 0.8164634146341463
  Precision: [0.75956284 0.42727273 0.46994536 0.83333333 0.95570321]
  Recall: [0.74133333 0.87037037 0.83495146 0.61904762 0.86041874]
  F1-Score: [0.75033738 0.57317073 0.6013986  0.71038251 0.90556139]
  Macro Precision: 0.6891634937689074
  Macro Recall: 0.7852243045661392
  Macro F1-Score: 0.708170122756627
  Confusion Matrix:
[[278  28  41   2  26]
 [  3  47   2   0   2]
 [  6   4  86   2   5]
 [  5   7  21  65   7]
 [ 74  24  33   9 863]]

=== AUC-ROC ===
AUC Macro: 0.9830
AUC Weighted: 0.9931
AUC per Label: [0.98846522 0.97634893 0.97270238 0.97792846 0.99939392]
Week 2:
  Train Time: 303.73 seconds
  Test Time: 0.45 seconds
  Accurancy: 0.8987804878048781
  Precision: [0.73031496 0.9137931  0.92929293 0.91666667 0.98976109]
  Recall: [0.98933333 0.98148148 0.89320388 0.83809524 0.86739781]
  F1-Score: [0.8403171  0.94642857 0.91089109 0.8756218