In [1]:
five_fold_files = {
    "week1": [
        "/kaggle/input/new-raw/clean_week1/train/5-folds/data_part_1.csv",
        "/kaggle/input/new-raw/clean_week1/train/5-folds/data_part_2.csv",
        "/kaggle/input/new-raw/clean_week1/train/5-folds/data_part_3.csv",
        "/kaggle/input/new-raw/clean_week1/train/5-folds/data_part_4.csv",
        "/kaggle/input/new-raw/clean_week1/train/5-folds/data_part_5.csv"
    ],
    "week2": [
        "/kaggle/input/new-raw/clean_week2/train/5-folds/data_part_1.csv",
        "/kaggle/input/new-raw/clean_week2/train/5-folds/data_part_2.csv",
        "/kaggle/input/new-raw/clean_week2/train/5-folds/data_part_3.csv",
        "/kaggle/input/new-raw/clean_week2/train/5-folds/data_part_4.csv",
        "/kaggle/input/new-raw/clean_week2/train/5-folds/data_part_5.csv"
    ],
    "week3": [
        "/kaggle/input/new-raw/clean_week3/train/5-folds/data_part_1.csv",
        "/kaggle/input/new-raw/clean_week3/train/5-folds/data_part_2.csv",
        "/kaggle/input/new-raw/clean_week3/train/5-folds/data_part_3.csv",
        "/kaggle/input/new-raw/clean_week3/train/5-folds/data_part_4.csv",
        "/kaggle/input/new-raw/clean_week3/train/5-folds/data_part_5.csv"
    ],
    "week4": [
        "/kaggle/input/new-raw/clean_week4/train/5-folds/data_part_1.csv",
        "/kaggle/input/new-raw/clean_week4/train/5-folds/data_part_2.csv",
        "/kaggle/input/new-raw/clean_week4/train/5-folds/data_part_3.csv",
        "/kaggle/input/new-raw/clean_week4/train/5-folds/data_part_4.csv",
        "/kaggle/input/new-raw/clean_week4/train/5-folds/data_part_5.csv"
    ]
}

file_validation = {
    "week1": [
        "/kaggle/input/new-raw/clean_week1/val/val_week1.csv"
    ],
    "week2": [
        "/kaggle/input/new-raw/clean_week2/val/val_week1_2.csv"
    ],
    "week3": [
        "/kaggle/input/new-raw/clean_week3/val/val_week1_2_3.csv"
    ],
    "week4": [
        "/kaggle/input/new-raw/clean_week4/val/val_week1_2_3_4.csv"
    ]
}

file_test = {
    "week1": [
        "/kaggle/input/new-raw/clean_week1/test/test_week1.csv"
    ],
    "week2": [
        "/kaggle/input/new-raw/clean_week2/test/test_week2.csv"
    ],
    "week3": [
        "/kaggle/input/new-raw/clean_week3/test/test_week3.csv"
    ],
    "week4": [
        "/kaggle/input/new-raw/clean_week4/test/test_week4.csv"
    ]
}

In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import time
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_recall_fscore_support
from sklearn.tree import DecisionTreeClassifier
import numpy as np

## Tìm siêu tham số tốt nhất cho từng tuần

In [3]:
# Tạo hàm train cho từng tuần
def train_week_model(week_number, file_paths_train, file_validation):
       # Đọc dữ liệu
    train_data = pd.read_csv(file_paths_train)
    val_data = pd.read_csv(file_validation)
    # Tách đặc trưng và nhãn
    X_train = train_data.drop(columns=["classification_encoded", "user_id", "course_id", "school", "enroll_time", "classification"])
    y_train = train_data["classification_encoded"]

    X_val = val_data.drop(columns=["classification_encoded", "user_id", "course_id", "school", "enroll_time", "classification"])
    y_val = val_data["classification_encoded"]
    
    # Áp dụng Over-sampling cho dữ liệu huấn luyện bằng SMOTE
    oversampler = SMOTE(sampling_strategy='auto', random_state=42)
    X_train_res, y_train_res = oversampler.fit_resample(X_train, y_train)
    
    model = DecisionTreeClassifier(random_state=42)
       # # # Khởi tạo RandomSearch tuner
    param_grid = {
        'criterion': ['gini', 'entropy'],  # Thêm lựa chọn entropy
        'max_depth': [3, 5, 7],  # Tăng độ sâu lên 7 để tăng khả năng học
        'min_samples_split': [2, 5, 10],  # Giảm giới hạn số mẫu để chia nhánh
        'min_samples_leaf': [1, 2, 5],  # Giảm yêu cầu số mẫu tối thiểu trong lá
}

    # Khởi tạo RandomSearch tuner
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=10, scoring='accuracy',
                              verbose=2, n_jobs=-1)
        
    
   # Huấn luyện mô hình với các siêu tham số tốt nhất
    grid_search.fit(X_train_res, y_train_res)

    # Đánh giá mô hình
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    # Trả về kết quả tối ưu cho tuần
    best_params = grid_search.best_params_
    return best_params

In [4]:
# Định nghĩa đường dẫn đến dữ liệu cho từng tuần
file_paths_train = {
    "week1": "/kaggle/input/new-raw/clean_week1/train/clean_data_week1.csv",
    "week2": "/kaggle/input/new-raw/clean_week2/train/clean_data_week2.csv",
    "week3": "/kaggle/input/new-raw/clean_week3/train/clean_data_week3.csv",
    "week4": "/kaggle/input/new-raw/clean_week4/train/clean_data_week4.csv"
}

file_validation = {
    "week1": "/kaggle/input/new-raw/clean_week1/val/val_week1.csv",
    "week2": "/kaggle/input/new-raw/clean_week2/val/val_week1_2.csv",
    "week3": "/kaggle/input/new-raw/clean_week3/val/val_week1_2_3.csv",
    "week4": "/kaggle/input/new-raw/clean_week4/val/val_week1_2_3_4.csv"
}

In [5]:
# Tìm tham số tốt nhất cho từng tuần
best_params_week1 = train_week_model(1, file_paths_train["week1"], file_validation["week1"])
best_params_week2 = train_week_model(2, file_paths_train["week2"], file_validation["week2"])
best_params_week3 = train_week_model(3, file_paths_train["week3"], file_validation["week3"])
best_params_week4 = train_week_model(4, file_paths_train["week4"], file_validation["week4"])

# In thông tin chi tiết các tham số tối ưu
print("Best Parameters for Week 1:")
for param_name, param_value in best_params_week1.items():
       print(f"{param_name}: {param_value}")

print("\nBest Parameters for Week 2:")
for param_name, param_value in best_params_week2.items():
       print(f"{param_name}: {param_value}")

print("\nBest Parameters for Week 3:")
for param_name, param_value in best_params_week3.items():
       print(f"{param_name}: {param_value}")

print("\nBest Parameters for Week 4:")
for param_name, param_value in best_params_week4.items():
       print(f"{param_name}: {param_value}")

Fitting 10 folds for each of 54 candidates, totalling 540 fits
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Fitting 10 folds for each of 54 candidates, totalling 540 fits
Best Parameters for Week 1:
criterion: entropy
max_depth: 7
min_samples_leaf: 2
min_samples_split: 2

Best Parameters for Week 2:
criterion: entropy
max_depth: 7
min_samples_leaf: 1
min_samples_split: 2

Best Parameters for Week 3:
criterion: entropy
max_depth: 7
min_samples_leaf: 1
min_samples_split: 2

Best Parameters for Week 4:
criterion: entropy
max_depth: 7
min_samples_leaf: 5
min_samples_split: 2


In [6]:
# Danh sách tham số tốt nhất
best_params = {
    "week1": best_params_week1,
    "week2": best_params_week2,
    "week3": best_params_week3,
    "week4": best_params_week4
}

## REPTree với các tham sốt nhất cho mỗi tuần

In [7]:
# Biến lưu kết quả tổng quát
overall_results_5folds = []

# Lặp qua từng tuần
for week, file_paths in five_fold_files.items():
    print(f"\nProcessing {week} with best parameters...")
    params = best_params[week]
    print(f"best parameters for {week}: {params}")
    
    # Biến lưu kết quả cho từng tuần
    week_results = {
        "week": week,
        "accuracy_per_fold": [],
        "precision_per_label": [],
        "recall_per_label": [],
        "f1_score_per_label": [],
        "confusion_matrices": [],
        "train_times": [],
        "test_times": []
    }

    # Lặp qua từng fold
    for i in range(len(file_paths)):
        print(f"Fold {i+1}: Using file {file_paths[i]} as test set")
        
        # Tải dữ liệu
        test_data = pd.read_csv(file_paths[i])
        train_data = pd.concat([pd.read_csv(file_paths[j]) for j in range(len(file_paths)) if j != i])
        
        # Tách X và y
        X_train = train_data.drop(columns=["classification_encoded", "user_id",
                                           "course_id", "school", "enroll_time", "classification"])
        y_train = train_data['classification_encoded']
        
        X_test = test_data.drop(columns=["classification_encoded", "user_id",
                                         "course_id", "school", "enroll_time", "classification"])
        y_test = test_data['classification_encoded']


        # Xây dựng mô hình với tham số tốt nhất
        model = DecisionTreeClassifier(**params)
        
        # Bắt đầu tính thời gian huấn luyện
        start_train = time.time()
        model.fit(X_train, y_train)
        end_train = time.time()
        
        # Bắt đầu tính thời gian kiểm thử
        start_test = time.time()
        y_pred = model.predict(X_test)
        end_test = time.time()
        
        # Tính thời gian và lưu lại
        train_time = end_train - start_train
        test_time = end_test - start_test
        week_results["train_times"].append(train_time)
        week_results["test_times"].append(test_time)

        # Đánh giá mô hình trên tập kiểm thử của fold hiện tại
        accuracy = accuracy_score(y_test, y_pred)
        week_results["accuracy_per_fold"].append(accuracy)
        # Dự đoán

        
        # Tính các chỉ số cho mỗi fold
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
        conf_matrix = confusion_matrix(y_test, y_pred)
        
        week_results["precision_per_label"].append(precision)
        week_results["recall_per_label"].append(recall)
        week_results["f1_score_per_label"].append(f1)
        week_results["confusion_matrices"].append(conf_matrix)

    # Tính trung bình cho từng nhãn
    average_precision_per_label = np.mean(week_results["precision_per_label"], axis=0)
    average_recall_per_label = np.mean(week_results["recall_per_label"], axis=0)
    average_f1_per_label = np.mean(week_results["f1_score_per_label"], axis=0)
    average_confusion_matrix = np.mean(week_results["confusion_matrices"], axis=0)
    average_train_time = sum(week_results["train_times"]) / len(week_results["train_times"])
    average_test_time = sum(week_results["test_times"]) / len(week_results["test_times"])
    
    # Tạo DataFrame cho precision, recall, f1-score
    labels = np.unique(y_test)  # Lấy nhãn từ y_test_classes
    metrics_df = pd.DataFrame({
        "Label": labels,
        "Average Precision": average_precision_per_label,
        "Average Recall": average_recall_per_label,
        "Average F1-Score": average_f1_per_label
    })
    
    # Tạo DataFrame cho confusion matrix|
    confusion_df = pd.DataFrame(average_confusion_matrix, index=labels, columns=labels)
    
    # In kết quả
    print("\n=== Average Precision, Recall, F1-Score per Label ===")
    print(metrics_df)
    print("\n=== Average Confusion Matrix ===")
    print(confusion_df)

    # Lưu kết quả tuần vào kết quả tổng quát
    week_results["average_train_times"] = average_train_time
    week_results["average_test_times"] = average_test_time
    week_results["average_metrics_df"] = metrics_df
    week_results["average_confusion_matrix"] = confusion_df
    overall_results_5folds.append(week_results)


Processing week1 with best parameters...
best parameters for week1: {'criterion': 'entropy', 'max_depth': 7, 'min_samples_leaf': 2, 'min_samples_split': 2}
Fold 1: Using file /kaggle/input/new-raw/clean_week1/train/5-folds/data_part_1.csv as test set
Fold 2: Using file /kaggle/input/new-raw/clean_week1/train/5-folds/data_part_2.csv as test set
Fold 3: Using file /kaggle/input/new-raw/clean_week1/train/5-folds/data_part_3.csv as test set
Fold 4: Using file /kaggle/input/new-raw/clean_week1/train/5-folds/data_part_4.csv as test set
Fold 5: Using file /kaggle/input/new-raw/clean_week1/train/5-folds/data_part_5.csv as test set

=== Average Precision, Recall, F1-Score per Label ===
   Label  Average Precision  Average Recall  Average F1-Score
0      0           0.722966        0.825667          0.770833
1      1           0.919409        0.520664          0.661043
2      2           0.881437        0.541382          0.667166
3      3           0.910348        0.511377          0.654021
4  

## Kết quả 5-folds

In [8]:
# Duyệt qua các tuần trong overall_results
for week_result in overall_results_5folds:
    week = week_result["week"]
    average_train_time = np.mean(week_result["train_times"])
    average_test_time = np.mean(week_result["test_times"])
    average_metrics_df = week_result["average_metrics_df"]
    average_confusion_matrix = week_result["average_confusion_matrix"]
    
    # In kết quả
    print(f"\n=== Results for {week} ===")
    print(f"Average Train Time: {average_train_time:.4f} seconds")
    print(f"Average Test Time: {average_test_time:.4f} seconds")
    print("\nAverage Precision, Recall, F1-Score per Label:")
    print(average_metrics_df)
    print("\nAverage Confusion Matrix:")
    print(average_confusion_matrix)



=== Results for week1 ===
Average Train Time: 0.0239 seconds
Average Test Time: 0.0019 seconds

Average Precision, Recall, F1-Score per Label:
   Label  Average Precision  Average Recall  Average F1-Score
0      0           0.722966        0.825667          0.770833
1      1           0.919409        0.520664          0.661043
2      2           0.881437        0.541382          0.667166
3      3           0.910348        0.511377          0.654021
4      4           0.884438        0.933141          0.908114

Average Confusion Matrix:
       0     1     2     3       4
0  495.4   1.4   4.0   2.0    97.2
1   27.4  45.6   1.0   1.4    12.2
2   44.4   2.2  89.0   2.6    26.2
3   19.2   0.2   2.2  85.4    60.0
4   98.8   0.6   5.2   2.6  1496.2

=== Results for week2 ===
Average Train Time: 0.0336 seconds
Average Test Time: 0.0020 seconds

Average Precision, Recall, F1-Score per Label:
   Label  Average Precision  Average Recall  Average F1-Score
0      0           0.734074        0.8160

## Test trên tập test

In [9]:
# Mảng lưu dữ liệu của các tuần
results = []

def process_week(week_num, best_params, results):
    print(f"\n=== Processing Week {week_num} ===")
    params = best_params[week]
    # Đường dẫn tới dữ liệu tuần tương ứng
    train_path = f"/kaggle/input/new-raw/clean_week{week_num}/train/clean_data_week{week_num}.csv"
    test_path = f"/kaggle/input/new-raw/clean_week{week_num}/test/test_week{week_num}.csv"
    
    # Load dữ liệu
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Tách X và y
    X_train = train_data.drop(columns=["classification_encoded", "user_id",
                                       "course_id", "school", "enroll_time", "classification"])
    y_train = train_data['classification_encoded']
    
    X_test = test_data.drop(columns=["classification_encoded", "user_id",
                                     "course_id", "school", "enroll_time", "classification"])
    y_test = test_data['classification_encoded']

    # Áp dụng SMOTE cho tập huấn luyện
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

    # Xây dựng mô hình với tham số tốt nhất
    model = DecisionTreeClassifier(**params)
    
    # Huấn luyện mô hình
    start_train = time.time()
    model.fit(X_train, y_train)
    end_train = time.time()
    
    # Kiểm thử mô hình
    start_test = time.time()
    y_pred = model.predict(X_test)
    end_test = time.time()
    
    # Tính thời gian huấn luyện và kiểm thử
    train_time = end_train - start_train
    test_time = end_test - start_test
    
    
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Lưu kết quả vào mảng
    results.append({
        "week": week_num,
        "train_time": train_time,
        "test_time": test_time,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "confusion_matrix": conf_matrix
    })
    
    print("\n=== Precision, Recall, F1-Score per Label ===")
    print(pd.DataFrame({
        "Label": np.unique(y_test),
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }))
    print("\n=== Confusion Matrix ===")
    print(pd.DataFrame(conf_matrix, index=np.unique(y_test), columns=np.unique(y_test)))
    
    print(f"\nTrain Time: {train_time:.2f} seconds")
    print(f"Test Time: {test_time:.2f} seconds")

In [10]:
process_week(1, best_params, results)


=== Processing Week 1 ===

=== Precision, Recall, F1-Score per Label ===
   Label  Precision    Recall  F1-Score
0      0   0.724057  0.818667  0.768461
1      1   0.789474  0.555556  0.652174
2      2   0.888889  0.543689  0.674699
3      3   0.938776  0.438095  0.597403
4      4   0.878987  0.934197  0.905752

=== Confusion Matrix ===
     0   1   2   3    4
0  307   4   4   1   59
1   16  30   0   0    8
2   25   4  56   1   17
3   12   0   2  46   45
4   64   0   1   1  937

Train Time: 0.03 seconds
Test Time: 0.00 seconds


In [11]:
process_week(2, best_params, results)


=== Processing Week 2 ===

=== Precision, Recall, F1-Score per Label ===
   Label  Precision    Recall  F1-Score
0      0   0.733333  0.821333  0.774843
1      1   0.789474  0.555556  0.652174
2      2   0.850746  0.553398  0.670588
3      3   0.905660  0.457143  0.607595
4      4   0.886064  0.938185  0.911380

=== Confusion Matrix ===
     0   1   2   3    4
0  308   4   4   1   58
1   18  30   0   0    6
2   25   4  57   3   14
3   12   0   2  48   43
4   57   0   4   1  941

Train Time: 0.04 seconds
Test Time: 0.00 seconds


In [12]:
process_week(3, best_params, results)


=== Processing Week 3 ===

=== Precision, Recall, F1-Score per Label ===
   Label  Precision    Recall  F1-Score
0      0   0.733333  0.821333  0.774843
1      1   0.789474  0.555556  0.652174
2      2   0.850746  0.553398  0.670588
3      3   0.905660  0.457143  0.607595
4      4   0.886064  0.938185  0.911380

=== Confusion Matrix ===
     0   1   2   3    4
0  308   4   4   1   58
1   18  30   0   0    6
2   25   4  57   3   14
3   12   0   2  48   43
4   57   0   4   1  941

Train Time: 0.05 seconds
Test Time: 0.00 seconds


In [13]:
process_week(4, best_params, results)


=== Processing Week 4 ===

=== Precision, Recall, F1-Score per Label ===
   Label  Precision    Recall  F1-Score
0      0   0.728132  0.821333  0.771930
1      1   0.789474  0.555556  0.652174
2      2   0.850746  0.553398  0.670588
3      3   0.905660  0.457143  0.607595
4      4   0.886686  0.936191  0.910766

=== Confusion Matrix ===
     0   1   2   3    4
0  308   4   4   1   58
1   18  30   0   0    6
2   25   4  57   3   14
3   13   0   2  48   42
4   59   0   4   1  939

Train Time: 0.06 seconds
Test Time: 0.00 seconds


In [14]:
# Hiển thị dữ liệu của các tuần
print("\n=== Summary Results for All Weeks ===")
for result in results:
    print(f"Week {result['week']}:")
    print(f"  Train Time: {result['train_time']:.2f} seconds")
    print(f"  Test Time: {result['test_time']:.2f} seconds")
    print(f"  Precision: {result['precision']}")
    print(f"  Recall: {result['recall']}")
    print(f"  F1-Score: {result['f1_score']}")
    print(f"  Confusion Matrix:\n{result['confusion_matrix']}")


=== Summary Results for All Weeks ===
Week 1:
  Train Time: 0.03 seconds
  Test Time: 0.00 seconds
  Precision: [0.7240566  0.78947368 0.88888889 0.93877551 0.87898687]
  Recall: [0.81866667 0.55555556 0.54368932 0.43809524 0.93419741]
  F1-Score: [0.76846058 0.65217391 0.6746988  0.5974026  0.90575157]
  Confusion Matrix:
[[307   4   4   1  59]
 [ 16  30   0   0   8]
 [ 25   4  56   1  17]
 [ 12   0   2  46  45]
 [ 64   0   1   1 937]]
Week 2:
  Train Time: 0.04 seconds
  Test Time: 0.00 seconds
  Precision: [0.73333333 0.78947368 0.85074627 0.90566038 0.88606403]
  Recall: [0.82133333 0.55555556 0.55339806 0.45714286 0.93818544]
  F1-Score: [0.77484277 0.65217391 0.67058824 0.60759494 0.91138015]
  Confusion Matrix:
[[308   4   4   1  58]
 [ 18  30   0   0   6]
 [ 25   4  57   3  14]
 [ 12   0   2  48  43]
 [ 57   0   4   1 941]]
Week 3:
  Train Time: 0.05 seconds
  Test Time: 0.00 seconds
  Precision: [0.73333333 0.78947368 0.85074627 0.90566038 0.88606403]
  Recall: [0.82133333 0.