## 1. Environment Setup

In [1]:
# Import thư viện cần thiết
import os
import random
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Silence every warning category so library notices do not clutter cell output.
warnings.filterwarnings(action="ignore")

# A single fixed seed shared by all random number generators, so repeated
# runs of the notebook give consistent results.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# NOTE(review): the running interpreter fixes its hash seed at startup, so this
# assignment is presumably only picked up by processes launched afterwards.
os.environ["PYTHONHASHSEED"] = f"{SEED}"
print(f"Seed: {SEED}")

Seed: 42


## 2. Data Processing

In [None]:
# Download the dataset archive from Google Drive by file id (needs the gdown
# command-line tool); the recorded run saved it as dataset.zip.
# https://drive.google.com/file/d/1T6AWCoyeC2MqGvmqZPPcA4ni5Md1k0sI/view?usp=sharing
!gdown 1T6AWCoyeC2MqGvmqZPPcA4ni5Md1k0sI


Downloading...
From: https://drive.google.com/uc?id=1T6AWCoyeC2MqGvmqZPPcA4ni5Md1k0sI
To: /content/dataset.zip
  0% 0.00/21.1k [00:00<?, ?B/s]100% 21.1k/21.1k [00:00<00:00, 59.0MB/s]


In [None]:
# Unpack the archive into splits/. The -o flag overwrites files that already
# exist without asking, so re-running this cell (Restart & Run All) does not
# stall on unzip's interactive "replace?" prompt.
!unzip -o dataset.zip


Archive:  dataset.zip
   creating: splits/
  inflating: splits/fe_train.csv     
  inflating: splits/raw_test.csv     
  inflating: splits/raw_val.csv      
  inflating: splits/fe_test.csv      
  inflating: splits/fe_val.csv       
  inflating: splits/fe_feature_names.json  
  inflating: splits/raw_train.csv    


In [None]:
def read_csv(file_path, target_col="target"):
    """Load a CSV split, preview it, and separate features from the label.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    target_col : str, default "target"
        Name of the label column to split off from the features.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``target_col``.
    y : pandas.Series
        The ``target_col`` column.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of the loaded file.
    """
    df = pd.read_csv(file_path)
    if target_col not in df.columns:
        # Fail with the file name and the available columns instead of the
        # bare KeyError that DataFrame.drop would raise.
        raise KeyError(
            f"Column {target_col!r} not found in {file_path}; "
            f"available columns: {list(df.columns)}"
        )

    # `display` is provided by the IPython/Jupyter session (rich HTML preview).
    display(df.head())

    X = df.drop(columns=[target_col])
    y = df[target_col]
    # Class distribution of the label column.
    display(y.value_counts())

    print("Shape df: ", df.shape)
    print("Shape X: ", X.shape)
    print("Shape y: ", y.shape)

    return X, y

### 2.1 Original Dataset

In [None]:
# Raw training split: X_train gets every column except "target", y_train the "target" column.
X_train, y_train = read_csv("splits/raw_train.csv") 

In [None]:
# Raw validation split, loaded the same way as the training split.
X_val, y_val = read_csv("splits/raw_val.csv")

In [None]:
# Raw test split, loaded the same way as the training split.
X_test, y_test = read_csv("splits/raw_test.csv")

### 2.2 Feature Engineering Dataset

In [None]:
# Feature-engineered training split: features in X_train_fe, "target" column in y_train_fe.
X_train_fe, y_train_fe = read_csv("splits/fe_train.csv") 

In [None]:
# Feature-engineered validation split. The archive member is splits/fe_val.csv;
# the previous path had a doubled ".csv" extension and could not be found.
X_val_fe, y_val_fe = read_csv("splits/fe_val.csv")

In [None]:
# Feature-engineered test split: features in X_test_fe, "target" column in y_test_fe.
X_test_fe, y_test_fe = read_csv("splits/fe_test.csv") 

## 3. Naive Bayes Classifier

In [None]:
def evaluate_val(X_train, y_train, X_val, y_val):
    """Fit a Gaussian Naive Bayes classifier and score it on the validation set.

    Parameters
    ----------
    X_train, y_train
        Training features and labels used to fit the model.
    X_val, y_val
        Validation features and labels used for evaluation.

    Returns
    -------
    nbc_model : GaussianNB
        The fitted classifier.
    nb_accuracy : float
        Accuracy of the classifier's predictions on the validation set.

    Side effects
    ------------
    Prints the validation accuracy and a classification report.
    """
    # Train Naive Bayes
    nbc_model = GaussianNB()
    nbc_model.fit(X_train, y_train)

    # Predict and evaluate on the validation set
    nb_pred = nbc_model.predict(X_val)
    nb_accuracy = accuracy_score(y_val, nb_pred)

    print(f"Độ chính xác Naive Bayes: {nb_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, nb_pred))
    return nbc_model, nb_accuracy


def evaluate_test(nbc_model, X_test, y_test):
    """Score an already-fitted classifier on the test set.

    Parameters
    ----------
    nbc_model
        A fitted model exposing ``predict`` (e.g. the one returned by
        ``evaluate_val``).
    X_test, y_test
        Test features and labels.

    Returns
    -------
    float
        Accuracy of the model's predictions on the test set.

    Side effects
    ------------
    Prints the test accuracy and a classification report.
    """
    # Predict and evaluate on the test set
    nb_test_pred = nbc_model.predict(X_test)
    nb_test_accuracy = accuracy_score(y_test, nb_test_pred)
    print(f"Độ chính xác Naive Bayes trên tập test: {nb_test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, nb_test_pred))
    return nb_test_accuracy


### 3.1 NBC on Original Dataset

In [None]:
# Train on the original (raw) training split, then evaluate on the raw
# validation and test splits. `accuracy` and `test_accuracy` are reused by the
# result-visualization cell.
model, accuracy = evaluate_val(X_train, y_train, X_val, y_val)
test_accuracy = evaluate_test(model, X_test, y_test)


### 3.2 NBC on Feature Engineering Dataset

In [None]:
# Train on the feature-engineered training split, then evaluate on the
# feature-engineered validation and test splits. `accuracy_fe` and
# `test_accuracy_fe` are reused by the result-visualization cell.
model_fe, accuracy_fe = evaluate_val(X_train_fe, y_train_fe, X_val_fe, y_val_fe)
test_accuracy_fe = evaluate_test(model_fe, X_test_fe, y_test_fe)


## 4. Result Visualization

In [None]:
# Global matplotlib setting: serif font for all text drawn from here on.
plt.rcParams['font.family'] = 'DejaVu Serif'

# One (validation, test) accuracy pair per dataset variant.
labels = ['Original Dataset', 'Feature Engineering Dataset']
val_accs, test_accs = [accuracy, accuracy_fe], [test_accuracy, test_accuracy_fe]

width = 0.3
x = np.arange(len(labels))  # centre position of each bar group

fig, ax = plt.subplots(figsize=(5, 5))

# Grouped bars: validation sits half a bar-width left of the group centre,
# test half a bar-width right. Both share the same black outline.
_outline = {'edgecolor': 'black', 'linewidth': 1.2}
rects1 = ax.bar(x - width/2, val_accs, width,
                label='Validation Accuracy', color='tab:blue', **_outline)
rects2 = ax.bar(x + width/2, test_accs, width,
                label='Test Accuracy', color='tab:red', **_outline)

# The y-axis is clipped to [0.5, 1.0]; a bar below 0.5 would not be visible.
ax.set_ylim(0.5, 1.0)
ax.set_ylabel('Accuracy')
ax.set_title('Acc trên tập Val và Test')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend(ncol=2, loc="upper center")


def autolabel(rects):
    """Write each bar's height, to two decimals, centred on top of the bar."""
    for bar in rects:
        h = bar.get_height()
        centre = bar.get_x() + bar.get_width() / 2
        ax.annotate(f'{h:.2f}', xy=(centre, h), ha='center', va='bottom')


autolabel(rects1)
autolabel(rects2)

# Save a raster copy (300 dpi PNG) and a vector copy (PDF), then show inline.
fig.tight_layout()
fig.savefig("accuracy_comparison.png", dpi=300, bbox_inches="tight")
fig.savefig("accuracy_comparison.pdf", bbox_inches="tight")
plt.show()
