In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# 1. Load the digits dataset and split it 60/20/20 into train/validation/test.
digits = load_digits()
X = digits.data
y = digits.target

# First cut: hold out 20% of all samples as the test set, stratified on the labels.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Second cut: 25% of the remaining 80% (0.25 * 0.8 = 0.2 of the total) becomes validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=42,
)

# Report the shape of each partition.
for split_name, split_X in (("Train:", X_train), ("Val:", X_val), ("Test:", X_test)):
    print(split_name, split_X.shape)

Train: (1077, 64)
Val: (360, 64)
Test: (360, 64)


In [3]:
# LOGISTIC REGRESSION

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

# 2. Train the model on the training set:
# (for logistic regression, the best C is picked on the validation set first,
#  and only then is the model refit on train + validation)

# 3. Tune C on the validation set

C_values = [0.01, 0.1, 1.0, 10.0, 100.0]
best_C = None
# Start below any achievable accuracy so the first candidate always sets best_C
# (with 0.0 and a strict '>', an all-zero sweep would leave best_C as None).
best_val_acc = -1.0

# Settings shared by every search model and by the final model, kept in one place
# so the two constructions cannot drift apart.
# `multi_class` is intentionally not passed: it is deprecated since scikit-learn 1.5
# (FutureWarning), and with solver='lbfgs' the default already fits a multinomial
# model on a multiclass target, so the fitted model is unchanged.
logreg_params = dict(penalty='l2', solver='lbfgs', max_iter=2000)

print("\nLogistic Regression | Optimize:")
for C in C_values:
    model = LogisticRegression(C=C, **logreg_params)
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print(f"C = {C:<6} | Validation accuracy = {val_acc:.4f}")

    # Strict '>' keeps the earliest (here: smallest) C when several values tie.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_C = C

print(f"\nBest C on validation set: {best_C}, accuracy = {best_val_acc:.4f}")

# Refit on train + validation (this is the final model)
X_train_full = np.vstack([X_train, X_val])
y_train_full = np.hstack([y_train, y_val])

print("\nLogistic Regression | Train ")

best_model = LogisticRegression(C=best_C, **logreg_params)
best_model.fit(X_train_full, y_train_full)

# 4. Test accuracy
test_pred = best_model.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

print(f"\nTest accuracy (Logistic Regression) = {test_acc:.4f}\n")
print(classification_report(y_test, test_pred))

# Macro average: unweighted mean of the per-class scores; the 4th value (support) is unused.
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, test_pred, average='macro'
)

# 5. Update CSV
# Note: "Train size" / "Validation size" are the sizes of the tuning splits;
# the final model above was fit on their union.
results_row = {
    "Model": "Logistic Regression",
    "Train size": X_train.shape[0],
    "Validation size": X_val.shape[0],
    "Test size": X_test.shape[0],
    "Hyperparameters": f"C={best_C}, penalty=L2, solver=lbfgs, multinomial",
    "Best validation accuracy": best_val_acc,
    "Test accuracy": test_acc,
    "Precision (macro)": precision_macro,
    "Recall (macro)": recall_macro,
    "F1-score (macro)": f1_macro,
    "Notes": "random_state=42, no scaling"
}

csv_path = "digits_models_results.csv"

try:
    df = pd.read_csv(csv_path)
    # Drop any previous row for this model so re-running the cell replaces
    # the result instead of appending a duplicate.
    if "Model" in df.columns:
        df = df[df["Model"] != "Logistic Regression"]
    df = pd.concat([df, pd.DataFrame([results_row])], ignore_index=True)
except FileNotFoundError:
    # First run: there is no results file yet, so start a new table.
    df = pd.DataFrame([results_row])

# utf-8-sig writes a UTF-8 byte-order mark at the start of the file.
df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"Updated to file: {csv_path}")



Logistic Regression | Optimize:
C = 0.01   | Validation accuracy = 0.9722




C = 0.1    | Validation accuracy = 0.9722
C = 1.0    | Validation accuracy = 0.9639
C = 10.0   | Validation accuracy = 0.9556
C = 100.0  | Validation accuracy = 0.9528

Best C on validation set: 0.01, accuracy = 0.9722

Logistic Regression | Train 





Test accuracy (Logistic Regression) = 0.9639

              precision    recall  f1-score   support

           0       1.00      0.97      0.99        36
           1       0.91      0.86      0.89        36
           2       1.00      1.00      1.00        35
           3       0.97      1.00      0.99        37
           4       0.92      1.00      0.96        36
           5       1.00      1.00      1.00        37
           6       1.00      0.97      0.99        36
           7       1.00      0.97      0.99        36
           8       0.86      0.91      0.89        35
           9       0.97      0.94      0.96        36

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360

Updated to file: digits_models_results.csv


In [4]:
# DECISION TREE

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# 2. Train the model on the training set:
# For a decision tree the hyperparameters (max_depth, criterion, min_samples_split)
# are also tuned on the validation set before refitting on train + validation.

# 3. Hyperparameter search on the validation set
max_depth_list = [5, 10, 20, None]
criteria = ["gini", "entropy"]
min_samples_list = [2, 5, 10]

# Flatten the grid up front. The ordering (depth outermost, min_samples_split
# innermost) matters because ties are resolved in favour of the earliest candidate.
tree_grid = [
    (d, c, s)
    for d in max_depth_list
    for c in criteria
    for s in min_samples_list
]

best_params = None
best_val_acc = 0.0

print("\nDecision Tree | Optimize:")
for depth, crit, min_s in tree_grid:
    model = DecisionTreeClassifier(
        random_state=42,
        min_samples_split=min_s,
        criterion=crit,
        max_depth=depth,
    )
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print(f"max_depth={depth}, criterion={crit}, min_samples_split={min_s} | Val Acc = {val_acc:.4f}")

    # Only a strictly better score replaces the current best.
    if val_acc > best_val_acc:
        best_val_acc, best_params = val_acc, (depth, crit, min_s)

print("\nBest params on validation:", best_params)
print("Best validation accuracy:", best_val_acc)

# 4. Refit the best configuration on train + validation
best_depth, best_crit, best_min_s = best_params

X_train_full = np.concatenate((X_train, X_val), axis=0)
y_train_full = np.concatenate((y_train, y_val))

best_tree = DecisionTreeClassifier(
    random_state=42,
    min_samples_split=best_min_s,
    criterion=best_crit,
    max_depth=best_depth,
)

print("\nDecision Tree | Train")
best_tree.fit(X_train_full, y_train_full)

# 4. Test-set evaluation
test_pred = best_tree.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

# The fourth returned value (support) is not needed.
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, test_pred, average='macro'
)

print(f"Test accuracy (Decision Tree) = {test_acc:.4f}")
for metric_label, metric_value in (
    ("Precision (macro):", precision_macro),
    ("Recall (macro):", recall_macro),
    ("F1-score (macro):", f1_macro),
):
    print(metric_label, metric_value)

print("\nClassification report:")
print(classification_report(y_test, test_pred))

# 5. Update CSV
hyperparams_text = (
    f"max_depth={best_depth}, criterion={best_crit}, "
    f"min_samples_split={best_min_s}"
)
results_row = {
    "Model": "Decision Tree",
    "Train size": len(X_train),
    "Validation size": len(X_val),
    "Test size": len(X_test),
    "Hyperparameters": hyperparams_text,
    "Best validation accuracy": best_val_acc,
    "Test accuracy": test_acc,
    "Precision (macro)": precision_macro,
    "Recall (macro)": recall_macro,
    "F1-score (macro)": f1_macro,
    "Notes": "random_state=42",
}

csv_path = "digits_models_results.csv"
new_row_df = pd.DataFrame([results_row])

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # No results file yet: the new row is the whole table.
    df = new_row_df
else:
    # Replace any earlier row for this model, then append the fresh one.
    if "Model" in df.columns:
        df = df[df["Model"] != "Decision Tree"]
    df = pd.concat([df, new_row_df], ignore_index=True)

df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\nUpdated to file: {csv_path}")



Decision Tree | Optimize:
max_depth=5, criterion=gini, min_samples_split=2 | Val Acc = 0.6667
max_depth=5, criterion=gini, min_samples_split=5 | Val Acc = 0.6667
max_depth=5, criterion=gini, min_samples_split=10 | Val Acc = 0.6611
max_depth=5, criterion=entropy, min_samples_split=2 | Val Acc = 0.7222
max_depth=5, criterion=entropy, min_samples_split=5 | Val Acc = 0.7250
max_depth=5, criterion=entropy, min_samples_split=10 | Val Acc = 0.7194
max_depth=10, criterion=gini, min_samples_split=2 | Val Acc = 0.8389
max_depth=10, criterion=gini, min_samples_split=5 | Val Acc = 0.8556
max_depth=10, criterion=gini, min_samples_split=10 | Val Acc = 0.8361
max_depth=10, criterion=entropy, min_samples_split=2 | Val Acc = 0.8167
max_depth=10, criterion=entropy, min_samples_split=5 | Val Acc = 0.8222
max_depth=10, criterion=entropy, min_samples_split=10 | Val Acc = 0.8194
max_depth=20, criterion=gini, min_samples_split=2 | Val Acc = 0.8333
max_depth=20, criterion=gini, min_samples_split=5 | Val Acc 

In [5]:
# K-NEAREST NEIGHBORS (KNN)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# 2. Train the model on the training set:
# For KNN, k (n_neighbors), weights and metric are tuned on the validation set.

# 3. Hyperparameter search on the validation set
k_values = [1, 3, 5, 7, 9, 11]
weight_options = ["uniform", "distance"]
metric_options = ["euclidean", "manhattan"]

# All (k, weights, metric) combinations, enumerated with k outermost and metric
# innermost; the earliest candidate wins when validation accuracies tie.
knn_grid = [
    (n, wt, dist)
    for n in k_values
    for wt in weight_options
    for dist in metric_options
]

best_params = None
best_val_acc = 0.0

print("\nKNN | Optimize: ")
for k, w, m in knn_grid:
    model = KNeighborsClassifier(metric=m, weights=w, n_neighbors=k)
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print(f"k={k}, weights={w}, metric={m} | Val Acc = {val_acc:.4f}")

    # Only a strictly better score replaces the current best.
    if val_acc > best_val_acc:
        best_val_acc, best_params = val_acc, (k, w, m)

print("\nBest params on validation:", best_params)
print("Best validation accuracy:", best_val_acc)

# 4. Refit the best configuration on train + validation
best_k, best_weight, best_metric = best_params

X_train_full = np.concatenate((X_train, X_val), axis=0)
y_train_full = np.concatenate((y_train, y_val))

best_knn = KNeighborsClassifier(
    metric=best_metric,
    weights=best_weight,
    n_neighbors=best_k,
)

print("KNN | Train")
best_knn.fit(X_train_full, y_train_full)

# 4. Test-set evaluation
test_pred = best_knn.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

# The fourth returned value (support) is not needed.
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, test_pred, average='macro'
)

print(f"\nTest accuracy (KNN) = {test_acc:.4f}\n")
for metric_label, metric_value in (
    ("Precision (macro):", precision_macro),
    ("Recall (macro):", recall_macro),
    ("F1-score (macro):", f1_macro),
):
    print(metric_label, metric_value)

print("\nClassification report:")
print(classification_report(y_test, test_pred))

# 5. Update CSV
hyperparams_text = f"k={best_k}, weights={best_weight}, metric={best_metric}"
results_row = {
    "Model": "KNN",
    "Train size": len(X_train),
    "Validation size": len(X_val),
    "Test size": len(X_test),
    "Hyperparameters": hyperparams_text,
    "Best validation accuracy": best_val_acc,
    "Test accuracy": test_acc,
    "Precision (macro)": precision_macro,
    "Recall (macro)": recall_macro,
    "F1-score (macro)": f1_macro,
    "Notes": "KNN không cần random_state",
}

csv_path = "digits_models_results.csv"
new_row_df = pd.DataFrame([results_row])

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # No results file yet: the new row is the whole table.
    df = new_row_df
else:
    # Replace any earlier row for this model, then append the fresh one.
    if "Model" in df.columns:
        df = df[df["Model"] != "KNN"]
    df = pd.concat([df, new_row_df], ignore_index=True)

df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\nUpdated to file: {csv_path}")



KNN | Optimize: 
k=1, weights=uniform, metric=euclidean | Val Acc = 0.9889
k=1, weights=uniform, metric=manhattan | Val Acc = 0.9833
k=1, weights=distance, metric=euclidean | Val Acc = 0.9889
k=1, weights=distance, metric=manhattan | Val Acc = 0.9833
k=3, weights=uniform, metric=euclidean | Val Acc = 0.9806
k=3, weights=uniform, metric=manhattan | Val Acc = 0.9806
k=3, weights=distance, metric=euclidean | Val Acc = 0.9806
k=3, weights=distance, metric=manhattan | Val Acc = 0.9806
k=5, weights=uniform, metric=euclidean | Val Acc = 0.9806
k=5, weights=uniform, metric=manhattan | Val Acc = 0.9806
k=5, weights=distance, metric=euclidean | Val Acc = 0.9806
k=5, weights=distance, metric=manhattan | Val Acc = 0.9806
k=7, weights=uniform, metric=euclidean | Val Acc = 0.9806
k=7, weights=uniform, metric=manhattan | Val Acc = 0.9778
k=7, weights=distance, metric=euclidean | Val Acc = 0.9806
k=7, weights=distance, metric=manhattan | Val Acc = 0.9806
k=9, weights=uniform, metric=euclidean | Val A

In [6]:
# ARTIFICIAL NEURAL NETWORK (MLP)

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# 2. Train the model on the training set:
# For the ANN (MLP) the hyperparameters (hidden_layer_sizes, activation, alpha)
# are tuned on the validation set before refitting on train + validation.

# 3. Hyperparameter search on the validation set
hidden_layer_list = [(50,), (100,), (50, 50)]
activation_list = ["relu", "tanh"]
alpha_list = [0.0001, 0.001]

# Settings that stay fixed for every candidate and for the final model.
mlp_fixed = dict(max_iter=300, random_state=42)

# All (hidden, activation, alpha) combinations, enumerated with the layer layout
# outermost and alpha innermost; the earliest candidate wins on ties.
mlp_grid = [
    (layers, fn, reg)
    for layers in hidden_layer_list
    for fn in activation_list
    for reg in alpha_list
]

best_params = None
best_val_acc = 0.0

print("ANN (MLP) | Optimize: ")
for hidden, act, alpha in mlp_grid:
    model = MLPClassifier(
        hidden_layer_sizes=hidden,
        activation=act,
        alpha=alpha,
        **mlp_fixed,
    )
    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, val_pred)
    print(f"hidden={hidden}, activation={act}, alpha={alpha} | Val Acc = {val_acc:.4f}")

    # Only a strictly better score replaces the current best.
    if val_acc > best_val_acc:
        best_val_acc, best_params = val_acc, (hidden, act, alpha)

print("\nBest params on validation:", best_params)
print("Best validation accuracy:", best_val_acc)

# 4. Refit the best configuration on train + validation
best_hidden, best_act, best_alpha = best_params

X_train_full = np.concatenate((X_train, X_val), axis=0)
y_train_full = np.concatenate((y_train, y_val))

best_mlp = MLPClassifier(
    hidden_layer_sizes=best_hidden,
    activation=best_act,
    alpha=best_alpha,
    **mlp_fixed,
)

print("ANN (MLP) | Train")
best_mlp.fit(X_train_full, y_train_full)

# 4. Test-set evaluation
test_pred = best_mlp.predict(X_test)
test_acc = accuracy_score(y_test, test_pred)

# The fourth returned value (support) is not needed.
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    y_test, test_pred, average='macro'
)

print(f"\nTest accuracy (ANN - MLP) = {test_acc:.4f}\n")
for metric_label, metric_value in (
    ("Precision (macro):", precision_macro),
    ("Recall (macro):", recall_macro),
    ("F1-score (macro):", f1_macro),
):
    print(metric_label, metric_value)

print("\nClassification report:")
print(classification_report(y_test, test_pred))

# 5. Update CSV
hyperparams_text = (
    f"hidden={best_hidden}, activation={best_act}, alpha={best_alpha}, "
    f"max_iter=300"
)
results_row = {
    "Model": "Neural Network (MLP)",
    "Train size": len(X_train),
    "Validation size": len(X_val),
    "Test size": len(X_test),
    "Hyperparameters": hyperparams_text,
    "Best validation accuracy": best_val_acc,
    "Test accuracy": test_acc,
    "Precision (macro)": precision_macro,
    "Recall (macro)": recall_macro,
    "F1-score (macro)": f1_macro,
    "Notes": "random_state=42, no scaling",
}

csv_path = "digits_models_results.csv"
new_row_df = pd.DataFrame([results_row])

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    # No results file yet: the new row is the whole table.
    df = new_row_df
else:
    # Replace any earlier row for this model, then append the fresh one.
    if "Model" in df.columns:
        df = df[df["Model"] != "Neural Network (MLP)"]
    df = pd.concat([df, new_row_df], ignore_index=True)

df.to_csv(csv_path, index=False, encoding="utf-8-sig")
print(f"\nUpdated to file: {csv_path}")


ANN (MLP) | Optimize: 
hidden=(50,), activation=relu, alpha=0.0001 | Val Acc = 0.9528
hidden=(50,), activation=relu, alpha=0.001 | Val Acc = 0.9583
hidden=(50,), activation=tanh, alpha=0.0001 | Val Acc = 0.9667
hidden=(50,), activation=tanh, alpha=0.001 | Val Acc = 0.9667
hidden=(100,), activation=relu, alpha=0.0001 | Val Acc = 0.9750
hidden=(100,), activation=relu, alpha=0.001 | Val Acc = 0.9750
hidden=(100,), activation=tanh, alpha=0.0001 | Val Acc = 0.9694
hidden=(100,), activation=tanh, alpha=0.001 | Val Acc = 0.9722
hidden=(50, 50), activation=relu, alpha=0.0001 | Val Acc = 0.9667
hidden=(50, 50), activation=relu, alpha=0.001 | Val Acc = 0.9694
hidden=(50, 50), activation=tanh, alpha=0.0001 | Val Acc = 0.9500
hidden=(50, 50), activation=tanh, alpha=0.001 | Val Acc = 0.9500

Best params on validation: ((100,), 'relu', 0.0001)
Best validation accuracy: 0.975
ANN (MLP) | Train

Test accuracy (ANN - MLP) = 0.9889

Precision (macro): 0.9889287612971824
Recall (macro): 0.988650793650793

In [9]:

# Summarise the per-model results that the model cells wrote to the shared CSV.
csv_path = "digits_models_results.csv"
df = pd.read_csv(csv_path)

print(df)

print("NHẬN XÉT (CHATGPT) \n")

# Coerce every metric used below to numeric before ranking or averaging, so a
# stray non-numeric cell becomes NaN (and is skipped) instead of breaking or
# silently distorting idxmax/idxmin. Previously only the F1 column was coerced.
for metric_col in ("Test accuracy", "F1-score (macro)"):
    df[metric_col] = pd.to_numeric(df[metric_col], errors="coerce")

# Fail with a clear message if there is nothing to rank.
if df["Test accuracy"].notna().sum() == 0:
    raise ValueError(f"No numeric 'Test accuracy' values found in {csv_path}")

# 1. Best model by test accuracy
best_model_row = df.loc[df["Test accuracy"].idxmax()]
best_model_name = best_model_row["Model"]
best_acc = best_model_row["Test accuracy"]

# 2. Worst model by test accuracy
worst_model_row = df.loc[df["Test accuracy"].idxmin()]
worst_model_name = worst_model_row["Model"]
worst_acc = worst_model_row["Test accuracy"]

# 3. Mean macro F1 across the models (NaN entries are ignored by mean()).
avg_f1 = df["F1-score (macro)"].mean()

# 4. Generate the automatic commentary.
# The model count is taken from the table rather than hard-coded, so the text
# stays correct when the CSV holds fewer or more rows than four.
n_models = len(df)
print(f"Dựa trên bảng kết quả {n_models} mô hình:\n")

print(f"- **Mô hình có độ chính xác cao nhất** là: **{best_model_name}**, với Test Accuracy = **{best_acc:.4f}**.")
print(f"- **Mô hình có độ chính xác thấp nhất** là: **{worst_model_name}**, với Test Accuracy = **{worst_acc:.4f}**.\n")

print(f"- Trung bình F1-score của các mô hình là: **{avg_f1:.4f}**.\n")

print("**Nhận xét tổng quát:**")

# NOTE(review): apart from the best-model name, the remarks below are static text
# and are not derived from the numbers in the table; revise them if results change.
print(f"""
• {best_model_name} cho kết quả tốt nhất, cho thấy mô hình này phù hợp nhất với đặc trưng của bộ dữ liệu Digits.

• Các mô hình còn lại cho hiệu quả khác nhau tùy vào bản chất:
  - Logistic Regression hoạt động ổn định, ít overfit.
  - Decision Tree dễ overfit, nên accuracy thấp hơn.
  - KNN có performance khá cao nhưng phụ thuộc mạnh vào k và metric.
  - Neural Network (MLP) có khả năng học phi tuyến tốt, nhưng cần tuning nhiều siêu tham số.

• Tùy mục tiêu ứng dụng thực tế:
  - Nếu ưu tiên **đơn giản và ổn định** → Logistic Regression hoặc KNN.
  - Nếu ưu tiên **diễn giải mô hình** → Decision Tree.
  - Nếu cần **khả năng học phi tuyến mạnh nhất** → MLP (ANN).
""")


                  Model  Train size  Validation size  Test size  \
0   Logistic Regression        1077              360        360   
1         Decision Tree        1077              360        360   
2                   KNN        1077              360        360   
3  Neural Network (MLP)        1077              360        360   

                                     Hyperparameters  \
0      C=0.01, penalty=L2, solver=lbfgs, multinomial   
1  max_depth=10, criterion=gini, min_samples_split=5   
2             k=1, weights=uniform, metric=euclidean   
3  hidden=(100,), activation=relu, alpha=0.0001, ...   

   Best validation accuracy  Test accuracy  Precision (macro)  Recall (macro)  \
0                  0.972222       0.963889           0.964482        0.963651   
1                  0.855556       0.827778           0.827349        0.826950   
2                  0.988889       0.986111           0.986575        0.985873   
3                  0.975000       0.988889           0.9889