### Preprocessing

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn.model_selection import train_test_split # Để chia bộ dữ liệu

#### Load data

In [None]:
data = pd.read_csv('data.csv', sep=';')

print(f"\nĐã load dữ liệu: {data.shape}")
print(f"\nPhân phối nhãn ban đầu:")
print(data['Target'].value_counts())
print(f"\nTỷ lệ (%):")
print(data['Target'].value_counts(normalize=True) * 100)


✓ Đã load dữ liệu: (4424, 37)

Phân phối nhãn ban đầu:
Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

Tỷ lệ (%):
Target
Graduate    49.932188
Dropout     32.120253
Enrolled    17.947559
Name: proportion, dtype: float64


#### Xử lý nhãn

In [3]:
# Chỉ giữ lại Graduate và Dropout, loại bỏ Enrolled
data = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Chuyển đổi nhãn
data['Target'] = data['Target'].map({'Graduate': 1, 'Dropout': 0})

#### Chuẩn hóa dữ liệu

In [4]:
# Chuẩn hóa các cột float
normalize_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Unemployment rate",
    "Inflation rate",
    "GDP"
]

for column in normalize_columns:
    data[column] = (data[column] - data[column].mean()) / data[column].std()

#### Tách X và y

In [None]:
X_raw = data.iloc[:, :-1].values

print(f"Shape của X (raw): {X_raw.shape}")

✓ Shape của X (raw): (3630, 36)


#### Chuẩn hóa min - max

In [6]:
def min_max_scale(X):
    """Chuẩn hóa dữ liệu về [0, 1]"""
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    return (X - X_min) / (X_max - X_min + 1e-8)

X_scaled = min_max_scale(X_raw)

#### Thêm cột Bias

In [None]:
# Thêm bias term
X = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])

print(f"Shape của X (sau scaling + bias): {X.shape}")
print(f"X có NaN: {np.isnan(X).any()}")
print(f"X có Inf: {np.isinf(X).any()}")

✓ Shape của X (sau scaling + bias): (3630, 37)
✓ X có NaN: False
✓ X có Inf: False


In [8]:
# y từ chính data đã lọc & map
y = data["Target"].to_numpy().reshape(-1, 1)

### Assignment 1 (4 scores):

- Use Numpy only to construct the Logistic Regression model.
- Train that Logistic Regression model dataset using the Gradient Descend approach on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset. *Note that three class in this dataset must be merge into two class as: graduate and non-graduate (dropout or enroll)*.
- Evaluate that Logistic Regression model on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Visualize the loss function of the training process.

#### Dữ liệu

In [9]:
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#### Train và test

In [10]:
class LogisticRegression:
    def __init__(self, epoch: int = 100, lr: float = 0.1, reg_lambda: float = 0.01):
        self.epoch = epoch
        self.lr = lr
        self.reg_lambda = reg_lambda
        self.losses = []
        self.accuracies = []
        self.theta = None

    def _sigmoid(self, z: np.ndarray) -> np.ndarray:
        z_clip = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clip))

    def _compute_loss(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        eps = 1e-15
        y_hat = np.clip(y_hat, eps, 1 - eps)
        n = len(y)
        loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
        if self.reg_lambda > 0:
            reg_term = (self.reg_lambda / (2 * n)) * np.sum(self.theta[1:] ** 2)
            loss += reg_term
        return loss

    def _compute_accuracy(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        predictions = (y_hat >= 0.5).astype(int)
        return np.mean(predictions == y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n_samples, n_features = X.shape
        self.theta = np.random.randn(n_features, 1) * np.sqrt(2.0 / n_features)
        
        with tqdm(range(self.epoch), desc="Logistic Regression") as pb:
            for e in pb:
                # Forward
                z = X @ self.theta
                y_hat = self._sigmoid(z)
                
                # Backward
                grad = (1 / n_samples) * (X.T @ (y_hat - y))
                if self.reg_lambda > 0:
                    reg_grad = np.vstack([
                        np.zeros((1, 1)),
                        (self.reg_lambda / n_samples) * self.theta[1:]
                    ])
                    grad += reg_grad
                
                # Gradient clipping
                grad_norm = np.linalg.norm(grad)
                if grad_norm > 5:
                    grad = 5 * grad / grad_norm
                
                # Update
                self.theta -= self.lr * grad
                
                # Metrics
                loss = self._compute_loss(y, y_hat)
                acc = self._compute_accuracy(y, y_hat)
                
                pb.set_postfix({"loss": f"{loss:.4f}", "acc": f"{acc:.4f}"})
                
                self.losses.append(loss)
                self.accuracies.append(acc)
                
                if e > 10 and abs(self.losses[-1] - self.losses[-2]) < 1e-6:
                    break

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        z = X @ self.theta
        return self._sigmoid(z)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return (self.predict_proba(X) >= 0.5).astype(int)


In [11]:
print("\n--- Training Logistic Regression ---")
np.random.seed(42)
logistic_model = LogisticRegression(epoch=100, lr=0.1, reg_lambda=0.01)
logistic_model.fit(X_train_log, y_train_log)


--- Training Logistic Regression ---


Logistic Regression: 100%|██████████| 100/100 [00:00<00:00, 777.15it/s, loss=0.4996, acc=0.7910]


In [12]:
# === ĐÁNH GIÁ LOGISTIC REGRESSION ===
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# 1) Dự đoán
y_pred_train = logistic_model.predict(X_train_log)
y_pred_test  = logistic_model.predict(X_test_log)

# 2) Accuracy
acc_train = np.mean(y_pred_train == y_train_log)
acc_test  = np.mean(y_pred_test  == y_test_log)

# 3) Loss
#    - loss train đã có sẵn trong logistic_model.losses[-1]
#    - tính thêm loss trên tập test để tham khảo
y_proba_test = logistic_model.predict_proba(X_test_log)
loss_train = logistic_model.losses[-1]
loss_test  = logistic_model._compute_loss(y_test_log, y_proba_test)

print("\n=== KẾT QUẢ ĐÁNH GIÁ (Logistic Regression) ===")
print(f"Accuracy (train): {acc_train:.4f}")
print(f"Accuracy (test) : {acc_test:.4f}")
print(f"Loss (train)    : {loss_train:.4f}")
print(f"Loss (test)     : {loss_test:.4f}")

# 4) Confusion Matrix + Classification Report (cần 1D vector)
y_test_1d = y_test_log.ravel()
y_pred_test_1d = y_pred_test.ravel()

print("\nConfusion Matrix (test):")
print(confusion_matrix(y_test_1d, y_pred_test_1d, labels=[0, 1]))

print("\nClassification Report (test):")
print(classification_report(y_test_1d, y_pred_test_1d, target_names=['Dropout', 'Graduate']))



=== KẾT QUẢ ĐÁNH GIÁ (Logistic Regression) ===
Accuracy (train): 0.7927
Accuracy (test) : 0.7920
Loss (train)    : 0.4996
Loss (test)     : 0.5012

Confusion Matrix (test):
[[140 144]
 [  7 435]]

Classification Report (test):
              precision    recall  f1-score   support

     Dropout       0.95      0.49      0.65       284
    Graduate       0.75      0.98      0.85       442

    accuracy                           0.79       726
   macro avg       0.85      0.74      0.75       726
weighted avg       0.83      0.79      0.77       726



### Assignment 2 (4 scores):

- Use Numpy only to construct the Sofmax Regression model.
- Train that Logistic Regression model dataset using the Gradient Descend approach on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Evaluate that Logistic Regression model on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Visualize the loss function of the training process.

#### Data

In [24]:
# y hiện là nhị phân {0,1} -> chuyển sang one-hot 2 cột
# Cột 0: Dropout, Cột 1: Graduate (tuỳ theo bạn muốn order thế nào)
K = 2
y_soft = np.eye(K)[y.reshape(-1).astype(int)]  # shape: (n_samples, 2)

In [25]:
X_train_soft, X_test_soft, y_train_soft, y_test_soft = train_test_split(X, y_soft, test_size=0.2, random_state=42, stratify=y)

#### Train và test

In [15]:
class SoftmaxRegression:
    def __init__(self, epoch: int = 200, lr: float = 0.1, reg_lambda: float = 0.01):
        self.epoch = epoch
        self.lr = lr
        self.reg_lambda = reg_lambda
        self.losses = []
        self.accuracies = []
        self.theta = None
        
    def _softmax(self, z: np.ndarray) -> np.ndarray:
        z_shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def _compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        n = len(y_true)
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        loss = -np.sum(y_true * np.log(y_pred)) / n
        if self.reg_lambda > 0:
            reg_term = (self.reg_lambda / (2 * n)) * np.sum(self.theta[1:, :] ** 2)
            loss += reg_term
        return loss
    
    def _compute_accuracy(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true_labels = np.argmax(y_true, axis=1)
        y_pred_labels = np.argmax(y_pred, axis=1)
        return np.mean(y_true_labels == y_pred_labels)
    
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n_samples, n_features = X.shape
        num_classes = y.shape[1]
        self.theta = np.random.randn(n_features, num_classes) * 0.01
        
        with tqdm(range(self.epoch), desc="Softmax Regression") as pb:
            for e in pb:
                # Forward
                z = X @ self.theta
                y_pred = self._softmax(z)
                
                # Backward
                grad = (1 / n_samples) * (X.T @ (y_pred - y))
                if self.reg_lambda > 0:
                    reg_grad = np.vstack([
                        np.zeros((1, num_classes)),
                        (self.reg_lambda / n_samples) * self.theta[1:, :]
                    ])
                    grad += reg_grad
                
                # Gradient clipping
                grad_norm = np.linalg.norm(grad)
                if grad_norm > 5:
                    grad = 5 * grad / grad_norm
                
                # Update
                self.theta -= self.lr * grad
                
                # Metrics
                loss = self._compute_loss(y, y_pred)
                acc = self._compute_accuracy(y, y_pred)
                
                pb.set_postfix({"loss": f"{loss:.4f}", "acc": f"{acc:.4f}"})
                
                self.losses.append(loss)
                self.accuracies.append(acc)
                
                if e > 10 and abs(self.losses[-1] - self.losses[-2]) < 1e-6:
                    break
    
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        z = X @ self.theta
        return self._softmax(z)
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

In [16]:
print("\n--- Training Softmax Regression ---")
np.random.seed(42)
softmax_model = SoftmaxRegression(epoch=200, lr=0.1, reg_lambda=0.01)
softmax_model.fit(X_train_soft, y_train_soft)


--- Training Softmax Regression ---


Softmax Regression: 100%|██████████| 200/200 [00:00<00:00, 591.92it/s, loss=0.3906, acc=0.8585]


In [17]:
# === ĐÁNH GIÁ SOFTMAX REGRESSION (2-class, y one-hot) ===
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# 1) Chuyển y one-hot -> nhãn 0/1
y_true_train = np.argmax(y_train_soft, axis=1)
y_true_test  = np.argmax(y_test_soft,  axis=1)

# 2) Dự đoán nhãn
y_pred_train = softmax_model.predict(X_train_soft)
y_pred_test  = softmax_model.predict(X_test_soft)

# 3) Accuracy
acc_train = np.mean(y_pred_train == y_true_train)
acc_test  = np.mean(y_pred_test  == y_true_test)

# 4) Loss
#    - loss train có trong softmax_model.losses[-1]
#    - tính thêm loss test để tham khảo
y_proba_test = softmax_model.predict_proba(X_test_soft)
loss_train = softmax_model.losses[-1]
loss_test  = softmax_model._compute_loss(
    # cần y ở dạng one-hot để tính cross-entropy
    np.eye(2)[y_true_test], 
    y_proba_test
)

print("\n=== KẾT QUẢ ĐÁNH GIÁ (Softmax Regression) ===")
print(f"Accuracy (train): {acc_train:.4f}")
print(f"Accuracy (test) : {acc_test:.4f}")
print(f"Loss (train)    : {loss_train:.4f}")
print(f"Loss (test)     : {loss_test:.4f}")

# 5) Confusion Matrix + Classification Report
print("\nConfusion Matrix (test):")
print(confusion_matrix(y_true_test, y_pred_test, labels=[0, 1]))

print("\nClassification Report (test):")
print(classification_report(y_true_test, y_pred_test, target_names=['Dropout', 'Graduate']))



=== KẾT QUẢ ĐÁNH GIÁ (Softmax Regression) ===
Accuracy (train): 0.8585
Accuracy (test) : 0.8512
Loss (train)    : 0.3906
Loss (test)     : 0.3876

Confusion Matrix (test):
[[195  89]
 [ 19 423]]

Classification Report (test):
              precision    recall  f1-score   support

     Dropout       0.91      0.69      0.78       284
    Graduate       0.83      0.96      0.89       442

    accuracy                           0.85       726
   macro avg       0.87      0.82      0.83       726
weighted avg       0.86      0.85      0.85       726



### Assignment 3 (2 scores):

- Use a Machine Learning library (Scikit Learn or Skorch) to implement and evaluate the Logistic Regression on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Use a Machine Learning library (Scikit Learn or Skorch) to implement and evaluate the Softmax Regression on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

#### Logistic Regression

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [20]:
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [21]:
# Evaluation
y_pred = log_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("\n=== KẾT QUẢ LOGISTIC REGRESSION (Binary) ===")
print(f"Accuracy: {acc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=["Dropout", "Graduate"]))


=== KẾT QUẢ LOGISTIC REGRESSION (Binary) ===
Accuracy: 0.9146

Confusion Matrix:
[[233  51]
 [ 11 431]]

Classification Report:
              precision    recall  f1-score   support

     Dropout       0.95      0.82      0.88       284
    Graduate       0.89      0.98      0.93       442

    accuracy                           0.91       726
   macro avg       0.92      0.90      0.91       726
weighted avg       0.92      0.91      0.91       726



#### Softmax Regression

In [22]:
# Train
softmax_model = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=2000)
softmax_model.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [23]:
# Evaluation
y_pred = softmax_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("\n=== KẾT QUẢ SOFTMAX REGRESSION (Multiclass) ===")
print(f"Accuracy: {acc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


=== KẾT QUẢ SOFTMAX REGRESSION (Multiclass) ===
Accuracy: 0.9187

Confusion Matrix:
[[235  49]
 [ 10 432]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.83      0.89       284
           1       0.90      0.98      0.94       442

    accuracy                           0.92       726
   macro avg       0.93      0.90      0.91       726
weighted avg       0.92      0.92      0.92       726

