### Preprocessing

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

#### Load data

In [2]:
data = pd.read_csv('data.csv', sep=';')

print(f"\n✓ Đã load dữ liệu: {data.shape}")
print(f"\nPhân phối nhãn ban đầu:")
print(data['Target'].value_counts())
print(f"\nTỷ lệ (%):")
print(data['Target'].value_counts(normalize=True) * 100)


✓ Đã load dữ liệu: (4424, 37)

Phân phối nhãn ban đầu:
Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

Tỷ lệ (%):
Target
Graduate    49.932188
Dropout     32.120253
Enrolled    17.947559
Name: proportion, dtype: float64


#### Xử lý nhãn

In [3]:
# Chỉ giữ lại Graduate và Dropout, loại bỏ Enrolled
data = data[data['Target'].isin(['Graduate', 'Dropout'])].copy()

# Chuyển đổi nhãn
data['Target'] = data['Target'].map({'Graduate': 1, 'Dropout': 0})

#### Chuẩn hóa dữ liệu

In [4]:
# Chuẩn hóa các cột float
normalize_columns = [
    "Previous qualification (grade)",
    "Admission grade",
    "Unemployment rate",
    "Inflation rate",
    "GDP"
]

for column in normalize_columns:
    data[column] = (data[column] - data[column].mean()) / data[column].std()

#### Tách X và y

In [None]:
X_raw = data.iloc[:, :-1].values

print(f"✓ Shape của X (raw): {X_raw.shape}")

✓ Shape của X (raw): (3630, 36)


#### Chuẩn hóa min - max

In [6]:
def min_max_scale(X):
    """Chuẩn hóa dữ liệu về [0, 1]"""
    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    return (X - X_min) / (X_max - X_min + 1e-8)

X_scaled = min_max_scale(X_raw)

#### Thêm cột Bias

In [None]:
# Thêm bias term
X = np.hstack([np.ones((X_scaled.shape[0], 1)), X_scaled])

print(f"✓ Shape của X (sau scaling + bias): {X.shape}")
print(f"✓ X có NaN: {np.isnan(X).any()}")
print(f"✓ X có Inf: {np.isinf(X).any()}")

✓ Shape của X (sau scaling + bias): (3630, 37)
✓ X có NaN: False
✓ X có Inf: False


### Assignment 1 (4 scores):

- Use Numpy only to construct the Logistic Regression model.
- Train that Logistic Regression model dataset using the Gradient Descend approach on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset. *Note that three class in this dataset must be merge into two class as: graduate and non-graduate (dropout or enroll)*.
- Evaluate that Logistic Regression model on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Visualize the loss function of the training process.

In [9]:
class LogisticRegression:
    def __init__(self, epoch: int = 100, lr: float = 0.1, reg_lambda: float = 0.01):
        self.epoch = epoch
        self.lr = lr
        self.reg_lambda = reg_lambda
        self.losses = []
        self.accuracies = []
        self.theta = None

    def _sigmoid(self, z: np.ndarray) -> np.ndarray:
        z_clip = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clip))

    def _compute_loss(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        eps = 1e-15
        y_hat = np.clip(y_hat, eps, 1 - eps)
        n = len(y)
        loss = -np.mean(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
        if self.reg_lambda > 0:
            reg_term = (self.reg_lambda / (2 * n)) * np.sum(self.theta[1:] ** 2)
            loss += reg_term
        return loss

    def _compute_accuracy(self, y: np.ndarray, y_hat: np.ndarray) -> float:
        predictions = (y_hat >= 0.5).astype(int)
        return np.mean(predictions == y)

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n_samples, n_features = X.shape
        self.theta = np.random.randn(n_features, 1) * np.sqrt(2.0 / n_features)
        
        with tqdm(range(self.epoch), desc="Logistic Regression") as pb:
            for e in pb:
                # Forward
                z = X @ self.theta
                y_hat = self._sigmoid(z)
                
                # Backward
                grad = (1 / n_samples) * (X.T @ (y_hat - y))
                if self.reg_lambda > 0:
                    reg_grad = np.vstack([
                        np.zeros((1, 1)),
                        (self.reg_lambda / n_samples) * self.theta[1:]
                    ])
                    grad += reg_grad
                
                # Gradient clipping
                grad_norm = np.linalg.norm(grad)
                if grad_norm > 5:
                    grad = 5 * grad / grad_norm
                
                # Update
                self.theta -= self.lr * grad
                
                # Metrics
                loss = self._compute_loss(y, y_hat)
                acc = self._compute_accuracy(y, y_hat)
                
                pb.set_postfix({"loss": f"{loss:.4f}", "acc": f"{acc:.4f}"})
                
                self.losses.append(loss)
                self.accuracies.append(acc)
                
                if e > 10 and abs(self.losses[-1] - self.losses[-2]) < 1e-6:
                    break

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        z = X @ self.theta
        return self._sigmoid(z)

    def predict(self, X: np.ndarray) -> np.ndarray:
        return (self.predict_proba(X) >= 0.5).astype(int)


In [None]:
print("X shape:", getattr(X_logistic, "shape", None))
print("y shape:", getattr(y_logistic, "shape", None))
print("X dtype:", getattr(X_logistic, "dtype", None))
print("y dtype:", getattr(y_logistic, "dtype", None))

# Nếu là pandas object, xem số hàng sau khi chuyển sang numpy
import numpy as np
X_arr = np.asarray(X_logistic)
y_arr = np.asarray(y_logistic).reshape(-1, 1) if np.asarray(y_logistic).ndim == 1 else np.asarray(y_logistic)

print("X_arr shape:", X_arr.shape)
print("y_arr shape:", y_arr.shape)
if X_arr.shape[0] == 0:
    print(">>> Tập dữ liệu rỗng: X không có mẫu.")
if X_arr.shape[0] != y_arr.shape[0]:
    print(">>> Số hàng X và y KHÔNG KHỚP.")


X shape: (0, 37)
y shape: (0, 1)
X dtype: float64
y dtype: int64
X_arr shape: (0, 37)
y_arr shape: (0, 1)
>>> Tập dữ liệu rỗng: X không có mẫu.


In [None]:
print("\n--- Training Logistic Regression ---")
logistic_model = LogisticRegression(epoch=100, lr=0.1, reg_lambda=0.01)
logistic_model.fit(X, y)


--- Training Logistic Regression ---


Logistic Regression:   0%|          | 0/100 [00:00<?, ?it/s]


ZeroDivisionError: division by zero

In [None]:
y_pred = model.predict(X)
final_acc = np.mean(y_pred == y)
print(f"\n=== KẾT QUẢ CUỐI CÙNG ===")
print(f"Accuracy trên toàn bộ dữ liệu: {final_acc:.4f}")
print(f"Loss cuối cùng: {model.losses[-1]:.4f}")

# Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report
print("\nConfusion Matrix:")
print(confusion_matrix(y, y_pred))
print("\nClassification Report:")
print(classification_report(y, y_pred, target_names=['Dropout', 'Graduate']))


=== KẾT QUẢ CUỐI CÙNG ===
Accuracy trên toàn bộ dữ liệu: 0.7931
Loss cuối cùng: 0.4996

Confusion Matrix:
[[ 719  702]
 [  49 2160]]

Classification Report:
              precision    recall  f1-score   support

     Dropout       0.94      0.51      0.66      1421
    Graduate       0.75      0.98      0.85      2209

    accuracy                           0.79      3630
   macro avg       0.85      0.74      0.75      3630
weighted avg       0.83      0.79      0.78      3630



### Assignment 2 (4 scores):

- Use Numpy only to construct the Sofmax Regression model.
- Train that Logistic Regression model dataset using the Gradient Descend approach on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Evaluate that Logistic Regression model on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Visualize the loss function of the training process.

In [None]:
# Nhãn cho Softmax Regression (Multi-class: 3 classes)
print("\n--- Softmax Regression (Multi-class Classification) ---")
label_mapping = {'Graduate': 0, 'Dropout': 1, 'Enrolled': 2}
y_softmax_labels = data['Target'].map(label_mapping).values
X_softmax = X.copy()

print(f"✓ Số samples: {len(y_softmax_labels)}")
for label, idx in label_mapping.items():
    count = np.sum(y_softmax_labels == idx)
    print(f"✓ {label}: {count}")
print(f"✓ X_softmax shape: {X_softmax.shape}")

# One-hot encoding
def one_hot_encode(y, num_classes):
    n = len(y)
    y_one_hot = np.zeros((n, num_classes))
    y_one_hot[np.arange(n), y] = 1
    return y_one_hot

num_classes = 3
y_softmax = one_hot_encode(y_softmax_labels, num_classes)
print(f"✓ y_softmax shape (one-hot): {y_softmax.shape}")


--- Softmax Regression (Multi-class Classification) ---
✓ Số samples: 3630
✓ Graduate: 0
✓ Dropout: 0
✓ Enrolled: 0
✓ X_softmax shape: (3630, 37)


IndexError: arrays used as indices must be of integer (or boolean) type

In [None]:
class SoftmaxRegression:
    def __init__(self, epoch: int = 200, lr: float = 0.1, reg_lambda: float = 0.01):
        self.epoch = epoch
        self.lr = lr
        self.reg_lambda = reg_lambda
        self.losses = []
        self.accuracies = []
        self.theta = None
        
    def _softmax(self, z: np.ndarray) -> np.ndarray:
        z_shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(z_shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def _compute_loss(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        n = len(y_true)
        y_pred = np.clip(y_pred, 1e-15, 1 - 1e-15)
        loss = -np.sum(y_true * np.log(y_pred)) / n
        if self.reg_lambda > 0:
            reg_term = (self.reg_lambda / (2 * n)) * np.sum(self.theta[1:, :] ** 2)
            loss += reg_term
        return loss
    
    def _compute_accuracy(self, y_true: np.ndarray, y_pred: np.ndarray) -> float:
        y_true_labels = np.argmax(y_true, axis=1)
        y_pred_labels = np.argmax(y_pred, axis=1)
        return np.mean(y_true_labels == y_pred_labels)
    
    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        n_samples, n_features = X.shape
        num_classes = y.shape[1]
        self.theta = np.random.randn(n_features, num_classes) * 0.01
        
        with tqdm(range(self.epoch), desc="Softmax Regression") as pb:
            for e in pb:
                # Forward
                z = X @ self.theta
                y_pred = self._softmax(z)
                
                # Backward
                grad = (1 / n_samples) * (X.T @ (y_pred - y))
                if self.reg_lambda > 0:
                    reg_grad = np.vstack([
                        np.zeros((1, num_classes)),
                        (self.reg_lambda / n_samples) * self.theta[1:, :]
                    ])
                    grad += reg_grad
                
                # Gradient clipping
                grad_norm = np.linalg.norm(grad)
                if grad_norm > 5:
                    grad = 5 * grad / grad_norm
                
                # Update
                self.theta -= self.lr * grad
                
                # Metrics
                loss = self._compute_loss(y, y_pred)
                acc = self._compute_accuracy(y, y_pred)
                
                pb.set_postfix({"loss": f"{loss:.4f}", "acc": f"{acc:.4f}"})
                
                self.losses.append(loss)
                self.accuracies.append(acc)
                
                if e > 10 and abs(self.losses[-1] - self.losses[-2]) < 1e-6:
                    break
    
    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        z = X @ self.theta
        return self._softmax(z)
    
    def predict(self, X: np.ndarray) -> np.ndarray:
        proba = self.predict_proba(X)
        return np.argmax(proba, axis=1)

print("✓ Định nghĩa SoftmaxRegression class")

In [None]:
print("\n--- Training Softmax Regression ---")
softmax_model = SoftmaxRegression(epoch=200, lr=0.1, reg_lambda=0.01)
softmax_model.fit(X_softmax, y_softmax)

In [None]:
print("\n" + "-"*60)
print("SOFTMAX REGRESSION (Multi-class Classification)")
print("-"*60)

y_pred_softmax = softmax_model.predict(X_softmax)
y_true_softmax = y_softmax_labels

acc_softmax = np.mean(y_pred_softmax == y_true_softmax)
print(f"\nAccuracy: {acc_softmax:.4f} ({acc_softmax*100:.2f}%)")
print(f"Final Loss: {softmax_model.losses[-1]:.4f}")

print("\nConfusion Matrix:")
cm_softmax = confusion_matrix(y_true_softmax, y_pred_softmax)
print(cm_softmax)

print("\nClassification Report:")
class_names = ['Graduate', 'Dropout', 'Enrolled']
print(classification_report(y_true_softmax, y_pred_softmax, 
                          target_names=class_names))

### Assignment 3 (2 scores):

- Use a Machine Learning library (Scikit Learn or Skorch) to implement and evaluate the Logistic Regression on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.
- Use a Machine Learning library (Scikit Learn or Skorch) to implement and evaluate the Softmax Regression on the [Predict students’ dropout and academic success](https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success) dataset.