In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from collections import Counter

# =========================
# Load and preprocess data
# =========================
# Read the raw drug dataset from disk.
df = pd.read_csv("../lab7/drug_200.csv")

# Predictors: every column except the target, with categorical columns
# expanded into indicator columns (the first level of each is dropped).
X = pd.get_dummies(df.drop(columns=['Drug']), drop_first=True)

# Target: class names in the 'Drug' column mapped to integer codes.
y = LabelEncoder().fit_transform(df['Drug'])

# Rescale each feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on ALL rows here, before the
# cross-validation splits are made further down, so every held-out fold
# contributes to the scaling statistics (mild data leakage). Confirm whether
# that is acceptable for this comparison.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# ===============================
# Softmax Logistic Regression (Multi-class)
# ===============================
class SoftmaxRegressionScratch:
    """Multi-class (softmax) logistic regression trained by batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to every gradient update.
    epochs : int
        Number of full-batch gradient steps performed by ``fit``.
    reg_type : {None, "l1", "l2", "elastic"}
        Penalty added to the weight gradient. Any other value leaves the
        gradient unpenalised. The bias is never penalised.
    lam : float
        Regularisation strength.
    l1_ratio : float
        Weight of the L1 term in the elastic-net mix; only read when
        ``reg_type == "elastic"``.

    Attributes set by ``fit``
    -------------------------
    W : ndarray of shape (n_features, n_classes)
    b : ndarray of shape (1, n_classes)
    classes_ : ndarray of the sorted distinct labels seen during training;
        column ``j`` of ``W``/``b`` belongs to ``classes_[j]``.
    """

    def __init__(self, lr=0.1, epochs=1000, reg_type=None, lam=0.01, l1_ratio=0.5):
        self.lr = lr
        self.epochs = epochs
        self.reg_type = reg_type
        self.lam = lam
        self.l1_ratio = l1_ratio

    def softmax(self, z):
        """Return row-wise softmax probabilities for a 2-D array of logits."""
        # Subtracting the row maximum keeps np.exp from overflowing without
        # changing the resulting probabilities.
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        """Learn ``W`` and ``b`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of length n_samples
            Class labels. They do not need to be the contiguous integers
            ``0..k-1``; any sortable label values are accepted.

        Raises
        ------
        ValueError
            If ``X`` is empty or ``X`` and ``y`` disagree on the sample count.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} samples but y has {y.shape[0]} labels"
            )

        # Map the labels to column indices 0..n_classes-1. Indexing the one-hot
        # matrix with the raw label values breaks whenever a label is >= the
        # number of distinct labels (e.g. a fold that is missing a class).
        self.classes_, y_idx = np.unique(y, return_inverse=True)
        n_classes = self.classes_.shape[0]

        self.W = np.zeros((n_features, n_classes))
        self.b = np.zeros((1, n_classes))

        # One-hot encode y
        y_onehot = np.zeros((n_samples, n_classes))
        y_onehot[np.arange(n_samples), y_idx] = 1

        for _ in range(self.epochs):
            logits = np.dot(X, self.W) + self.b
            probs = self.softmax(logits)
            error = probs - y_onehot

            # Gradient of the mean cross-entropy loss
            grad_W = np.dot(X.T, error) / n_samples
            grad_b = np.mean(error, axis=0, keepdims=True)

            # Regularization (weights only)
            if self.reg_type == "l1":  # Lasso
                grad_W += self.lam * np.sign(self.W)
            elif self.reg_type == "l2":  # Ridge
                grad_W += self.lam * self.W
            elif self.reg_type == "elastic":
                grad_W += self.lam * (
                    self.l1_ratio * np.sign(self.W)
                    + (1 - self.l1_ratio) * self.W
                )

            # Update weights
            self.W -= self.lr * grad_W
            self.b -= self.lr * grad_b

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        logits = np.dot(np.asarray(X, dtype=float), self.W) + self.b
        probs = self.softmax(logits)
        winners = np.argmax(probs, axis=1)
        # Fall back to raw column indices when W/b were assigned by hand and
        # no label mapping exists.
        classes = getattr(self, "classes_", None)
        return winners if classes is None else classes[winners]

# ===============================
# KNN Classifier (from scratch)
# ===============================
class KNNClassifierScratch:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.
        Must be at least 1.
    """

    def __init__(self, k=3):
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Store the training samples; no model parameters are learned.

        Inputs are converted to NumPy arrays so that later lookups are
        positional. Features are stored as floats so that the distance
        arithmetic cannot wrap around on unsigned-integer input.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y).ravel()

    def predict(self, X):
        """Return the majority label among the ``k`` nearest training samples
        for every row of ``X``.

        When several labels are tied for the majority, the one encountered
        first while walking the neighbours from nearest to farthest wins.
        """
        preds = []
        for x in np.asarray(X, dtype=float):
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
            k_idx = np.argsort(distances)[:self.k]
            votes = Counter(self.y_train[k_idx])
            preds.append(votes.most_common(1)[0][0])
        return np.array(preds)

# ===============================
# Cross-validation evaluation
# ===============================
def cross_val_evaluate(model_class, X, y, folds=5, **kwargs):
    """Score a classifier with shuffled K-fold cross-validation.

    A fresh ``model_class(**kwargs)`` instance is fitted on the training part
    of each fold and evaluated on the held-out part. ``X`` and ``y`` are used
    exactly as passed in; no preprocessing is refitted per fold.

    Parameters
    ----------
    model_class : callable returning an object with ``fit(X, y)`` and
        ``predict(X)`` methods.
    X, y : arrays indexable by an integer index array.
    folds : number of splits (shuffled with a fixed seed of 42).
    **kwargs : forwarded unchanged to ``model_class``.

    Returns
    -------
    dict mapping "Accuracy", "Precision", "Recall" and "F1-score" to the mean
    of that metric over all folds. Precision, recall and F1 are
    macro-averaged, with undefined values counted as 0.
    """
    metric_names = ("Accuracy", "Precision", "Recall", "F1-score")
    splitter = KFold(n_splits=folds, shuffle=True, random_state=42)
    per_fold = []

    for fit_rows, eval_rows in splitter.split(X):
        classifier = model_class(**kwargs)
        classifier.fit(X[fit_rows], y[fit_rows])

        actual = y[eval_rows]
        predicted = classifier.predict(X[eval_rows])

        precision, recall, f1, _ = precision_recall_fscore_support(
            actual, predicted, average='macro', zero_division=0
        )
        per_fold.append(
            (accuracy_score(actual, predicted), precision, recall, f1)
        )

    # Transpose the fold-by-metric rows into one score sequence per metric.
    return {
        name: np.mean(scores)
        for name, scores in zip(metric_names, zip(*per_fold))
    }

# ===============================
# Run models
# ===============================
# Each entry is (display name, estimator class, constructor arguments).
# The softmax variants are evaluated first, followed by the KNN variants,
# all through the same cross-validation routine.
results = {
    label: cross_val_evaluate(model_cls, X, y, **params)
    for label, model_cls, params in (
        ("Softmax Logistic (No Reg)", SoftmaxRegressionScratch, {"reg_type": None}),
        ("Softmax Lasso (L1)", SoftmaxRegressionScratch, {"reg_type": "l1", "lam": 0.01}),
        ("Softmax Ridge (L2)", SoftmaxRegressionScratch, {"reg_type": "l2", "lam": 0.01}),
        (
            "Softmax Elastic Net",
            SoftmaxRegressionScratch,
            {"reg_type": "elastic", "lam": 0.01, "l1_ratio": 0.5},
        ),
        ("KNN (K=1)", KNNClassifierScratch, {"k": 1}),
        ("KNN (K=3)", KNNClassifierScratch, {"k": 3}),
        ("KNN (K=5)", KNNClassifierScratch, {"k": 5}),
    )
}

# Tabulate with one row per model and one column per metric, then print.
results_df = pd.DataFrame(results).T
print("\n===== Model Performance Comparison (5-Fold CV) =====\n")
print(results_df)



===== Model Performance Comparison (5-Fold CV) =====

                           Accuracy  Precision    Recall  F1-score
Softmax Logistic (No Reg)     0.935   0.900016  0.944183  0.912367
Softmax Lasso (L1)            0.950   0.951770  0.954573  0.947747
Softmax Ridge (L2)            0.930   0.902706  0.925072  0.899137
Softmax Elastic Net           0.945   0.941361  0.942901  0.937964
KNN (K=1)                     0.895   0.871286  0.928733  0.880698
KNN (K=3)                     0.830   0.806333  0.831865  0.805048
KNN (K=5)                     0.780   0.731445  0.784725  0.742573
