In [None]:
# ML Final Project

In [4]:
# Import all the required libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
import dill

In [5]:
def load_data(
    training_data_path="fashion-mnist_train.csv",
    testing_data_path="fashion-mnist_test.csv",
    n_train=10000,
    n_test=1000,
):
    """Load the leading rows of the train and test CSV files as NumPy arrays.

    Each file is read with its first line skipped (treated as a header row).
    Column 0 is used as the integer label and every remaining column as a
    feature.

    Parameters
    ----------
    training_data_path, testing_data_path : str
        Paths of the CSV files. The defaults are the file names this
        notebook originally hard-coded.
    n_train, n_test : int or None
        Number of leading data rows to load from each file (non-negative).
        ``None`` loads every row.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays.
    """

    def _read_split(path, n_rows):
        # nrows stops parsing once enough rows are read, instead of loading
        # the whole file and slicing afterwards.
        frame = pd.read_csv(path, header=None, skiprows=1, nrows=n_rows)
        features = frame.iloc[:, 1:].to_numpy()
        labels = frame.iloc[:, 0].astype(int).to_numpy()
        return features, labels

    X_train, y_train = _read_split(training_data_path, n_train)
    X_test, y_test = _read_split(testing_data_path, n_test)
    return X_train, y_train, X_test, y_test
    
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise.

    The argument is limited to the range [-500, 500] before exponentiation
    so that ``np.exp`` does not overflow.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1 + np.exp(-bounded)
    return 1 / denominator


In [11]:

class BinaryLogisticRegression:
    """Binary logistic regression fitted with full-batch gradient descent.

    Features are standardized with the per-column mean and standard
    deviation computed in ``fit``. The model is a single weight vector over
    the standardized features; there is no intercept term.

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    epochs : int
        Maximum number of gradient steps.

    Attributes
    ----------
    mean, std : ndarray or None
        Per-feature statistics used for standardization; ``None`` until
        ``fit`` has been called.
    w_ : ndarray
        Learned weight vector; only set by ``fit``.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.mean = None
        self.std = None

    def fit(self, X, y):
        """Fit the weights on ``X`` (n_samples, n_features) and 0/1 labels ``y``.

        Returns ``self`` so calls can be chained.
        """
        # Coerce up front so lists and integer arrays are handled the same
        # way as float arrays.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)

        self.mean = X.mean(axis=0)
        self.std = X.std(axis=0) + 1e-8  # epsilon avoids division by zero

        X_norm = (X - self.mean) / self.std
        n, d = X_norm.shape

        # Start from all-zero weights.
        w = np.zeros(d)

        for _ in range(self.epochs):
            p = sigmoid(X_norm @ w)
            grad = (X_norm.T @ (p - y)) / n
            w -= self.lr * grad

            # Stop early once the gradient has (numerically) vanished.
            if np.linalg.norm(grad) < 1e-6:
                break

        self.w_ = w
        return self

    def predict_probability(self, X):
        """Return the predicted probability of the positive class per row of ``X``."""
        X = np.asarray(X, dtype=np.float64)
        X_norm = (X - self.mean) / self.std
        return sigmoid(X_norm @ self.w_)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the predicted probability is >= ``threshold``."""
        return (self.predict_probability(X) >= threshold).astype(int)

class MulticlassLogisticRegression:
    """One-vs-rest multiclass classifier built from binary logistic models.

    ``fit`` trains one ``BinaryLogisticRegression`` per distinct label;
    ``predict`` returns, for each sample, the label whose binary model
    reports the highest probability.

    Parameters
    ----------
    lr : float
        Step size forwarded to every binary model.
    epochs : int
        Maximum number of gradient steps forwarded to every binary model.
    """

    def __init__(self, lr=0.01, epochs=1000):
        self.lr = lr
        self.epochs = epochs
        self.classifiers_ = []

    def fit(self, X, y):
        """Train one binary classifier per class found in ``y``; returns ``self``."""
        y = np.asarray(y)
        self.classes_ = np.unique(y)

        # Build a fresh list on every call. Appending to the list created in
        # __init__ would leave classifiers from a previous fit() in front of
        # the new ones, so predict() would use the wrong columns.
        classifiers = []
        for c in self.classes_:
            clf = BinaryLogisticRegression(self.lr, self.epochs)
            # Binary target: 1.0 for the current class, 0.0 for all others.
            y_binary = (y == c).astype(float)
            clf.fit(X, y_binary)
            classifiers.append(clf)
        self.classifiers_ = classifiers
        return self

    def predict(self, X):
        """Return the label with the highest one-vs-rest probability for each row."""
        probas = np.stack(
            [clf.predict_probability(X) for clf in self.classifiers_], axis=1
        )
        return self.classes_[np.argmax(probas, axis=1)]


In [None]:
def softmax(z: np.ndarray) -> np.ndarray:
    """
    Stable row-wise softmax.

    Parameters
    ----------
    z : array-like, 2-D
        Scores, one row per sample. Lists and integer arrays are accepted
        and converted to float64.

    Returns
    -------
    np.ndarray
        Array of the same shape as ``z`` whose rows sum to 1.
    """
    # No ``copy=False`` here: that keyword is rejected by NumPy < 2.0, and on
    # NumPy >= 2.0 it raises whenever the float64 conversion needs a copy.
    z = np.asarray(z, dtype=np.float64)
    # Subtracting the row maximum leaves the result unchanged but keeps
    # np.exp from overflowing.
    shifted = z - np.max(z, axis=1, keepdims=True)
    e = np.exp(shifted)
    s = e.sum(axis=1, keepdims=True)
    # Defensive guard against dividing by an exactly-zero row sum.
    s[s == 0.0] = 1.0
    return e / s


class MulticlassLogisticRegression:
    """
    Multinomial logistic regression trained with batch gradient descent.

    Features are standardized with statistics computed in ``fit`` and the
    weights are learned with an L2 penalty. The model has a weight matrix
    only; there is no intercept term.

    Note: this notebook also defines a one-vs-rest class with the same name
    in an earlier cell. Whichever definition is executed last is the one the
    name refers to.

    Parameters
    ----------
    lr : float
        Gradient-descent step size.
    epochs : int
        Maximum number of gradient steps.
    tol : float
        Training stops early once the gradient norm drops below this value.
    l2 : float
        L2 regularization coefficient.
    """

    def __init__(
        self, lr: float = 0.05, epochs: int = 1500, tol: float = 1e-6, l2: float = 1e-4
    ):
        self.lr = lr
        self.epochs = epochs
        self.tol = tol
        self.l2 = l2  # L2 regularization coefficient
        self.W_: np.ndarray | None = None  # (n_features, n_classes)
        self.classes_: np.ndarray | None = None
        self.mean: np.ndarray | None = None  # (n_features,)
        self.std: np.ndarray | None = None  # (n_features,)

    def fit(self, X: np.ndarray, y: np.ndarray) -> "MulticlassLogisticRegression":
        """Fit the weight matrix on ``X`` (n_samples, n_features) and labels ``y``.

        Returns ``self`` so calls can be chained.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        # Classes and inverse indices (y_idx maps each sample to its class index)
        self.classes_, y_idx = np.unique(y, return_inverse=True)
        n_samples, n_features = X.shape
        n_classes = int(self.classes_.shape[0])

        # Standardize with clamped std to avoid exploding features
        self.mean = X.mean(axis=0)
        std = X.std(axis=0)
        std_floor = 1e-2
        self.std = np.where(std < std_floor, std_floor, std)
        Xn = (X - self.mean) / self.std

        # Initialize weights
        self.W_ = np.zeros((n_features, n_classes), dtype=np.float64)

        # One-hot labels
        Y = np.eye(n_classes, dtype=np.float64)[y_idx]  # (n_samples, n_classes)

        # Batch gradient descent with L2
        for _ in range(self.epochs):
            logits = Xn @ self.W_  # (n_samples, n_classes)
            probs = softmax(logits)  # (n_samples, n_classes)
            grad = (Xn.T @ (probs - Y)) / n_samples + self.l2 * self.W_

            # Update
            self.W_ -= self.lr * grad

            # Early stopping
            if np.linalg.norm(grad) < self.tol:
                break

        return self

    def _standardize(self, X: np.ndarray) -> np.ndarray:
        """Apply the mean/std learned in ``fit`` to ``X``."""
        X = np.asarray(X, dtype=np.float64)
        return (X - self.mean) / self.std

    def predict_proba(self, X: np.ndarray, temperature: float = 1.0) -> np.ndarray:
        """Return class probabilities, one row per sample.

        When ``temperature`` is positive and not 1.0 the logits are divided
        by it before the softmax; any other value leaves the logits as they
        are.
        """
        Xn = self._standardize(X)
        logits = Xn @ self.W_
        if temperature and temperature > 0 and temperature != 1.0:
            logits = logits / float(temperature)
        return softmax(logits)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the most probable class label for each row of ``X``."""
        probs = self.predict_proba(X)
        idx = np.argmax(probs, axis=1)
        return self.classes_[idx]


In [7]:
# Fetch the train/test arrays from the CSV files
X_train, y_train, X_test, y_test = load_data()

# Build the classifier, then fit it on the training split
model = MulticlassLogisticRegression(lr=0.01, epochs=1000)
model.fit(X_train, y_train)

# Label the held-out samples
y_pred = model.predict(X_test)

# Persist the fitted model to disk with dill
with open("fashion-mnist-model.pkl", "wb") as f:
    dill.dump(model, f)

# Accuracy = fraction of test samples whose predicted label matches the true one
accuracy = np.mean(y_test == y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.8380
