In [1]:
import numpy as np
import scipy.sparse as sparse

import tqdm as tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, Normalizer

seed = 69420
np.random.seed(69420)

In [2]:
def read_sparse_dataset(dataset_name: str, *args, **kwargs) -> sparse.csr_matrix:
    with open(dataset_name, 'r') as f:
        num_lines = sum(1 for _ in f)

    sparse_row_list = []
    with open(dataset_name, 'r') as f:
        for row in tqdm.tqdm(f, total=num_lines):
            data = np.fromstring(row, *args, **kwargs)
            sparse_row = sparse.csr_matrix(data)
            sparse_row_list.append(sparse_row)

    data_matrix: sparse.csr_matrix
    data_matrix = sparse.vstack(sparse_row_list) # type: ignore
    return data_matrix

In [4]:
Xy_raw: sparse.csr_matrix
Xy_raw = read_sparse_dataset('training.csv', sep=',', dtype=np.int32)

100%|██████████| 12000/12000 [02:38<00:00, 75.91it/s] 


In [5]:
X_raw: sparse.csr_matrix; y_raw: np.ndarray
X_raw = Xy_raw[:, 1:-1] # type: ignore
y_raw = Xy_raw[:, -1].toarray() # type: ignore

In [5]:
scaler = Normalizer()
encoder = OneHotEncoder()

X_norm: sparse.csr_matrix; y_cat: np.ndarray
X_norm = scaler.fit_transform(X_raw)
y_cat = encoder.fit_transform(y_raw)

In [6]:
X_train: sparse.csr_matrix; X_test: sparse.csr_matrix; y_train: np.ndarray; y_test: np.ndarray
X_train, X_test, y_train, y_test = train_test_split(X_norm, y_cat, test_size=0.2, random_state=seed) # type: ignore
y_train, y_test = y_train.toarray(), y_test.toarray() # type: ignore

In [7]:
def calculate_loss(*, y, s):
    return np.sum(y * np.log(s)) / -y.shape[0]

In [8]:
def multilogistic(*, z):
    return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)

In [9]:
def calculate_gradient(*, X, y, s):
    return X.T @ (s - y) / y.shape[0]

In [10]:
def predict(*, X, W):
    w, b = W[1:], W[0]
    return multilogistic(z=X @ w + b)

In [11]:
def accuracy(*, y_true, y_pred):
    return np.sum(y_true == y_pred) / y_true.shape[0]

In [12]:
def logistic_regression(X, y, epochs, alpha, lambda_, threshold, W_init=None):
    if W_init is None:
        W = np.random.normal(0, 0.1, (X.shape[1], y.shape[1])) 
        W = np.vstack((np.zeros((1, y.shape[1])), W))
    else:
        W = W_init

    X = sparse.hstack((((X.shape[0], 1)), X))
    losses = []
    i = 0
    try:
        with tqdm.trange(epochs) as t:
            for i in t:
                s = multilogistic(z=X @ W)                
                gradient = calculate_gradient(X = X, y = y, s = s) + lambda_ * W
                
                loss = calculate_loss(y = y, s = s) + lambda_ * np.sum(W**2)
                losses.append(loss)

                accuracy_ = accuracy(y_true = np.argmax(y, axis=1), y_pred = np.argmax(s, axis=1))
                t.set_postfix(loss=f"{loss:.6f}", accuracy=f"{accuracy_:.6f}")

                if i > 0 and np.abs(losses[-1] - losses[-2]) < threshold:
                    print("Converged at epoch ", i)
                    break

                W = W - alpha * gradient
    except KeyboardInterrupt:
        print("Interrupted by user at epoch ", i)

    return W, losses

In [13]:
W = None

In [21]:
W, losses = logistic_regression(X_train, y_train, epochs = 10000, alpha = 1, lambda_ = 1e-4, threshold = 1e-6, W_init = W)

  9%|▉         | 940/10000 [02:06<20:18,  7.43it/s, accuracy=0.721146, loss=2.061988]

Interrupted by user at epoch  940





In [22]:
s = predict(X = X_train, W = W)
print("Train accuracy: ", accuracy(y_true = np.argmax(y_train, axis=1), y_pred = np.argmax(s, axis=1)))

Train accuracy:  0.72125


In [23]:
s = predict(X = X_test, W = W)
print("Test accuracy: ", accuracy(y_true = np.argmax(y_test, axis=1), y_pred = np.argmax(s, axis=1)))

Test accuracy:  0.6558333333333334


In [25]:
Xy_testfile_raw: sparse.csr_matrix
Xy_testfile_raw = read_sparse_dataset('testing.csv', sep=',', dtype=np.int32)

100%|██████████| 6774/6774 [00:10<00:00, 666.26it/s]


In [30]:
X_testfile_raw: sparse.csr_matrix; y_testfile_raw: np.ndarray
X_testfile_raw = Xy_testfile_raw[:, 1:-1] # type: ignore
y_testfile_raw = Xy_testfile_raw[:, -1].toarray() # type: ignore

In [32]:
X_testfile_norm: sparse.csr_matrix; y_testfile_cat: np.ndarray
X_testfile_norm = scaler.transform(X_testfile_raw)
y_testfile_cat = encoder.transform(y_testfile_raw)

ValueError: X has 61187 features, but Normalizer is expecting 61188 features as input.