# Lab Assignment Three: Extending Logistic Regression

### Authors
- Juliana Antonio
- Xiaona Hang
- Chuanqi Deng


### 1. Preparation and Overview

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
from numpy.linalg import pinv
from scipy.special import expit
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm

raw_data = pd.read_csv("data/bodyPerformance.csv")

# preprocessing
data = pd.get_dummies(raw_data, columns=['gender'],dtype=np.int8) # one-hot encoding for gender
labels = data['class'].map(lambda c: ord(c) - ord('A')) # encode lables into integer
data.drop(['class'], axis=1, inplace=True) # remove class column

# scale
scaler = StandardScaler()
features = scaler.fit_transform(data)

# PCA
pca = PCA(8)
pca_feature = pca.fit_transform(features)

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(pca_feature, labels, train_size=0.8)
print(pca_feature.shape, X_train.shape, X_test.shape,  y_train.shape, y_test.shape)

(13393, 8) (10714, 8) (2679, 8) (10714,) (2679,)


In [2]:
data.describe()

Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,gender_F,gender_M
count,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0
mean,36.775106,168.559807,67.447316,23.240165,78.796842,130.234817,36.963877,15.209268,39.771224,190.129627,0.367804,0.632196
std,13.625639,8.426583,11.949666,7.256844,10.742033,14.713954,10.624864,8.456677,14.276698,39.868,0.482226,0.482226
min,21.0,125.0,26.3,3.0,0.0,0.0,0.0,-25.0,0.0,0.0,0.0,0.0
25%,25.0,162.4,58.2,18.0,71.0,120.0,27.5,10.9,30.0,162.0,0.0,0.0
50%,32.0,169.2,67.4,22.8,79.0,130.0,37.9,16.2,41.0,193.0,0.0,1.0
75%,48.0,174.8,75.3,28.0,86.0,141.0,45.2,20.7,50.0,221.0,1.0,1.0
max,64.0,193.8,138.1,78.4,156.2,201.0,70.5,213.0,80.0,303.0,1.0,1.0


#### Performance Comparison with Scikit-learn

In [3]:
%%time
lr_sk = SKLogisticRegression(solver='liblinear') # all params default
lr_sk.fit(X_train,y_train)
print(np.hstack((lr_sk.intercept_[:,np.newaxis],lr_sk.coef_)))
yhat = lr_sk.predict(X_test)

accuracy = accuracy_score(y_test, yhat)
#precision = precision_score(y_test, yhat, average='weighted')  
#recall = recall_score(y_test, yhat, average='weighted')  
#f1 = f1_score(y_test, yhat, average='weighted')  

print("Accuracy:", accuracy)
#print("Precision:", precision)
#print("Recall:", recall)
#print("F1 Score:", f1)

[[-2.41409932 -0.07652043  1.11987994  1.87635321 -0.99278248  1.10132692
   0.61974    -0.04853377 -1.59134486]
 [-1.18537334 -0.04545084  0.15878133  0.32179436 -0.28878665  0.18766268
   0.13837957 -0.0767379  -0.1943455 ]
 [-1.12144996 -0.01379161 -0.01437194 -0.11733073 -0.10340796 -0.21080821
  -0.30513723  0.04083779  0.32180087]
 [-2.13932052  0.15446421 -1.22976883 -1.79134747  1.36491043 -0.62400042
  -0.27894579  0.12101879  1.58349591]]
Accuracy: 0.5819335572974991
CPU times: user 195 ms, sys: 94.6 ms, total: 290 ms
Wall time: 39.3 ms


### 3. Deployment

Which implementation of logistic regression would you advise be used in a deployed machine learning model, your implementation or scikit-learn (or other third party implementation)? Why?


### 4. Exceptional Work

Implement an optimization technique for logistic regression using mean square error as your objective function (instead of maximum likelihood). Derive the gradient updates for the Hessian and use Newton's method to update the values of "w". Then answer, which process do you prefer: maximum likelihood OR minimum mean-squared error?

#### Mean Squared Error (MSE) Approach 

In logistic regression, the Mean Squared Error (MSE) approach minimizes the squared difference between predicted probabilities and actual labels.

$$
\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2
$$

The gradient of the MSE loss with respect to the weights is computed as:

$$
\nabla l(\mathbf{w}) = \frac{1}{N} \mathbf{X}^T (\mathbf{y} - \hat{\mathbf{y}})
$$

where:
- $\mathbf{X}$ is the design matrix
- $\mathbf{y}$ is the true labels
- $\hat{\mathbf{y}}$ is the predicted labels

We compute the gradient of the MSE loss function with respect to the model parameters and update the parameters iteratively until convergence.

The Hessian matrix is computed as:

$$
\mathbf{H} = \frac{1}{N} \mathbf{X}^T \mathbf{X}
$$

Using Newton's method, we iteratively update the weights as:

$$
\mathbf{w}_{\text{new}} = \mathbf{w}_{\text{old}} - (\mathbf{H}^{-1} \nabla l(\mathbf{w}_{\text{old}}))
$$


In [4]:
class LogisticRegressionMSE:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        self.weights = None  # Initialize weights
        
    def fit(self, X, y):
        num_samples, num_features = X.shape
        self.w_ = np.zeros((num_features + 1, 1))
        X = np.c_[np.ones((num_samples, 1)), X]  # Add intercept
        
        for _ in range(self.iters):
            y_pred = self._sigmoid(X @ self.w_)
            y_pred = y_pred.flatten()
            gradient = X.T @ (y_pred - y) / num_samples
            hessian = X.T @ X / num_samples
            gradient = gradient.reshape(-1, 1) # Reshape gradient to ensure it's treated as a column vector
            self.w_ -= self.eta * np.linalg.inv(hessian) @ gradient # Update the model parameters using Newton's method
        
        self.weights = self.w_  # Store weights
    
    def predict_proba(self, X):
        X = np.c_[np.ones((X.shape[0], 1)), X]
        return self._sigmoid(X @ self.w_)
    
    def predict(self, X):
        probabilities = self.predict_proba(X)
        return (probabilities >= 0.5).astype(int)
    
    def _sigmoid(self, z):
        z_clipped = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clipped))

In [5]:
%%time
from sklearn.metrics import accuracy_score

lr_mse = LogisticRegressionMSE(eta=0.1, iterations=1000)

lr_mse.fit(X_train, y_train)
y_pred = lr_mse.predict(X_test)
print(np.vstack((lr_mse.weights[0], lr_mse.weights[1:])))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


[[ 65.34428434]
 [  2.73227359]
 [-24.86548843]
 [-38.7678778 ]
 [ 23.9149409 ]
 [-17.73570103]
 [-11.76043133]
 [  3.20034628]
 [ 37.35061185]]
Accuracy: 0.3523702874206794
CPU times: user 3.21 s, sys: 123 ms, total: 3.33 s
Wall time: 425 ms


In [6]:
class LogisticRegressionMSEOvR: # One vs. All multiclass with MSE
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        self.classifiers = {}  # Dictionary to store binary classifiers
        
    def fit(self, X, y):
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)
        
        for class_label in unique_classes:
            y_binary = (y == class_label).astype(int)  # Convert to binary labels
            classifier = LogisticRegressionMSE(self.eta, self.iters)
            classifier.fit(X, y_binary)
            self.classifiers[class_label] = classifier
    
    def predict_proba(self, X):
        num_samples = X.shape[0]
        probabilities = np.zeros((num_samples, len(self.classifiers)))
        
        for i, (class_label, classifier) in enumerate(self.classifiers.items()):
            probabilities[:, i] = classifier.predict_proba(X).flatten()
        
        return probabilities
    
    def predict(self, X):
        probabilities = self.predict_proba(X)
        return np.argmax(probabilities, axis=1)


In [7]:
%%time
lr_ovr = LogisticRegressionMSEOvR(eta=0.1, iterations=1000)
# Fit the model to your data
lr_ovr.fit(X_train, y_train)
# Make predictions
y_pred = lr_ovr.predict(X_test)
weights_stacked = np.hstack([lr_ovr.classifiers[class_label].weights for class_label in lr_ovr.classifiers])
print(weights_stacked)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

[[-2.19701082 -1.18614417 -1.12209991 -2.01123338]
 [-0.06948775 -0.0454751  -0.01379986  0.14468911]
 [ 1.01139011  0.15890944 -0.01438236 -1.13946324]
 [ 1.68106384  0.32210358 -0.11740654 -1.66290641]
 [-0.883847   -0.28910844 -0.10350916  1.26607836]
 [ 0.9722856   0.18791939 -0.21102579 -0.57993279]
 [ 0.5654239   0.13864962 -0.3056917  -0.25859165]
 [-0.04832219 -0.0768913   0.04091497  0.11430389]
 [-1.47864458 -0.19487226  0.32259268  1.47253174]]
Accuracy: 0.5759611795446062
CPU times: user 11.8 s, sys: 408 ms, total: 12.2 s
Wall time: 1.55 s


References: