# Lab Assignment Three: Extending Logistic Regression

### Authors
- Juliana Antonio
- Xiaona Hang
- Chuanqi Deng


### 1. Preparation and Overview

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
from numpy.linalg import pinv
from scipy.special import expit

# Fixed seed so the hold-out split and the CV folds are identical after every
# Restart Kernel -> Run All.
SEED = 42

raw_data = pd.read_csv("data/bodyPerformance.csv")

# One-hot encode gender (int8 indicator columns gender_F / gender_M).
data = pd.get_dummies(raw_data, columns=['gender'], dtype=np.int8)
# Encode the class letters as integers: 'A' -> 0, 'B' -> 1, ...
labels = data['class'].map(lambda c: ord(c) - ord('A'))
# Remove the target column so only features remain (no in-place mutation, so
# re-running the cell is idempotent).
data = data.drop(columns=['class'])

# scale
# NOTE(review): the scaler is fitted on ALL rows before the split, so test-set
# statistics influence the training features (mild data leakage). Kept as-is
# because `features` is also the input to the K-fold split below; for a strict
# evaluation, fit the scaler on the training rows only.
scaler = StandardScaler()
features = scaler.fit_transform(data)

# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, train_size=0.8, random_state=SEED
)
print(features.shape, X_train.shape, X_test.shape,  y_train.shape, y_test.shape)

# 5-fold cross-validation; shuffle so folds do not depend on the row order of the CSV
cross_validator = KFold(n_splits=5, shuffle=True, random_state=SEED)
split_indices = list(cross_validator.split(features, labels))

# example code
# KFold yields positional indices, so index the `labels` Series with .iloc.
# for train_indices, test_indices in split_indices:
#     X_train, y_train = features[train_indices], labels.iloc[train_indices]
#     X_test, y_test = features[test_indices], labels.iloc[test_indices]
#     print(features.shape, X_train.shape, X_test.shape,  y_train.shape, y_test.shape)
#     # training...

(13393, 12) (10714, 12) (2679, 12) (10714,) (2679,)


In [7]:
# Summary statistics of the feature columns (the 'class' target has already been dropped).
# NOTE(review): diastolic, systolic, gripForce, sit-ups counts and broad jump_cm all have a
# minimum of 0 in the output below -- presumably invalid or missing measurements; confirm
# and consider cleaning them before scaling.
data.describe()

Unnamed: 0,age,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,gender_F,gender_M
count,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0
mean,36.775106,168.559807,67.447316,23.240165,78.796842,130.234817,36.963877,15.209268,39.771224,190.129627,0.367804,0.632196
std,13.625639,8.426583,11.949666,7.256844,10.742033,14.713954,10.624864,8.456677,14.276698,39.868,0.482226,0.482226
min,21.0,125.0,26.3,3.0,0.0,0.0,0.0,-25.0,0.0,0.0,0.0,0.0
25%,25.0,162.4,58.2,18.0,71.0,120.0,27.5,10.9,30.0,162.0,0.0,0.0
50%,32.0,169.2,67.4,22.8,79.0,130.0,37.9,16.2,41.0,193.0,0.0,1.0
75%,48.0,174.8,75.3,28.0,86.0,141.0,45.2,20.7,50.0,221.0,1.0,1.0
max,64.0,193.8,138.1,78.4,156.2,201.0,70.5,213.0,80.0,303.0,1.0,1.0


#### Performance Comparison with Scikit-learn

In [16]:
%%time
lr_sk = SKLogisticRegression(solver='liblinear') # all params default
lr_sk.fit(X_train,y_train)
print(np.hstack((lr_sk.intercept_[:,np.newaxis],lr_sk.coef_)))
yhat = lr_sk.predict(X_test)

accuracy = accuracy_score(y_test, yhat)
#precision = precision_score(y_test, yhat, average='weighted')  
#recall = recall_score(y_test, yhat, average='weighted')  
#f1 = f1_score(y_test, yhat, average='weighted')  

print("Accuracy:", accuracy)
#print("Precision:", precision)
#print("Recall:", recall)
#print("F1 Score:", f1)

[[-2.49721585  1.33776226 -0.00583978 -1.02411138 -0.01283874 -0.08373051
   0.02922664  1.14013133  1.69501387  1.87791138  0.8462546   0.74077002
  -0.74077002]
 [-1.17333447  0.2739143   0.00617043 -0.19889842 -0.02196011 -0.06665385
   0.04720764  0.09380136  0.35752193  0.31936539  0.05966311 -0.01137007
   0.01137007]
 [-1.1399805  -0.14982434  0.4079647  -0.39545088 -0.06036942 -0.00569788
   0.02524086 -0.14677866  0.02202528 -0.23205801 -0.2886115  -0.22597135
   0.22597135]
 [-2.15686765 -1.21535641 -0.34863319  1.31637382  0.5095197   0.12485943
  -0.06773889 -0.74175846 -1.67582589 -1.93659814 -0.22788012 -0.42806853
   0.42806853]]
Accuracy: 0.5890257558790594
CPU times: user 73.8 ms, sys: 3.32 ms, total: 77.1 ms
Wall time: 83.6 ms


### 3. Deployment

Which implementation of logistic regression would you advise be used in a deployed machine learning model, your implementation or scikit-learn (or other third party implementation)? Why?


### 4. Exceptional Work

Implement an optimization technique for logistic regression using mean square error as your objective function (instead of maximum likelihood). Derive the gradient updates for the Hessian and use Newton's method to update the values of "w". Then answer, which process do you prefer: maximum likelihood OR minimum mean-squared error?

#### Mean Squared Error (MSE) Approach 

In logistic regression, the Mean Squared Error (MSE) approach minimizes the squared difference between predicted probabilities and actual labels.

$$
\text{MSE} = \frac{1}{N} \sum_{i=1}^{N} (y_i - \hat{y}_i)^2
$$

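Because the prediction is $\hat{y}_i = \sigma(\mathbf{w}^T \mathbf{x}_i)$ and $\sigma'(z) = \sigma(z)\,(1 - \sigma(z))$, the chain rule gives the gradient of the MSE loss with respect to the weights:

$$
\nabla l(\mathbf{w}) = \frac{2}{N} \mathbf{X}^T \left[ (\hat{\mathbf{y}} - \mathbf{y}) \odot \hat{\mathbf{y}} \odot (1 - \hat{\mathbf{y}}) \right]
$$

where:
- $\mathbf{X}$ is the design matrix (one row per sample, including the intercept column)
- $\mathbf{y}$ is the vector of true labels (0 or 1)
- $\hat{\mathbf{y}}$ is the vector of predicted probabilities
- $\odot$ denotes the element-wise product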

We compute the gradient of the MSE loss function with respect to the model parameters and update the parameters iteratively until convergence.

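Differentiating the gradient once more (again using $\hat{y}_i = \sigma(\mathbf{w}^T \mathbf{x}_i)$ and $\sigma'(z) = \sigma(z)(1 - \sigma(z))$), the Hessian matrix is:

$$
\mathbf{H} = \frac{2}{N} \mathbf{X}^T \mathbf{D} \mathbf{X}, \qquad
D_{ii} = \hat{y}_i (1 - \hat{y}_i) \left[ \hat{y}_i (1 - \hat{y}_i) + (\hat{y}_i - y_i)(1 - 2\hat{y}_i) \right]
$$

where $\mathbf{D}$ is diagonal. Unlike the maximum-likelihood Hessian, $D_{ii}$ can be negative (for example when $y_i = 1$ and $\hat{y}_i < 1/3$), so $\mathbf{H}$ is not guaranteed to be positive definite: the MSE objective is non-convex in $\mathbf{w}$.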

Using Newton's method with a step size $\eta$ (`eta` in the code below; $\eta = 1$ is the pure Newton step), we iteratively update the weights as:

$$
\mathbf{w}_{\text{new}} = \mathbf{w}_{\text{old}} - \eta \, \mathbf{H}^{-1} \nabla l(\mathbf{w}_{\text{old}})
$$


In [None]:
import numpy as np

class LogisticRegressionMSE:
    """One-vs-rest logistic regression fitted by minimising mean squared error.

    For every class ``c`` a binary problem "is the label equal to c?" is solved
    with damped Newton updates on the loss

        l(w) = 1/N * sum_i (y_i - sigmoid(w . x_i))**2

    Parameters
    ----------
    eta : float
        Step size multiplying each Newton step (1.0 is the pure Newton step).
    iterations : int, default 20
        Number of Newton updates per binary problem.
    damping : float, default 1e-3
        Non-negative value added to the Hessian diagonal before inversion.
        Keeps the step bounded when the sigmoid saturates and the curvature
        collapses towards zero.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``fit``.
    w_ : ndarray of shape (n_classes, n_features + 1)
        One weight row per class; column 0 is the intercept.
    """

    def __init__(self, eta, iterations=20, damping=1e-3):
        self.eta = eta
        self.iters = iterations
        self.damping = damping

    @staticmethod
    def _add_intercept(X):
        """Return X as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.c_[np.ones((X.shape[0], 1)), X]

    def _fit_binary(self, Xb, y_binary):
        """Fit one 0/1 problem; Xb already contains the intercept column."""
        num_samples, num_params = Xb.shape
        w = np.zeros(num_params)
        ridge = self.damping * np.eye(num_params)

        for _ in range(self.iters):
            g = self._sigmoid(Xb @ w)
            residual = g - y_binary
            slope = g * (1.0 - g)  # derivative of the sigmoid at each sample

            # Chain rule: d/dw (g - y)^2 = 2 (g - y) g (1 - g) x
            gradient = (2.0 / num_samples) * (Xb.T @ (residual * slope))

            # Exact per-sample second derivative of the squared error. It can be
            # negative because the objective is non-convex; clipping at zero keeps
            # the Hessian positive semi-definite so the update is a descent step.
            curvature = slope * (slope + residual * (1.0 - 2.0 * g))
            curvature = np.maximum(curvature, 0.0)
            hessian = (2.0 / num_samples) * ((Xb.T * curvature) @ Xb)

            w -= self.eta * (np.linalg.pinv(hessian + ridge) @ gradient)
        return w

    def fit(self, X, y):
        """Learn one weight vector per class.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)
            Class labels; a pandas Series is accepted and its index is ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X and y do not have the same number of samples.
        """
        Xb = self._add_intercept(X)
        # Plain 1-D array: avoids pandas index alignment and shape surprises.
        y = np.asarray(y).ravel()
        if Xb.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{Xb.shape[0]} != {y.shape[0]}"
            )

        self.classes_ = np.unique(y)
        self.w_ = np.vstack(
            [self._fit_binary(Xb, (y == c).astype(float)) for c in self.classes_]
        )
        return self

    def predict_proba(self, X):
        """Return per-class sigmoid scores, shape (n_samples, n_classes).

        The columns follow ``classes_``. Each column comes from an independent
        one-vs-rest model, so rows are not normalised to sum to one.
        """
        return self._sigmoid(self._add_intercept(X) @ self.w_.T)

    def predict(self, X):
        """Return the label with the highest score for each row of X."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

    def _sigmoid(self, z):
        """Logistic function; the input is clipped so np.exp cannot overflow."""
        z_clipped = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z_clipped))


In [None]:
from sklearn.metrics import accuracy_score

# Build the MSE-based model: eta scales every weight update and iterations is the
# number of update steps performed by fit().
lr_mse = LogisticRegressionMSE(eta=0.1, iterations=500)

# Fit the model to the training split
lr_mse.fit(X_train, y_train)

# Predict class labels for the held-out test split
y_pred = lr_mse.predict(X_test)

# Fraction of test samples whose predicted label matches the true label.
# NOTE(review): y_train / y_test hold the integer-encoded 'class' column (more than
# two classes), so this score is only meaningful if the model handles multi-class
# targets. Also note that `accuracy` overwrites the scikit-learn result computed earlier.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


References: