In [6]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [7]:
class LogisticRegressionMLE_Improved:
    def __init__(self, regularization=1.0, max_iter=1000):
        self.C = regularization  # Inverse of regularization strength
        self.max_iter = max_iter
        self.weights = None
        self.bias = None
        self.scaler = StandardScaler()
    
    def sigmoid(self, z):
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))
    
    def log_likelihood_with_regularization(self, params, X, y):

        # Negative log-likelihood with L2 regularization.
        # We want to minimize: -log_likelihood + (1/(2*C)) * ||weights||^2

        n_features = X.shape[1]
        weights = params[:n_features]
        bias = params[n_features]
        
        # Predictions
        linear_pred = np.dot(X, weights) + bias
        predictions = self.sigmoid(linear_pred)
        predictions = np.clip(predictions, 1e-10, 1 - 1e-10)
        
        # Negative log-likelihood
        log_lik = np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
        
        # L2 regularization term (only on weights, not bias)
        regularization_term = (1 / (2 * self.C)) * np.sum(weights ** 2)
        
        return -log_lik + regularization_term
    
    def gradient(self, params, X, y):

        n_features = X.shape[1]
        weights = params[:n_features]
        bias = params[n_features]
        
        linear_pred = np.dot(X, weights) + bias
        predictions = self.sigmoid(linear_pred)
        
        # Gradient of negative log-likelihood
        error = predictions - y
        grad_weights = np.dot(X.T, error) + (1 / self.C) * weights
        grad_bias = np.sum(error)
        
        return np.concatenate([grad_weights, [grad_bias]])
    
    def fit(self, X, y, verbose=True):
        # Scale features
        X_scaled = self.scaler.fit_transform(X)
        
        n_features = X_scaled.shape[1]
        initial_params = np.zeros(n_features + 1)
        
        # Minimize with gradient information
        result = minimize(
            fun=self.log_likelihood_with_regularization,
            x0=initial_params,
            args=(X_scaled, y),
            method='L-BFGS-B',  # Better for this problem
            jac=self.gradient,  # Provide gradient
            options={'maxiter': self.max_iter}
        )
        
        self.weights = result.x[:n_features]
        self.bias = result.x[n_features]
        
        if verbose:
            print(f"Optimization successful: {result.success}")
            print(f"Final negative log-likelihood: {result.fun:.4f}")
            print(f"Number of iterations: {result.nit}")
        
 
    
    def predict_proba(self, X):
        X_scaled = self.scaler.transform(X)
        linear_pred = np.dot(X_scaled, self.weights) + self.bias
        return self.sigmoid(linear_pred)
    
    def predict(self, X):
        probabilities = self.predict_proba(X)
        return (probabilities > 0.5).astype(int)

# Example usage:
# model = LogisticRegressionMLE_Improved(regularization=1.0)
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")

In [1]:
# naive approach
# let's make a blueprint (a class) that can instantiate objects
class LR:
    def __init__(self):
        self.w = None
        self.b = None

    def sig(self,z):
        return 1/(1+np.exp(-z))
    def gradient(self,x,y,w,b):
        errors = y - self.sig(x@w+b)
        return -1/len(x)*(errors)@x, -1/len(x)*sum(errors)
    def fit(self,x,y,lr=0.01,maxiter=1000,intercept=True,tol=1e-5):
        # the main goal of this is to update the weights with gradient descent
        # first initialize the weights and the bias term
        self.w = np.random.normal(size=x.shape[1])
        if intercept:
            self.b = np.random.normal()
        else:
            self.b = 0
        for _ in range(maxiter):
            gw, gb = self.gradient(x,y,self.w,self.b)
            self.wnew = self.w - lr*gw
            self.bnew = self.b - lr*gb
            if np.linalg.norm(self.wnew-self.w)<tol:
                break
            self.w = self.wnew
            self.b = self.bnew
            
    def predict_proba(self,x):
        return self.sig(x@self.w+self.b)

    def predict_classes(self,x,thresh=0.5):
        return (self.sig(x@self.w+self.b)>thresh) + 0 
        
        

In [None]:
# setup 
w = np.random.uniform(size=x.shape[1])
b  = np.random.uniform()
lr = 0.0005
maxiter = 10000
tol = 1e-5
mse = []
u = 0
beta1 = 0.9
done = False
batch_size = 32
sw = 0
sb = 0
eps = 1e-6

In [31]:
# RMSPROP approach
class LR_rmsprop:
    def __init__(self):
        self.w = None
        self.b = None

    def loss(self,x,y):
        predictions = self.sig(x@self.w+self.b)
        log_lik = np.sum(y * np.log(predictions) + (1 - y) * np.log(1 - predictions))
        return -log_lik

    def sig(self,z):
        return 1/(1+np.exp(-z))
        
    def gradient(self,x,y,w,b):
        errors = y - self.sig(x@w+b)
        return -1/len(x)*(errors)@x, -1/len(x)*sum(errors)
        
    def fit(self,x,y,lr=0.01,maxiter=1000,intercept=True,tol=1e-5,sw=0,sb=0,eps=1e-6,batch_size=32,done=False,beta1=0.9,u=0):
        # the main goal of this is to update the weights with gradient descent
        # first initialize the weights and the bias term
        self.w = np.random.normal(size=x.shape[1])
        if intercept:
            self.b = np.random.normal()
        else:
            self.b = 0
        lss= []
        for i in range(maxiter):
            minibatches = np.array_split(np.random.permutation(range(len(x))),len(x)//batch_size)
            for batch in minibatches:
                gw, gb = self.gradient(x,y,self.w,self.b)
        
                # here we create adaptive learning rates
                sw = beta1*sw + (1-beta1)*sum(gw**2)
                sb = beta1*sb + (1-beta1)*gb**2
                
                self.wnew = self.w - lr/np.sqrt(sw+eps)*gw
                self.bnew = self.b - lr/np.sqrt(sb+eps)*gb
                u += 1
                lss.append(self.loss(x,y))
                if np.linalg.norm(self.wnew-self.w)<tol:
                    print('The Algorithm has Converged!')
                    done = True
                    break
                if (u+1)%100 ==0:
                    print(f'After {u+1} updates the Loss is: {self.loss(x,y)}')
                self.w = self.wnew
                self.b = self.bnew
            if done:
                break
            
    def predict_proba(self,x):
        return self.sig(x@self.w+self.b)

    def predict_classes(self,x,thresh=0.5):
        return (self.sig(x@self.w+self.b)>thresh) + 0 

In [32]:
data = pd.read_csv('https://github.com/dvasiliu/AML/blob/main/Data%20Sets/example_data_classification.csv?raw=true', header=None)

In [33]:
x = data.iloc[:,[0,1]].values
y = data.iloc[:,-1].values

In [34]:
model = LR_rmsprop()
model.fit(x,y)

After 100 updates the Loss is: 5664.759552898149
After 200 updates the Loss is: 439.42535223897215
After 300 updates the Loss is: 72.3688597835951
After 400 updates the Loss is: 70.41067712375975
After 500 updates the Loss is: 66.24164877728774
After 600 updates the Loss is: 62.48898565969772
After 700 updates the Loss is: 59.11642023793128
After 800 updates the Loss is: 56.08773607365078
After 900 updates the Loss is: 53.3679038582983
After 1000 updates the Loss is: 50.9239220204625
After 1100 updates the Loss is: 48.72532948881079
After 1200 updates the Loss is: 46.744450140634505
After 1300 updates the Loss is: 44.95643749170373
After 1400 updates the Loss is: 43.33918325602648
After 1500 updates the Loss is: 41.8731420036575
After 1600 updates the Loss is: 40.5411112457792
After 1700 updates the Loss is: 39.32799454173159
After 1800 updates the Loss is: 38.22056571760377
After 1900 updates the Loss is: 37.20724515917424
After 2000 updates the Loss is: 36.27789410794831
After 2100 u

In [38]:
sum(abs(y-model.predict_classes(x)))/len(y)

np.float64(0.09)