# In [54]:
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing as mp
import time
from scipy.special import expit
import pandas as pd

# In [55]:
def log_likelihood(X, w, y):
    """Return the mean Bernoulli log-likelihood of a logistic (sigmoid) model.

    Computes ``1/N * sum_i [y_i * log(p_i) + (1 - y_i) * log(1 - p_i)]``
    where ``p_i = sigmoid(x_i . w)``.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    w : array-like
        Weight vector; 1-D, column-shaped, or ``np.matrix`` are all accepted.
    y : array-like
        Binary labels (0/1), one per sample, in any of the layouts above.

    Returns
    -------
    float
        The average per-sample log-likelihood (always <= 0).

    Raises
    ------
    ValueError
        If the number of labels differs from the number of predictions.
    """
    # Flatten both sides so the result does not depend on whether the caller
    # passes 1-D arrays, column vectors or np.matrix objects (where ``*``
    # means matrix product and would otherwise change the meaning).
    scores = np.asarray(np.dot(X, w), dtype=float).ravel()
    labels = np.asarray(y, dtype=float).ravel()
    if scores.shape != labels.shape:
        raise ValueError(
            "y has %d entries but X @ w produced %d predictions"
            % (labels.size, scores.size)
        )

    # y*log(sigmoid(s)) + (1-y)*log(1-sigmoid(s)) == y*s - log(1 + exp(s)).
    # Using logaddexp avoids log(0) = -inf when the sigmoid saturates.
    per_sample = labels * scores - np.logaddexp(0.0, scores)

    # Normalise by N exactly once.
    return np.mean(per_sample)

def objective(X, w, y):
    """Return the two terms of the L2-regularised logistic objective.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    w : array-like
        Weight vector.
    y : array-like
        Binary labels (0/1), one per sample.

    Returns
    -------
    tuple
        ``(log_loss, l2_norm)`` where ``log_loss`` is the negated value of
        ``log_likelihood(X, w, y)`` and ``l2_norm`` is the squared Euclidean
        norm of ``w``.
    """
    log_loss = -log_likelihood(X, w, y)

    # Squared norm, not the plain norm: gradient() reports 2 * w for this
    # term, which is the derivative of ||w||^2. Returning ||w|| here would
    # make the reported objective disagree with the gradient being followed.
    l2_norm = np.linalg.norm(w) ** 2

    return log_loss, l2_norm

def gradient(X, w, y):
    """Return the gradients of the logistic log loss and of the L2 penalty.

    Parameters
    ----------
    X : array-like
        Feature matrix with one row per sample.
    w : array-like
        Weights; either 1-D of length d or 2-D (e.g. a (d, 1) column).
    y : array-like
        Binary labels (0/1). Any layout is accepted as long as it holds
        exactly as many entries as ``X @ w``.

    Returns
    -------
    tuple
        ``(grad, l2_grad)``. ``grad`` is ``X.T @ (sigmoid(X @ w) - y) / N``;
        it is an ``np.matrix`` when ``w`` is 2-D (the type this function has
        always returned for that case) and a 1-D array when ``w`` is 1-D.
        ``l2_grad`` is ``2 * w``, the derivative of the squared L2 norm.

    Raises
    ------
    ValueError
        If ``y`` cannot be reshaped to the shape of the predictions.
    """
    features = np.asarray(X)

    # Sigmoid predictions, kept as a plain ndarray so that ``-`` below is
    # always element-wise.
    scores = np.asarray(np.dot(features, w), dtype=float)
    prob = expit(scores)

    # Force the labels into the prediction layout. A bare ``prob - y`` with
    # one operand shaped (N, 1) and the other (N,) broadcasts to (N, N) and
    # yields a wrong gradient without raising.
    labels = np.asarray(y, dtype=float).reshape(prob.shape)
    residual = prob - labels

    # Gradient of the log loss, averaged over the N samples.
    grad = np.dot(features.T, residual) / features.shape[0]
    if grad.ndim == 2:
        # Preserve the historical return type for column-shaped weights.
        grad = np.asmatrix(grad)

    # Gradient of the squared L2 norm.
    l2_grad = 2 * w

    return grad, l2_grad