In this notebook, we implement two algorithms for Logistic Regression inspired by Bregman distances presented in 'Logistic Regression, AdaBoost and Bregman Distances' (Schapire et al 2002) and 'Bregman Distance to L1 Regularized Logistic Regression' (Huang and Gupta, 2010). We compare them to two more well known algorithms - Logistic Regression (Newton's Method, No Regularization) and Lasso Regression. 

We run these 4 algorithms on a variety of datasets and note any patterns on how the nature of the datasets affect the accuracy and runtime of the algorithms. 

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt


# Dictionary of datasets, keys are names (string) of datasets, values are 4 tuples: 
# (X_train, X_test, y_train, y_test)
datasets = dict()


In [2]:
import sklearn

sklearn.__version__

'0.20.0'

# Import MNIST data

In [4]:
from sklearn.datasets import fetch_mldata
#mnist = fetch_mldata('MNIST original')
# mldata.org is down

In [None]:
# Should be 70,000 images (28 by 28 for dimensionality of 784)  
print("Image Data Shape" , mnist.data.shape)
print("Label Data Shape", mnist.target.shape)

In [None]:
# We will focus on binary classification of images with label 0 or 1
mnist_relevant_indices = np.where(mnist.target <= 1.0)[0]

In [None]:
data = mnist.data[mnist_relevant_indices]
target = mnist.target[mnist_relevant_indices]

In [None]:
from sklearn.model_selection import train_test_split
datasets['MNIST'] = train_test_split(data, target, test_size=1/7.0, random_state=0)

# Import Fashion MNIST data

In [5]:
import fashion_mnist_reader
# This requires the data from https://github.com/zalandoresearch/fashion-mnist has been downloaded into data/fashion
X_train, y_train = fashion_mnist_reader.load_mnist('data/fashion', kind='train')
X_test, y_test = fashion_mnist_reader.load_mnist('data/fashion', kind='t10k')

In [6]:
#Filter for labels 0 and 1
fmnist_train_data = X_train[np.where(y_train <= 1.0)[0]]
fmnist_train_label = y_train[np.where(y_train <= 1.0)[0]]
fmnist_test_data = X_test[np.where(y_test <= 1.0)[0]]
fmnist_test_label = y_test[np.where(y_test <= 1.0)[0]]

datasets['FMNIST'] = (fmnist_train_data, fmnist_test_data, fmnist_train_label, fmnist_test_label)

# Import Ionosphere data

In [None]:
# Wait for sklearn 0.20.1 release for this to be fixed
#from sklearn.datasets import fetch_openml
#iono = fetch_openml(data_id=59)

# Logistic Regression (No Regularization)

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
def LogitR(X_train, X_test, y_train, y_test):
    Logit_model = LogisticRegression(C = 1e6, solver = 'lbfgs', max_iter = 1000)
    Logit_model.fit(X_train, y_train)
    predictions = Logit_model.predict(X_test)
    accuracy = Logit_model.score(X_test, y_test)
    weights = np.concatenate([Logit_model.intercept_, Logit_model.coef_[0]])
    return weights, predictions, accuracy

In [None]:
mnist_test_accuracy = LogitR(*datasets['MNIST'])[2]

In [None]:
mnist_test_accuracy

In [9]:
fmnist_test_accuracy = LogitR(*datasets['FMNIST'])[2]

In [10]:
fmnist_test_accuracy

0.983

# Lasso Regression (L1 Regularization)

In [11]:
from sklearn.linear_model import LassoCV

In [12]:
def LassoR(X_train, X_test, y_train, y_test):
    Lasso_model = LassoCV(cv=7)
    Lasso_model.fit(X_train, y_train)
    predictions = Lasso_model.predict(X_test)
    accuracy = Lasso_model.score(X_test, y_test)
    weights = np.concatenate([[Lasso_model.intercept_], Lasso_model.coef_])
    return weights, predictions, accuracy

In [None]:
LassoR(*datasets['MNIST'])[2]

In [13]:
LassoR(*datasets['FMNIST'])[2]

0.892645636644815

# Bregman Logistic Regression by Schapire et al.

In [14]:
def BregmanLogit(X_train, X_test, y_train, y_test):
    from scipy.special import expit as h # Logistic Sigmoid
    
    # First preprocess the data to 
    # 1) Include a bias parameter 
    # 2) Scale all instances x_i so the max l1 norm is <=1
    # 3) Have targets as +1, -1 instead of 1,0.
    X_train = np.concatenate([np.ones((X_train.shape[0],1)), X_train], axis = 1)
    X_test = np.concatenate([np.ones((X_test.shape[0],1)), X_test], axis = 1)
    
    l1_max = max(np.linalg.norm(X_train, ord = np.inf), np.linalg.norm(X_test, ord = np.inf))
    X_train = X_train/l1_max
    X_test = X_test/l1_max
     
    y_train = 2*y_train.astype(int) - 1
    y_test = 2*y_test.astype(int) - 1
    
    n_train_samples, x_dim = X_train.shape
    
    
    # Train weight vector (Parallel Algorithm, Section 5)
    w = np.zeros(x_dim)
    q = 1/2 * np.ones(n_train_samples)
    M = X_train * y_train[:, np.newaxis] # Makes M[i] = y[i] * x[i] so M[i][j] = y[i] x[i][j]
    
    M_pos = np.multiply(M, M>0)
    M_neg = np.multiply(-M, M<0)

    iters = 100
    for t in range(1,iters+1):
        # Update q
        if t==1: 
            q = 1/2 * np.ones(n_train_samples)
        if t>1: 
            q = np.divide(q, np.multiply(1-q, np.exp(M @ d)) + q)
        
        # Update d
        W_pos = q @ M_pos
        W_neg = q @ M_neg
        
        if t==0:
            print("W_pos", W_pos.shape, np.count_nonzero(W_pos))
            print(W_pos)
            print("W_neg", W_neg.shape, np.count_nonzero(W_neg))
            print(W_neg)

        def delta(w_pos, w_neg):
            # delta is picked to minimize the summand in Equation 27
            if w_pos == 0 and w_neg == 0:
                return 0
            if w_pos == 0 and w_neg != 0:
                return -999
            if w_pos != 0 and w_neg == 0:
                return 999
            return 1/2 * np.log(w_pos/w_neg)

        delta_vec = np.vectorize(delta)
        d = delta_vec(W_pos, W_neg)
        w += d
    
    print("weights", w.shape, np.count_nonzero(w))
    print(w)
    # Make predictions on test and evaluate accuracy
    predictions = 2*np.around(h(X_test @ w))-1
    accuracy = np.mean(y_test.T==predictions)
    return w, predictions, accuracy

In [15]:
BregmanLogit(*datasets['FMNIST'])[2]

weights (785,) 785
[ 5.97361448e+00 -9.99000000e+04  2.02661798e+01 -1.02842763e+02
 -2.37490763e+01  6.11120166e+01  8.67911232e+01  9.14556659e+01
  5.56785382e+01  3.14742216e+01  2.59278115e+01  3.76629863e+01
  3.64495977e+01  4.17048909e+01  5.30195888e+01  5.27190916e+01
  4.55230075e+01  3.07088945e+01  3.05671520e+01  3.93271289e+01
  3.53253791e+01  4.34636562e+01  7.67411465e+01  9.48726511e+01
  4.81747090e+01 -4.81895456e+01 -1.37002236e+02 -7.67384722e+01
 -9.99000000e+04 -9.99000000e+04  1.17052062e+00 -9.20081501e+01
  1.48174157e+01  3.55043290e+01  2.39047185e+01 -3.18315520e+00
 -2.31573015e+01 -1.95390056e+01  1.29934026e+00  2.36599850e+01
  2.07509399e+01  1.77132015e+01  2.01320650e+01  2.01153551e+01
  1.61863909e+01  1.55211192e+01  1.95331908e+01  1.90423645e+01
 -1.36222001e+00 -2.09364003e+01 -1.38265481e+01  5.06250278e+00
  1.35011257e+01 -5.12505351e+01 -1.87130007e+02 -6.05456470e+01
 -9.99000000e+04 -1.22298268e+00  6.13792673e+01 -1.03202398e+02
  8.76

0.966

In [16]:
import cvxpy as cvx
def BregmanLogit_Reg(X_train, X_test, y_train, y_test, alpha):
    from scipy.special import expit as h # Logistic Sigmoid
    
    # First preprocess the data to 
    # 1) Include a bias parameter 
    # 2) Scale all instances x_i so the max l1 norm is =1
    # 3) Have targets as +1, -1 instead of 1,0.
    X_train = np.concatenate([np.ones((X_train.shape[0],1)), X_train], axis = 1)
    X_test = np.concatenate([np.ones((X_test.shape[0],1)), X_test], axis = 1)
    
    l1_max = max(np.linalg.norm(X_train, ord = np.inf), np.linalg.norm(X_test, ord = np.inf))
    print(l1_max)
    X_train = X_train/(l1_max)
    X_test = X_test/(l1_max)
    
     
    y_train = 2*y_train.astype(int) - 1
    y_test = 2*y_test.astype(int) - 1
    
    n_train_samples, x_dim = X_train.shape
    
    
    # Train weight vector (Parallel Algorithm, Section 5)
    w = np.zeros(x_dim)
    q = 1/2 * np.ones(n_train_samples)
    M = X_train * y_train[:, np.newaxis] # Makes M[i] = y[i] * x[i] so M[i][j] = y[i] x[i][j]
    
    M_pos = np.multiply(M, M>0)
    M_neg = np.multiply(-M, M<0)
    
    iters = 60
    for t in range(1,iters+1):
        # Update q
        if t==1: 
            q = 1/2 * np.ones(n_train_samples)
        if t>1: 
            q = np.divide(q, np.multiply(1-q, np.exp(M @ d)) + q)
        
        # Update d
        W_pos = q @ M_pos
        W_neg = q @ M_neg
        
        d = cvx.Variable(x_dim)
        # d is chosen to minimize Equation 27
        bregman_bound = W_pos * (cvx.exp(-d) - 1) + W_neg * (cvx.exp(d)-1)
        objective = cvx.Minimize(bregman_bound)
        constraint = [cvx.norm1(w+d) <= alpha]
        prob = cvx.Problem(objective, constraint)
        prob.solve()  # Returns the optimal value.
        d = d.value
        w += d
    
    print("weights", w.shape, np.count_nonzero(w))
    print(w)
    print("scaled weights", np.count_nonzero(w/l1_max))
    print(w/l1_max)
    # Make predictions on test and evaluate accuracy
    predictions = 2*np.around(h(X_test @ w))-1
    accuracy = np.mean(y_test.T==predictions)
    return w, predictions, accuracy

In [17]:
BregmanLogit_Reg(*datasets['FMNIST'], 1e5)[2]

146073.0
weights (785,) 785
[ 5.77607670e+00 -1.79625393e+02 -1.56135643e+01 -8.09405090e+01
 -2.50880593e+01  3.13149821e+01  4.56365035e+01  4.78056288e+01
  2.72425714e+01  1.47839386e+01  1.52941227e+01  2.57316522e+01
  2.42808892e+01  2.70486196e+01  3.38479357e+01  3.35888579e+01
  2.94426272e+01  2.04922575e+01  2.13195425e+01  2.70921910e+01
  2.05946106e+01  2.13882282e+01  3.95920047e+01  4.73973582e+01
  2.26090301e+01 -3.99451036e+01 -9.59360020e+01 -5.97428251e+01
 -2.29334889e+02 -2.38860538e+02 -4.01526400e+01 -8.03334473e+01
 -5.86645229e+00  1.19024988e+01  3.88321906e+00 -1.20454355e+01
 -2.23427769e+01 -1.74764535e+01 -3.39981884e-02  1.66441240e+01
  1.47434568e+01  1.28780610e+01  1.43418828e+01  1.42727183e+01
  1.19812739e+01  1.14749691e+01  1.42100812e+01  1.39700209e+01
 -2.40441478e+00 -1.93382417e+01 -1.74259168e+01 -7.09611658e+00
 -1.81331742e+00 -4.51920639e+01 -1.34984806e+02 -9.04612461e+01
 -2.53214084e+02 -5.30149311e+01 -3.00099272e+01 -9.09817286e+

0.9565