# Recreating the Scikit-learn LogisticRegression method

In this notebook I re-create scikit-learn's logistic regression fit without the overhead of the input checks that the scikit-learn method performs.

In [56]:
import time

import numpy as np

from sklearn.linear_model import LogisticRegression
from itea.classification  import ITEA_classifier

from sklearn.datasets import make_blobs

# Transformation functions (and their derivatives), keyed by name.
# Only the identity is registered here.
tfuncs = dict(id=lambda x: x)
tfuncs_dx = dict(id=lambda x: np.ones_like(x))

# Toy classification problem: three 2-D Gaussian blobs laid out along the
# diagonal, generated with a fixed seed so the data are reproducible.
X_clf, y_clf = make_blobs(
    n_samples=100,
    n_features=2,
    cluster_std=1,
    centers=[(-10, -10), (0, 0), (10, 10)],
    random_state=0,
)

In [63]:
# First, the original method

# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# short intervals; time.time() is a wall clock that may be too coarse (and
# can jump) for a fit that takes only a few milliseconds.
exectime_ = time.perf_counter()
clf_original = LogisticRegression(solver='saga', penalty='none').fit(X_clf, y_clf)
exectime_ = time.perf_counter() - exectime_

print(exectime_)
print(clf_original.coef_)
print(clf_original.intercept_)
print(clf_original.classes_)
print(clf_original.n_iter_)

0.004296302795410156
[[-0.55009618 -0.60745217]
 [-0.08147184  0.11412668]
 [ 0.63156802  0.4933255 ]]
[-1.26197799  2.81647098 -1.55449298]
[0 1 2]
[100]




In [64]:
from sklearn.linear_model._sag import sag_solver
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.utils.extmath import row_norms

def my_implementation(X, y, max_iter=100, tol=0.001, random_state=None):
    """Fit an unpenalized logistic regression with SAGA, skipping input checks.

    Prepares the targets and the zero-initialised coefficients, then calls
    ``sag_solver`` directly with ``check_input=False``, ``alpha=0`` and
    ``beta=0`` (no regularization) and ``is_saga=True``.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Training data. It is handed to the solver unvalidated; presumably it
        must already be a C-contiguous float array -- confirm against the
        solver's requirements.
    y : ndarray of shape (n_samples,)
        Class labels. With more than two distinct labels the multinomial
        loss is used; with exactly two, the binary log loss with targets
        encoded as -1 / +1 (+1 for the larger label).
    max_iter : int, default=100
        Maximum number of passes, forwarded to the solver.
    tol : float, default=0.001
        Stopping tolerance, forwarded to the solver. Note that scikit-learn's
        ``LogisticRegression`` defaults to ``tol=1e-4``; pass that value to
        mirror its stopping rule.
    random_state : int, RandomState instance or None, default=None
        Forwarded to the solver; set it to make the fit reproducible.

    Returns
    -------
    coef_ : ndarray
        Feature weights, one row per class (a single row in the binary case).
    intercept_ : ndarray
        Intercept for each row of ``coef_``.
    classes_ : ndarray
        Sorted unique labels found in ``y``.
    n_iter_ : int
        Iteration count reported by the solver.

    Raises
    ------
    ValueError
        If ``y`` contains fewer than two distinct classes.
    """
    # A single np.unique pass yields both the sorted labels and each sample's
    # class index, replacing the separate np.unique + LabelEncoder passes.
    classes_, encoded = np.unique(y, return_inverse=True)
    n_classes = classes_.size

    if n_classes < 2:
        raise ValueError(
            "my_implementation needs samples of at least 2 classes in the "
            "data, but the data contains %d class(es)" % n_classes)

    n_features = X.shape[1]
    max_squared_sum = row_norms(X, squared=True).max()

    if n_classes > 2:
        loss = 'multinomial'
        n_outputs = n_classes
        # The multinomial solver expects label-encoded (0..n_classes-1)
        # targets, not a one-hot/binarized matrix.
        target = encoded.ravel().astype(X.dtype, copy=False)
    else:
        loss = 'log'
        n_outputs = 1
        # Binary log loss works on -1 / +1 targets.
        target = np.where(y == classes_[1], 1., -1.).astype(X.dtype, copy=False)

    # Zero start; the extra last row holds the intercept(s).
    warm_start = {
        'coef': np.zeros((n_features + 1, n_outputs), dtype=X.dtype)
    }

    coef_, n_iter_, _ = sag_solver(
        X, target, sample_weight=None, loss=loss, alpha=0., beta=0.,
        max_iter=max_iter, tol=tol, verbose=0, random_state=random_state,
        check_input=False, max_squared_sum=max_squared_sum,
        warm_start_mem=warm_start,
        is_saga=True)

    if n_outputs == 1:
        # Binary fit: arrange the solver's result as a single row so both
        # branches can be sliced the same way below.
        coef_ = coef_.reshape(1, n_features + 1)

    # Last column is the intercept, the rest are the feature weights.
    intercept_ = coef_[:, -1]
    coef_      = coef_[:, :-1]

    return coef_, intercept_, classes_, n_iter_

# Time the re-implementation with perf_counter(): it is monotonic and
# high-resolution, whereas time.time() may be too coarse for a call that
# finishes in a few milliseconds.
exectime_ = time.perf_counter()
coef_, intercept_, classes_, n_iter_ = my_implementation(X_clf, y_clf)
exectime_ = time.perf_counter() - exectime_

print(exectime_)
print(coef_)
print(intercept_)
print(classes_)
print(n_iter_)

0.0028505325317382812
[[-0.55362482 -0.60421306]
 [-0.07915516  0.11205122]
 [ 0.63277998  0.49216185]]
[-1.26707532  2.81774843 -1.55067311]
[0 1 2]
100
