# Recreating the Scikit-learn LogisticRegression method

In this notebook I'm going to re-create the logistic regression without the overhead of checks that scikit-learn method performs.

In [67]:
import time

import numpy as np
from sklearn.utils.extmath import softmax
from sklearn.linear_model import LogisticRegression
from itea.classification  import ITEA_classifier

from sklearn.datasets import make_blobs

tfuncs    = {'id' : lambda x: x}
tfuncs_dx = {'id' : lambda x: np.ones_like(x),}

# Classification execution
X_clf, y_clf = make_blobs(
    n_samples    = 100,
    n_features   = 2,
    cluster_std  = 1,
    centers      = [(-10,-10), (10, 10)],
    random_state = 0,
)

In [74]:
# First, the original method

exectime_ = time.time()
clf_original = LogisticRegression(solver='saga', penalty='none').fit(X_clf, y_clf)
exectime_ = time.time() - exectime_

print(exectime_)
print(clf_original.coef_)
print(clf_original.intercept_)
print(clf_original.classes_)
print(clf_original.n_iter_)
print(clf_original.predict_proba(X_clf[:2, :]))
print(clf_original.predict(X_clf[:10, :]))

0.0022547245025634766
[[0.48337368 0.51733756]]
[0.00588424]
[0 1]
[100]
[[9.99940572e-01 5.94275638e-05]
 [1.93190907e-04 9.99806809e-01]]
[0 1 0 1 1 0 0 1 0 0]




In [73]:
from sklearn.linear_model._sag import sag_solver
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from scipy.special import expit

def my_implementation(X, y):
    classes_  = np.unique(y)
    
    Z = X # Nosso Z seria o eval
    
    n_classes  = len(classes_)
    n_terms = Z.shape[1]
    
    max_squared_sum = row_norms(Z, squared=True).max()

    if n_classes > 2:
        multi_class = 'multinomial'
            
        # from scikit: "SAG multinomial solver needs LabelEncoder, not LabelBinarizer"
        target = LabelEncoder().fit_transform(y).astype(Z.dtype, copy=False)

        w0 = np.zeros(
            (classes_.size, n_terms + 1), order='F', dtype=Z.dtype)
        
        warm_start = {'coef': w0.T}
    else:
        multi_class = 'log'
        
        n_classes -= 1
        
        target = np.ones(y.shape, dtype=Z.dtype)
        target[~(y == classes_[1])] = -1.
        
        w0 = np.zeros(n_terms + 1, dtype=Z.dtype)
        
        warm_start = {'coef': np.expand_dims(w0, axis=1)}
    
    coef_, n_iter_, _ = sag_solver(
        Z, target, sample_weight=None, loss=multi_class, alpha=0., beta=0.,
        max_iter=100, tol=0.001, verbose=0, random_state=None,
        check_input=False, max_squared_sum=max_squared_sum,
        warm_start_mem=warm_start, is_saga=True)
    
    if n_classes <= 2:
        coef_ = coef_.reshape(n_classes, n_terms + 1)
    
    intercept_ = coef_[:, -1]
    coef_      = coef_[:, :-1]

    return coef_, intercept_, classes_, n_iter_



def predict_proba(coef_, intercept_, classes_, X):
    
    prob = (
        safe_sparse_dot(
        X, # AQUI SERIA O EVAL Z
        np.array(coef_).T
    ) + np.array(intercept_))
    
    # Normalizar no caso multiclasse
    if len(classes_) <= 2:
        prob = np.hstack( (np.ones(X.shape[0]).reshape(-1, 1), prob) )  
        prob[:, 0] -= prob[:, 1]
    
    return softmax(prob)
        
    
def predict(coef_, intercept_, classes_, X):
    probabilities = predict_proba(coef_, intercept_, classes_, X)

    return np.array(classes_)[np.argmax(probabilities, axis=1)]


exectime_ = time.time()
coef_, intercept_, classes_, n_iter_ = my_implementation(X_clf, y_clf)
exectime_ = time.time() - exectime_

print(exectime_)
print(coef_)
print(intercept_)
print(classes_)
print(n_iter_)
print(predict_proba(coef_, intercept_, classes_, X_clf[:2, :]))
print(predict(coef_, intercept_, classes_, X_clf[:10, :]))

0.0012922286987304688
[[0.46814302 0.50734168]]
[-0.02233791]
[0 1]
97
[[9.99999998e-01 2.00909890e-09]
 [1.66818863e-07 9.99999833e-01]]
[0 1 0 1 1 0 0 1 0 0]


In [81]:
print(np.array_equal(
    predict(coef_, intercept_, classes_, X_clf),
    clf_original.predict(X_clf)
))

True
