# Import Libraries

In [3]:
import numpy as np
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Create the Data

In [9]:
# Synthetic binary-classification dataset: 2000 samples, 50 features of which
# 10 are informative. The fixed random_state makes the data reproducible.
X, y = make_classification(
    n_samples=2000,
    n_features=50,
    n_informative=10,
    n_classes=2,
    random_state=812,
)

In [10]:
# Preview the generated feature matrix.
X

array([[ 1.23173717,  1.10532801,  0.22776564, ..., -0.78356596,
        -0.15711703, -1.65414508],
       [ 0.48439638, -0.14303632, -2.51423517, ...,  0.60566089,
        -0.13130922, -0.21336398],
       [ 1.43086386, -0.13222191, -0.97300061, ..., -1.24373168,
        -0.8441854 , -1.68326389],
       ...,
       [ 1.31133403,  0.03317187,  2.49820238, ..., -0.42860444,
         1.11198425, -0.37050314],
       [ 1.40062972, -0.98004073, -0.89407227, ...,  1.63212064,
         0.06716772,  1.21445157],
       [ 0.44062982, -0.17520526,  3.96299506, ...,  0.25240773,
        -0.095253  ,  0.83673848]])

In [11]:
# Preview the generated class labels.
y

array([0, 0, 0, ..., 0, 1, 0])

In [16]:
# Hold out 20% of the rows for evaluation (80% train); the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

In [15]:
from collections import Counter

# Tally how many samples carry each label to check that the classes are balanced.
class_counts = Counter(y)
class_counts

Counter({0: 1002, 1: 998})

In [17]:
# Confirm the sizes of the train and test partitions.
X_train.shape, X_test.shape

((1600, 50), (400, 50))

# Vanilla Model

In [20]:
# Baseline: logistic regression with every hyperparameter left at its default.
vanilla_lr = LogisticRegression()

In [22]:
# Fit the baseline on the training partition only.
vanilla_lr.fit(X_train, y_train)

In [24]:
# Predict class labels for the held-out test set, then display them.
vanilla_preds = vanilla_lr.predict(X_test)
vanilla_preds

array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,

In [25]:
# Ground-truth test labels, shown for comparison with the predictions.
y_test

array([1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,

In [29]:
# Per-class precision / recall / F1 of the baseline on the test set, to 3 decimals.
print(classification_report(y_test, vanilla_preds, digits=3))

              precision    recall  f1-score   support

           0      0.693     0.759     0.724       187
           1      0.769     0.704     0.735       213

    accuracy                          0.730       400
   macro avg      0.731     0.732     0.730       400
weighted avg      0.733     0.730     0.730       400



# Hyperparameter Tuning

In [30]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
# Search space for the hyperparameter search.
# C is the inverse of the regularization strength in scikit-learn: smaller
# values regularize more strongly, larger values regularize less.
param_dist = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=["l1", "l2"],
    solver=["saga", "liblinear"],
)

In [36]:
# Base estimator handed to the hyperparameter search.
# The solvers being searched ("saga", "liblinear") shuffle the data internally,
# so a fixed random_state is needed for repeatable fits across runs.
tune_model = LogisticRegression(random_state=42)

In [37]:
# Randomized hyperparameter search with 5-fold cross-validation.
# n_iter=5 draws only 5 candidates from the search space, so which candidates
# get evaluated depends on the random number generator. random_state pins that
# draw so the same candidates are tried on every Restart & Run All.
random_search = RandomizedSearchCV(
    tune_model,
    param_dist,
    n_iter=5,
    cv=5,
    random_state=42,
    verbose=1,
)

In [38]:
# Run the search on the training partition; the test set is not touched here.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [39]:
# Hyperparameter combination that scored best during the search.
random_search.best_params_

{'solver': 'liblinear', 'penalty': 'l2', 'C': 100}

In [40]:
# Keep a handle on the estimator the search selected as best.
best_model = random_search.best_estimator_

In [41]:
# Predict class labels for the held-out test set with the tuned model.
tuned_preds = best_model.predict(X_test)

In [42]:
# Same report as for the baseline, so the two models can be compared directly.
print(classification_report(y_test, tuned_preds, digits=3))

              precision    recall  f1-score   support

           0      0.693     0.759     0.724       187
           1      0.769     0.704     0.735       213

    accuracy                          0.730       400
   macro avg      0.731     0.732     0.730       400
weighted avg      0.733     0.730     0.730       400



In [43]:
# Predicted probabilities from the tuned model; [:, 1] keeps only the second
# column, which for these 0/1 labels should be the probability of class 1.
tuned_probas = best_model.predict_proba(X_test)[:, 1]
tuned_probas

array([0.94255873, 0.66127795, 0.82563494, 0.41068122, 0.76319623,
       0.74989137, 0.37945329, 0.94042885, 0.93333442, 0.28211207,
       0.90343307, 0.69502836, 0.63062203, 0.23427391, 0.9153906 ,
       0.49091428, 0.3226734 , 0.42868028, 0.96008728, 0.63810533,
       0.26530854, 0.61899121, 0.04212219, 0.74420292, 0.07551113,
       0.31692028, 0.04854838, 0.17764407, 0.96386504, 0.79771149,
       0.20356129, 0.69413429, 0.22225273, 0.43441157, 0.74109191,
       0.07736576, 0.98476758, 0.63244771, 0.32174149, 0.86767143,
       0.23561282, 0.22659143, 0.69953521, 0.33766431, 0.58265229,
       0.19306524, 0.32919346, 0.99372076, 0.99467507, 0.86575909,
       0.50950683, 0.34896322, 0.21298919, 0.53117472, 0.21017572,
       0.23935218, 0.03864078, 0.21868479, 0.70081659, 0.15532477,
       0.79478429, 0.09306382, 0.40189534, 0.72548037, 0.42412732,
       0.59782899, 0.34283201, 0.69196268, 0.34459172, 0.07701354,
       0.04582586, 0.45446369, 0.29804311, 0.54306574, 0.09236