### Logistic Regression implementation(Binary Classification)

Link - https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.datasets import make_classification

In [3]:
#creates datasets
X,y = make_classification(n_samples=1000,n_features=10,n_classes=2,random_state=15)

In [4]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.377957,1.043895,1.043494,-0.101838,-1.617442,0.402713,0.913601,-0.067192,0.175471,-1.049646
1,-0.325259,1.276263,-0.686123,-2.463205,-0.489426,-0.240715,-1.469496,1.006633,-0.833692,0.957744
2,0.739019,-0.600903,-0.177294,1.335714,-0.817332,-0.790047,1.457365,-0.218981,0.878643,-1.257740
3,0.474312,-1.103002,1.189936,-0.800186,0.912377,-0.406451,-1.130950,1.985111,1.379029,1.041768
4,0.927365,1.114796,0.080284,1.261064,0.761179,0.921563,0.440832,0.184645,-1.567739,-0.142107
...,...,...,...,...,...,...,...,...,...,...
995,1.538272,0.171629,0.075371,-0.957658,-1.066219,1.158096,-0.036964,0.123689,0.927871,-0.225003
996,-0.060266,0.095018,-0.271685,1.830560,0.219445,-0.341269,1.180088,-0.216876,-1.752938,-0.810152
997,0.675563,-0.538420,-1.299500,0.747835,1.733898,-0.268044,-0.520953,2.043336,0.947388,0.790354
998,2.629710,-2.452899,-1.359785,1.592065,0.854157,1.618828,0.621701,0.378898,-1.971894,-0.252250


### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.30,random_state=42)

### Model training

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
lr = LogisticRegression()

In [9]:
lr.fit(X_train,y_train)

In [10]:
y_pred = lr.predict(X_test)
y_pred

array([0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0])

In [11]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [12]:
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))


0.9166666666666666
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       157
           1       0.92      0.90      0.91       143

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300

[[146  11]
 [ 14 129]]


## Hyperparameter Tuning AND Cross Validation

In [13]:
model = LogisticRegression()
penalty=['l1', 'l2', 'elasticnet']
c_values = [100,10,1.0,0.1,0.01]
solver=['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']

In [14]:
params = dict(penalty =penalty,C=c_values,solver = solver)
params

{'penalty': ['l1', 'l2', 'elasticnet'],
 'C': [100, 10, 1.0, 0.1, 0.01],
 'solver': ['lbfgs',
  'liblinear',
  'newton-cg',
  'newton-cholesky',
  'sag',
  'saga']}

## GridSearch CV

def - GridSearchCV in scikit-learn is a class used for hyperparameter tuning through exhaustive search over a specified parameter grid, utilizing cross-validation to evaluate model performance.It gives best parameters that fit your model

link - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

### Stratified K Fold
def - StratifiedKFold is a cross-validation iterator in scikit-learn that generates stratified folds, ensuring each fold maintains the same proportion of samples from each class as the original dataset.

link - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [15]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold()

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
grid = GridSearchCV(estimator=model,param_grid=params,scoring='accuracy',cv=cv ,n_jobs=-1)
#cv = cross validation
#scoring can which we have to calculate it can be precision,recall etc.

In [18]:
grid

In [19]:
grid.fit(X_train,y_train)

250 fits failed out of a total of 450.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  

In [20]:
grid.best_params_  #best parameters is selected 

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [21]:
grid.best_score_  #this is best accuracy we get

np.float64(0.9228571428571428)

In [22]:
y_pred_new = grid.predict(X_test)

In [23]:
print(accuracy_score(y_test,y_pred_new))
print(confusion_matrix(y_test,y_pred_new))
print(classification_report(y_test,y_pred_new))


0.9266666666666666
[[150   7]
 [ 15 128]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       157
           1       0.95      0.90      0.92       143

    accuracy                           0.93       300
   macro avg       0.93      0.93      0.93       300
weighted avg       0.93      0.93      0.93       300



## Randomised Search CV

def - The RandomizedSearchCV class in scikit-learn performs randomized search over hyperparameter settings using cross-validation to find the optimal parameters for a given estimator.
Unlike GridSearchCV, which exhaustively tries all combinations, RandomizedSearchCV samples a fixed number of parameter settings (n_iter) from specified distributions, making it more efficient for large parameter spaces.

link - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

In [24]:
from sklearn.model_selection import RandomizedSearchCV

In [25]:
model_new = LogisticRegression()
randomcv = RandomizedSearchCV(model_new,param_distributions=params,cv = 5, scoring='accuracy')

In [26]:
randomcv.fit(X_train,y_train)

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\Win\AppData\Roaming\Python\Python313\site-packages\sklearn\linear_model\_logistic.py", line 1193, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  Fi

In [27]:
randomcv.best_score_

np.float64(0.9128571428571428)

In [28]:
randomcv.best_params_

{'solver': 'newton-cholesky', 'penalty': 'l2', 'C': 10}

In [29]:
y_pred_randomcv = randomcv.predict(X_test) 

In [30]:
print(accuracy_score(y_test,y_pred_randomcv))
print(confusion_matrix(y_test,y_pred_randomcv))
print(classification_report(y_test,y_pred_randomcv))

0.9166666666666666
[[146  11]
 [ 14 129]]
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       157
           1       0.92      0.90      0.91       143

    accuracy                           0.92       300
   macro avg       0.92      0.92      0.92       300
weighted avg       0.92      0.92      0.92       300

