In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.DataFrame(data = datasets.load_breast_cancer().data,columns = datasets.load_breast_cancer().feature_names)
df['target'] = datasets.load_breast_cancer().target

In [3]:
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


## Train and Test data

In [4]:
X = df.drop(columns=['target'])
y = df['target']

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=28)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(398, 30)
(398,)
(171, 30)
(171,)


## Model Training using l2 regularization

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

for c in [0.1,1,10,100]:
    
    logreg = LogisticRegression(penalty='l2',max_iter=200,C = c)
    logreg.fit(X_train,y_train)
    
    print('C = ',c)
    print()
    
    y_pred_train = logreg.predict(X_train)
    y_pred_test = logreg.predict(X_test)
    
    print('Train accuracy:',accuracy_score(y_train,y_pred_train))

    print()
    print('Test accuracy:',accuracy_score(y_test,y_pred_test))
    print('-'*50)

C =  0.1

Train accuracy: 0.9547738693467337

Test accuracy: 0.9239766081871345
--------------------------------------------------
C =  1

Train accuracy: 0.9547738693467337

Test accuracy: 0.9473684210526315
--------------------------------------------------
C =  10

Train accuracy: 0.9623115577889447

Test accuracy: 0.9532163742690059
--------------------------------------------------
C =  100

Train accuracy: 0.9673366834170855

Test accuracy: 0.9532163742690059
--------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## Model Training using l1 regularization

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

for c in [0.1,1,10,100]:
    
    logreg = LogisticRegression(penalty='l1',max_iter=200,solver= 'liblinear',C = c)
    logreg.fit(X_train,y_train)
    
    print('C = ',c)
    print()
    
    y_pred_train = logreg.predict(X_train)
    y_pred_test = logreg.predict(X_test)
    
    print('Train accuracy:',accuracy_score(y_train,y_pred_train))

    print()
    print('Test accuracy:',accuracy_score(y_test,y_pred_test))
    print('-'*50)

C =  0.1

Train accuracy: 0.9422110552763819

Test accuracy: 0.9298245614035088
--------------------------------------------------
C =  1

Train accuracy: 0.9547738693467337

Test accuracy: 0.9415204678362573
--------------------------------------------------
C =  10

Train accuracy: 0.9899497487437185

Test accuracy: 0.9473684210526315
--------------------------------------------------
C =  100

Train accuracy: 0.992462311557789

Test accuracy: 0.9415204678362573
--------------------------------------------------


### Since C is inverse to regularization strength, C = 0.1 is most regularized,  and least difference between train and test performance, ie least overfitting .

### C = 100, ie least regularized, and hence we see most overfitting

## Feature Selection using l1 regularization

In [26]:
for c in [0.01,1,100]:
    print("C=",c)
    print()
    logreg = LogisticRegression(penalty='l1',max_iter=200,solver= 'liblinear',C = c,random_state=23)
    logreg.fit(X_train,y_train)
    coeffecients = np.round(logreg.coef_,4)
    print("Model coeffecients: ",coeffecients)
    
    print()
    
    eliminated_features = X_train.columns[coeffecients[0]==0]
    print('Eliminated features: ',eliminated_features)
    print('Total {} features eliminated'.format(len(eliminated_features)))
    print('-'*75)

C= 0.01

Model coeffecients:  [[ 0.      0.      0.1581  0.0036  0.      0.      0.      0.      0.
   0.      0.      0.      0.     -0.022   0.      0.      0.      0.
   0.      0.      0.      0.      0.     -0.0197  0.      0.      0.
   0.      0.      0.    ]]

Eliminated features:  Index(['mean radius', 'mean texture', 'mean smoothness', 'mean compactness',
       'mean concavity', 'mean concave points', 'mean symmetry',
       'mean fractal dimension', 'radius error', 'texture error',
       'perimeter error', 'smoothness error', 'compactness error',
       'concavity error', 'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst smoothness', 'worst compactness',
       'worst concavity', 'worst concave points', 'worst symmetry',
       'worst fractal dimension'],
      dtype='object')
Total 26 features eliminated
---------------------------------------------------------------------------
C=

### So, as we increase strength of regularization ie reduce C, more features get eliminated by l1 (lasso) regularization