## Logistic regression - Feature selection methods

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets as ds

In [5]:
iris = ds.load_iris()
X = iris.data
y = iris.target
print(X[: 5, :])
print(X.shape)

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
(150, 4)


In [6]:
# manually increase the number of features
np.random.seed(100)
E = np.random.uniform(0, 1, size=(len(X), 10))
X = np.hstack((X, E))
print(X.shape)

(150, 14)


In [7]:
# total number of feature 14

In [9]:
# Data Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100, test_size=0.3)
print(X_train.shape)

(105, 14)


In [10]:
print(X_test.shape)

(45, 14)


## Removing features with low variance

 it removes all zero-variance features.

In [15]:

from sklearn.feature_selection import VarianceThreshold

sel_variance_threshold = VarianceThreshold() 
X_train_remove_variance = sel_variance_threshold.fit_transform(X_train)
print(X_train_remove_variance.shape)


(105, 14)


###  Univariant Feature selection

In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

sel_chi2 = SelectKBest(chi2, k=4)    # select 4 features
X_train_chi2 = sel_chi2.fit_transform(X_train, y_train)
print(sel_chi2.get_support())

[ True  True  True  True False False False False False False False False
 False False]


## Recursive feature elimination

In [20]:
from sklearn.linear_model import LogisticRegression

model_logistic = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)


In [22]:
from sklearn.feature_selection import RFE

sel_rfe_logistic = RFE(estimator=model_logistic, n_features_to_select=4, step=1)
X_train_rfe_logistic = sel_rfe_logistic.fit_transform(X_train, y_train)

In [23]:
print(sel_rfe_logistic.get_support())

[False  True  True  True False False False False False False False False
 False  True]


### Model before feature selection

In [27]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

model_logistic = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000)
model_logistic.fit(X_train, y_train)
predict = model_logistic.predict(X_test)
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

[[16  0  0]
 [ 0 11  0]
 [ 0  2 16]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.85      1.00      0.92        11
           2       1.00      0.89      0.94        18

    accuracy                           0.96        45
   macro avg       0.95      0.96      0.95        45
weighted avg       0.96      0.96      0.96        45



In [29]:
# Feature selection done based on the chi square
#X_train_chi2

### Model after feature selection

In [31]:
model_logistic = LogisticRegression(solver='saga', multi_class='multinomial', max_iter=10000)
model_logistic.fit(X_train_chi2, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
X_test_chi2 = sel_chi2.transform(X_test)
print(X_test.shape)
print(X_test_chi2.shape)

(45, 14)
(45, 4)


In [33]:
# use the feature which has be selected by shisquare

In [34]:
predict = model_logistic.predict(X_test_chi2)
print(confusion_matrix(y_test, predict))
print(classification_report(y_test, predict))

[[16  0  0]
 [ 0 11  0]
 [ 0  1 17]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       0.92      1.00      0.96        11
           2       1.00      0.94      0.97        18

    accuracy                           0.98        45
   macro avg       0.97      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

