## Mutual Information with KBest Feature Selection on a Logistic Regression Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.linear_model import LogisticRegression

In [2]:
# make classification data with binary target values
data = datasets.make_classification(
    n_features = 100, 
    n_samples = 250, 
    random_state = 101)

In [3]:
# make a dataframe to improve readability
df = pd.DataFrame(data[0])

# add the targets
df['Y'] = data[1]

# inspect the results
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,Y
0,0.082954,1.12359,0.450332,0.370683,0.317396,-0.93909,-1.441382,-2.004795,-1.651707,1.573928,...,0.934738,0.170818,-1.274354,1.689363,1.367228,0.799577,0.455014,-0.027221,0.069512,1
1,0.331101,-0.011939,-0.189653,0.466539,0.176258,0.637982,0.233582,1.813702,0.287999,0.628121,...,-0.1673,0.481967,-0.736539,2.487001,-0.107273,-0.595674,0.979999,-0.402239,-1.264807,1
2,-1.108849,0.192823,0.61889,-0.925066,-0.768808,-1.23127,-0.548487,1.068495,-1.692912,0.312868,...,-1.149508,0.210644,-0.618396,-0.022537,-0.056109,0.786038,1.209227,0.36231,0.582363,1
3,0.698912,-0.704523,-0.969457,0.145845,-0.72291,-0.206862,1.911764,-0.223875,1.488214,-0.296299,...,1.230865,0.409152,-1.696726,-0.93288,0.09306,-0.481263,-1.120367,-0.144715,0.861984,0
4,1.58748,0.994489,-0.782158,1.414338,-2.074271,-1.278377,0.33977,0.077908,0.670356,-1.945021,...,0.634655,0.395664,0.707859,0.510948,-0.530439,-2.272537,-0.046641,3.354155,-0.734669,0


In [4]:
# select explanatory variables
X = df.iloc[ : , :-1 ]

# .values will give the values in a numpy array (shape: (n,1))
# .ravel will convert that array shape to (n, ) (i.e. flatten it)
# make the target values
y = df.iloc[ : , -1: ].values.ravel()

# Train, test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 101)

# create the modle from the training data
clf = LogisticRegression(random_state = 101).fit(X_train, y_train)

# score the model using the testing data
clf.score(X_test,y_test)

0.8888888888888888

In [5]:
# create pipeline using K best selects where k = 4
pipe = Pipeline(
    [
        ('mutual_info', SelectKBest(mutual_info_classif, k = 4)),
        ('logistic_reg',LogisticRegression(random_state = 101))
    ])

In [6]:
# fit the data to the pipeline
best_ = pipe.fit(X_train,y_train)

# show the k-best features
best_[:-1].get_feature_names_out()

# test the reduced model with testing data
best_.score(X_test, y_test)

0.9047619047619048