In [9]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score

from matplotlib import pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

In [10]:
# Build one DataFrame from the scikit-learn breast-cancer bunch: the feature
# matrix under its published column names, plus the label as a 'target' column.
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Every column except the label is used as a model input.
target = 'target'
used_cols = [col for col in df.columns if col != target]
X = df[used_cols]
y = df[target]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Recursive Feature Elimination

In [11]:
# Baseline: logistic regression fitted on every input column and scored on the
# held-out split, so the feature-selected model below has a reference point.
lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42, n_jobs=-1, max_iter=500)
lr.fit(X_train, y_train)
y_score = lr.predict(X_test)
print("Accuracy No Feature Selection:",accuracy_score(y_test, y_score))
print('orignial features:', len(used_cols))
print()

# Recursive Feature Elimination
from sklearn.feature_selection import RFE

# How many input columns RFE is asked to keep.
N_FEATURES_TO_SELECT = 7

# NOTE(review): the inputs are not standardized before RFE. If the elimination
# ranking is driven by coefficient magnitude, it will depend on each feature's
# units -- consider scaling first (confirm against the RFE documentation).
rfe = RFE(lr, n_features_to_select=N_FEATURES_TO_SELECT)
rfe.fit(X_train, y_train)
y_guess = rfe.predict(X_train)  # training-set predictions; not read anywhere in this cell
y_score = rfe.predict(X_test)

# Translate the selector's column mask into feature names, using the columns of
# the frame the selector was actually fitted on.
rfe_support = rfe.get_support()
rfe_feature = X_train.columns[rfe_support].tolist()
print("Accuracy With Feature Selection:",accuracy_score(y_test, y_score))
print('Selected features:', len(rfe_feature))

Accuracy No Feature Selection: 0.9707602339181286
orignial features: 30

Accuracy With Feature Selection: 0.9824561403508771
Selected features: 7


### Which Features Did We Choose?

In [12]:
rfe_feature

['mean radius',
 'mean concavity',
 'worst radius',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry']

## Sequential Feature Selection

In [6]:
# Baseline model: all input columns, evaluated on the held-out split.
lr = LogisticRegression(
    class_weight='balanced',
    solver='lbfgs',
    random_state=42,
    n_jobs=-1,
    max_iter=500,
)
lr.fit(X_train, y_train)
y_guess = lr.predict(X_train)  # training-set predictions; not read anywhere in this cell
y_score = lr.predict(X_test)
print("Accuracy No Feature Selection:", accuracy_score(y_test, y_score))
print('orignial features:', len(used_cols))
print()

# Sequential Feature Selection
from mlxtend.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(lr, k_features='best', n_jobs=-1)
sfs.fit(X_train, y_train)

# The selector is used only to pick column names; `lr` is then refitted on that
# subset of the training data and scored on the same subset of the test data.
features = [*sfs.k_feature_names_]
lr.fit(X_train.loc[:, features], y_train)
y_score = lr.predict(X_test.loc[:, features])

# Report test accuracy and how many columns were kept.
print("Accuracy With Feature Selection:", accuracy_score(y_test, y_score))
print('Selected features:', len(features))

Accuracy No Feature Selection: 0.9707602339181286
orignial features: 30

Accuracy With Feature Selection: 0.9649122807017544
Selected features: 23


### Which Features Did We Choose?

In [8]:
features

['mean texture',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'texture error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry']

## Exhaustive Feature Selection

In [5]:
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# Presumably it was disabled because of run time (exhaustive subset search) -- confirm.
# If it is re-enabled: the estimator below omits max_iter=500, unlike the
# LogisticRegression used in the RFE and sequential-selection sections, and the
# inner "# y_guess = sfs.predict(X_train)" line refers to `sfs`, not `efs`.

# #No Feature Selection
# lr = LogisticRegression(class_weight = 'balanced', solver = 'lbfgs', random_state=42, n_jobs=-1)
# lr.fit(X_train, y_train)
# y_guess = lr.predict(X_train)
# y_score = lr.predict(X_test)
# print("Accuracy No Feature Selection:",accuracy_score(y_test, y_score))
# print('orignial features:', str(len(used_cols)))
# print()

# # Exhaustive Feature Elimination
# from mlxtend.feature_selection import ExhaustiveFeatureSelector
# efs = ExhaustiveFeatureSelector(lr, n_jobs=-1, max_features=5)
# efs.fit(X_train, y_train)
# # y_guess = sfs.predict(X_train)
# features = list(efs.best_feature_names_)
# lr.fit(X_train[features], y_train)
# y_score = lr.predict(X_test[features])

# #save features
# print("Accuracy With Feature Selection:",accuracy_score(y_test, y_score))
# print('Selected features:', str(len(features)))