In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

## LOAD DATASET

In [2]:
# Load the breast-cancer dataset and assemble a single frame:
# one column per feature, with the class label appended as 'target'.
dataset = load_breast_cancer()
data = pd.DataFrame(dataset.data, columns=dataset.feature_names).assign(target=dataset.target)

In [3]:
# Preview the first rows to sanity-check the column names and values.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
# (rows, columns) — the column count includes the appended 'target' column.
data.shape

(569, 31)

In [5]:
# Class balance: per-class row count divided by the total number of rows.
data['target'].value_counts().div(len(data))

1    0.627417
0    0.372583
Name: target, dtype: float64

## FEATURE SELECTION

### Univariate Selection

In [6]:
# Every column except the last is a feature; the last column ('target') is the label.
X = data.iloc[:, :-1]
Y = data.iloc[:, -1]

Feature selection: score each feature individually and keep the `k` best.

In [7]:
# Univariate selector: score each feature with f_classif and retain the 4 best.
un_selection = SelectKBest(f_classif, k=4)

In [8]:
# Fit the selector on the features and labels; the per-feature scores are
# read from un_fit.scores_ further down.
un_fit = un_selection.fit(X, Y)

Summarize the scores

In [9]:
# Print NumPy arrays with 3 digits of floating-point precision.
set_printoptions(precision=3)

In [10]:
# Score table: one row per feature, in the same order as the columns of X.
# The frame is built column by column so that 'score' keeps its float dtype;
# stacking the names and the scores into one NumPy array first would coerce
# both columns to a single object dtype.
scores = pd.DataFrame({
    'colname': data.columns[:-1],
    'score': un_fit.scores_,
})

In [11]:
# Order the table so the highest-scoring features come first.
scores = scores.sort_values(by='score', ascending=False)

In [12]:
# Display the score table (sorted by descending score in the previous cell).
scores

Unnamed: 0,colname,score
27,worst concave points,964.385
22,worst perimeter,897.944
7,mean concave points,861.676
20,worst radius,860.782
2,mean perimeter,697.235
23,worst area,661.6
0,mean radius,646.981
3,mean area,573.061
6,mean concavity,533.793
26,worst concavity,436.692


Summarize the selected features

In [13]:
# Reduce X to the columns kept by the fitted selector (configured with k=4).
features = un_fit.transform(X)

In [14]:
# Show the first five rows of the selected-feature matrix.
print(features[:5])

[[1.471e-01 2.538e+01 1.846e+02 2.654e-01]
 [7.017e-02 2.499e+01 1.588e+02 1.860e-01]
 [1.279e-01 2.357e+01 1.525e+02 2.430e-01]
 [1.052e-01 1.491e+01 9.887e+01 2.575e-01]
 [1.043e-01 2.254e+01 1.522e+02 1.625e-01]]


### Recursive Feature Elimination

In [15]:
# Imports for the recursive-feature-elimination section.
# NOTE(review): notebook convention is to keep all imports in the first cell.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from warnings import filterwarnings
# NOTE(review): with no category or module argument this silences every
# warning for the rest of the session, including convergence and deprecation
# warnings from the model fit below — consider narrowing the filter.
filterwarnings('ignore')

In [16]:
# Base estimator that RFE refits repeatedly while eliminating features.
model = LogisticRegression(solver='lbfgs')
# Keep the 10 best features. n_features_to_select is passed by keyword:
# passing it positionally is deprecated and rejected by newer scikit-learn
# releases, where it is keyword-only.
rfe = RFE(model, n_features_to_select=10)
rfe_fit = rfe.fit(X, Y)
# n_features_: number of selected features; support_: boolean mask over the
# columns of X; ranking_: 1 for selected features, larger values for features
# that were eliminated.
print("Num Features: %d" % rfe_fit.n_features_)
print("Selected Features: %s" % rfe_fit.support_)
print("Feature Ranking: %s" % rfe_fit.ranking_)

Num Features: 10
Selected Features: [ True False False False False False  True  True False False  True  True
  True False False False False False False False False False False False
 False  True  True  True  True False]
Feature Ranking: [ 1 12 15 19 11  4  1  1  9 18  1  1  1  6 21  5 13 16 17 20 10  2  3 14
  8  1  1  1  1  7]


In [17]:
# Ranking table: one row per feature, in the same order as the columns of X.
# The frame is built column by column so that 'ranking' stays an integer
# column; stacking the ranks and the names into one NumPy array first would
# coerce both columns to a single object dtype.
RFE_rank = pd.DataFrame({
    'ranking': rfe_fit.ranking_,
    'colname': data.columns[:-1],
})

In [18]:
# Best-ranked features first (rank 1 = selected); ascending order is the default.
RFE_rank.sort_values('ranking')

Unnamed: 0,ranking,colname
0,1,mean radius
27,1,worst concave points
26,1,worst concavity
25,1,worst compactness
6,1,mean concavity
7,1,mean concave points
10,1,radius error
11,1,texture error
12,1,perimeter error
28,1,worst symmetry
