In [1]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn import model_selection
from sklearn.model_selection import KFold

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
import pandas as pd


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold

## Data

In [3]:
data = pd.read_csv('feature_selection_data.csv')

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,y
0,26,Private,289700,Some-college,10,Married-civ-spouse,Other-service,Husband,White,Male,0,0,25,United-States,0
1,54,Private,329733,HS-grad,9,Never-married,Exec-managerial,Unmarried,White,Male,0,0,40,United-States,1
2,49,Local-gov,268234,HS-grad,9,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,21,Private,105577,Some-college,10,Never-married,Other-service,Own-child,White,Female,0,0,30,United-States,0
4,49,Local-gov,174981,Masters,14,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,47,United-States,1


In [5]:
data.shape

(2000, 15)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 15 columns):
age               2000 non-null int64
workclass         2000 non-null object
fnlwgt            2000 non-null int64
education         2000 non-null object
education.num     2000 non-null int64
marital.status    2000 non-null object
occupation        2000 non-null object
relationship      2000 non-null object
race              2000 non-null object
sex               2000 non-null object
capital.gain      2000 non-null int64
capital.loss      2000 non-null int64
hours.per.week    2000 non-null int64
native.country    2000 non-null object
y                 2000 non-null int64
dtypes: int64(7), object(8)
memory usage: 234.5+ KB


In [7]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
y                 0
dtype: int64

In [8]:
data.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,y
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,40.391,191776.4715,10.4635,2023.5435,131.647,42.2725,0.4755
std,12.963454,103014.656714,2.576623,10228.617873,494.896253,12.424549,0.499524
min,17.0,20057.0,1.0,0.0,0.0,1.0,0.0
25%,30.0,120914.5,9.0,0.0,0.0,40.0,0.0
50%,40.0,180060.0,10.0,0.0,0.0,40.0,0.0
75%,49.25,238095.0,13.0,0.0,0.0,50.0,1.0
max,84.0,981628.0,16.0,99999.0,2824.0,99.0,1.0


## Splitting into numeric and categorical columns
### Numeric data - MinMax scalling 
### Categorical data - OneHotEncoding

In [9]:
data_numerics = data.select_dtypes(include = ['number'])
data_categorics = data.select_dtypes(exclude=['number'])

categorical_dummies = pd.get_dummies(data_categorics)

numeric_scaler = MinMaxScaler()
numeric_scaled = numeric_scaler.fit_transform(data_numerics)
numeric_scaled = pd.DataFrame(numeric_scaled,columns=data_numerics.columns,index=data_numerics.index)

data_adult = pd.concat([categorical_dummies,numeric_scaled],axis=1)

In [10]:
data_adult.shape

(2000, 97)

In [11]:
data_adult.head()

Unnamed: 0,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,education_10th,education_11th,education_12th,...,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,y
0,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0.134328,0.280419,0.6,0.0,0.0,0.244898,0.0
1,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0.552239,0.322052,0.533333,0.0,0.0,0.397959,1.0
2,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0.477612,0.258095,0.533333,0.0,0.0,0.397959,1.0
3,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0.059701,0.088938,0.6,0.0,0.0,0.295918,0.0
4,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0.477612,0.161116,0.866667,0.0,0.0,0.469388,1.0


In [12]:
X = data_adult.iloc[:,:-1]
y = data_adult.iloc[:,-1]

## Spliting into train and test data

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [15]:
y_train.mean()# balanced dataset

0.475625

## Classification

In [19]:
forest = RandomForestClassifier()
kfold = KFold(n_splits=10,shuffle=True)

In [20]:
score_forest = cross_val_score(forest,
                               X_train,
                               y_train,
                               cv = kfold)

score_forest.mean()

0.78562500000000013

In [21]:
log = LogisticRegression()
score_log = cross_val_score(log,
                           X_train,
                           y_train,
                           cv=kfold)

score_log.mean()

0.80562500000000004

In [56]:
forest.__dict__

{'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=None, splitter='best'),
 'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'estimator_params': ('criterion',
  'max_depth',
  'min_samples_split',
  'min_samples_leaf',
  'min_weight_fraction_leaf',
  'max_features',
  'max_leaf_nodes',
  'min_impurity_split',
  'random_state'),
 'estimators_': [],
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_split': 1e-07,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Filtering according to feature variance

In [47]:
var_selector = VarianceThreshold(0.01)
var_selector.fit(X_train)

VarianceThreshold(threshold=0.01)

In [48]:
var_selector.variances_

array([ 0.04240586,  0.02968711,  0.06456211,  0.21623594,  0.0384    ,
        0.07881094,  0.04354336,  0.024375  ,  0.02733398,  0.00867344,
        0.00435586,  0.00682773,  0.01356094,  0.01051211,  0.03435586,
        0.04126523,  0.15582773,  0.014775  ,  0.21124023,  0.06563711,
        0.        ,  0.02674375,  0.16841836,  0.101775  ,  0.24483398,
        0.01234375,  0.189975  ,  0.02556094,  0.01899961,  0.04240586,
        0.09446836,  0.109375  ,  0.14559023,  0.02496836,  0.03551523,
        0.04411094,  0.07829336,  0.00124844,  0.12837344,  0.02019961,
        0.09446836,  0.02733398,  0.04467773,  0.24973594,  0.16553711,
        0.02378086,  0.10464844,  0.07464844,  0.0475    ,  0.00805898,
        0.02968711,  0.08138711,  0.00744375,  0.11723086,  0.19536094,
        0.19536094,  0.024375  ,  0.00373594,  0.00124844,  0.00187148,
        0.00249375,  0.00124844,  0.00062461,  0.00311523,  0.00249375,
        0.00062461,  0.00311523,  0.00062461,  0.00062461,  0.00

In [50]:
var_selector.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
       False, False, False,  True,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True, False,  True,
        True,  True,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
        True,  True,  True,  True,  True,  True], dtype=bool)

In [51]:
X.columns[var_selector.get_support()]

Index(['workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov', 'education_10th',
       'education_11th', 'education_7th-8th', 'education_9th',
       'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Prof-school', 'education_Some-college',
       'marital.status_Divorced', 'marital.status_Married-civ-spouse',
       'marital.status_Married-spouse-absent', 'marital.status_Never-married',
       'marital.status_Separated', 'marital.status_Widowed', 'occupation_?',
       'occupation_Adm-clerical', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Prof-specialty',
       'occupation_Protective-serv', '

In [52]:
X_train2 = var_selector.transform(X_train)

In [56]:
score_forest = cross_val_score(forest,
                               X_train2,
                               y_train,
                               cv = kfold)

score_forest.mean()

0.78874999999999995

In [57]:
log = LogisticRegression()
score_log = cross_val_score(log,
                           X_train2,
                           y_train,
                           cv=kfold)

score_log.mean()

0.80374999999999996

## Regularization L1

In [58]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV

In [73]:
lr_reg = LogisticRegression(C=0.5,penalty='l1')
lr_reg.fit(X_train,y_train)

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [88]:
sfm = SelectFromModel(lr_reg,threshold=1e-4)
sfm.fit(X_train,y_train)

filtered_data = sfm.transform(X_train)

In [89]:
filtered_data.shape

(1600, 40)

In [90]:
clf = LogisticRegressionCV(Cs=30,penalty='l1',solver='liblinear',cv=kfold)

In [92]:
sfm = SelectFromModel(clf,threshold=1e-4)
sfm.fit(X_train,y_train)

filtered_data = sfm.transform(X_train)

In [93]:
filtered_data.shape

(1600, 74)

## RandomForest classification with features selected with LogisticRegression with L1 regularization

In [108]:
fs_pipeline = Pipeline([("select_features",SelectFromModel(LogisticRegression(C=0.5,penalty='l1'),threshold=1e-4)),
                       ("model",RandomForestClassifier())])

In [109]:
params = {'select_features__threshold':[0.1, 0.2 ,0.4, 0.8, 1, 2]}

In [110]:
gs = GridSearchCV(fs_pipeline,
             param_grid=params,
            cv=kfold,
                 n_jobs=-1)
gs.fit(X_train,y_train)

GridSearchCV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
       error_score='raise',
       estimator=Pipeline(steps=[('select_features', SelectFromModel(estimator=LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, ...imators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'select_features__threshold': [0.1, 0.2, 0.4, 0.8, 1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [111]:
gs.grid_scores_



[mean: 0.78687, std: 0.02673, params: {'select_features__threshold': 0.1},
 mean: 0.79438, std: 0.02081, params: {'select_features__threshold': 0.2},
 mean: 0.79187, std: 0.03719, params: {'select_features__threshold': 0.4},
 mean: 0.78875, std: 0.03184, params: {'select_features__threshold': 0.8},
 mean: 0.78187, std: 0.03336, params: {'select_features__threshold': 1},
 mean: 0.70750, std: 0.02707, params: {'select_features__threshold': 2}]

## Grid.steps

In [142]:
gs.best_estimator_.steps[0][1].get_support().sum()

28

In [123]:
X_train.columns[gs.best_estimator_.steps[0][1].get_support()]

Index(['workclass_Federal-gov', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'education_10th', 'education_11th',
       'education_Assoc-voc', 'education_Bachelors', 'education_Prof-school',
       'marital.status_Married-civ-spouse', 'marital.status_Never-married',
       'occupation_?', 'occupation_Exec-managerial',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
       'occupation_Other-service', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'relationship_Own-child',
       'relationship_Unmarried', 'relationship_Wife', 'sex_Female', 'age',
       'education.num', 'capital.gain', 'capital.loss', 'hours.per.week'],
      dtype='object')

In [132]:
gs.best_estimator_.steps[0][1].estimator_.coef_

array([[ -2.09671715e-04,   5.03704540e-01,  -4.08673753e-02,
          0.00000000e+00,   3.13514451e-01,  -4.37259415e-01,
         -1.24962208e-01,  -2.79458711e-01,  -2.21776080e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -3.83897587e-01,   4.06467400e-01,   0.00000000e+00,
          0.00000000e+00,   1.25605435e-01,   0.00000000e+00,
          9.72854241e-01,  -6.74169346e-02,   0.00000000e+00,
          1.38824004e+00,   0.00000000e+00,  -5.80704182e-01,
         -1.60439083e-01,   0.00000000e+00,  -3.23401044e-01,
          0.00000000e+00,  -1.59420561e-01,   4.45366048e-01,
         -1.70526937e-01,  -7.60705236e-01,  -3.64654736e-01,
         -9.13700205e-01,   0.00000000e+00,   7.24746627e-01,
          2.60385902e-01,   2.11980119e-01,   1.01344685e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -1.94213346e-01,  -9.22984970e-01,  -9.53107743e-01,
        

## RFE - REcursive Feature Elimination

In [145]:
from sklearn.feature_selection import RFE,RFECV

In [146]:
rfe_estimator = LogisticRegression(C=0.5,penalty='l1')

In [147]:
rfe_selector = RFECV(rfe_estimator,
                    cv=kfold,
                    step=5)

In [148]:
rfe_selector.fit(X_train,y_train)

RFECV(cv=KFold(n_splits=10, random_state=None, shuffle=True),
   estimator=LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
   n_jobs=1, scoring=None, step=5, verbose=0)

In [149]:
rfe_selector.grid_scores_

array([ 0.611875,  0.81    ,  0.80625 ,  0.81    ,  0.80625 ,  0.808125,
        0.81125 ,  0.811875,  0.8125  ,  0.8125  ,  0.8125  ,  0.8125  ,
        0.8125  ,  0.8125  ,  0.8125  ,  0.8125  ,  0.8125  ,  0.8125  ,
        0.8125  ,  0.8125  ])

In [150]:
rfe_selector.n_features_

96

In [153]:
X_train.columns[rfe_selector.get_support()]

Index(['workclass_?', 'workclass_Federal-gov', 'workclass_Local-gov',
       'workclass_Private', 'workclass_Self-emp-inc',
       'workclass_Self-emp-not-inc', 'workclass_State-gov', 'education_10th',
       'education_11th', 'education_12th', 'education_1st-4th',
       'education_5th-6th', 'education_7th-8th', 'education_9th',
       'education_Assoc-acdm', 'education_Assoc-voc', 'education_Bachelors',
       'education_Doctorate', 'education_HS-grad', 'education_Masters',
       'education_Preschool', 'education_Prof-school',
       'education_Some-college', 'marital.status_Divorced',
       'marital.status_Married-civ-spouse',
       'marital.status_Married-spouse-absent', 'marital.status_Never-married',
       'marital.status_Separated', 'marital.status_Widowed', 'occupation_?',
       'occupation_Adm-clerical', 'occupation_Craft-repair',
       'occupation_Exec-managerial', 'occupation_Farming-fishing',
       'occupation_Handlers-cleaners', 'occupation_Machine-op-inspct',
     