# <center> <u> Feature Selection </u> </center>

![image.png](attachment:image.png)

In [1]:
#import these libraries as we are going to use them. 
# Note: just have a look what all libraries you have imported
import pandas as pd
import numpy as np


from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression
from math import sqrt

## 1. Filter Methods




## (a) Missing Value Ratio Threshold


In [2]:
# Load diabetes.csv into the `diabetes` DataFrame.
# NOTE(review): the absolute "/content/" path looks Colab-specific — confirm before running elsewhere.
diabetes = pd.read_csv("/content/diabetes.csv")
# Preview the first rows of the frame
print(diabetes.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Glucose, BloodPressure, SkinThickness, Insulin and BMI cannot legitimately be
# zero, so a 0 in these columns is treated as a missing value and replaced with NaN.
# The result is assigned back through the DataFrame instead of calling
# `.replace(..., inplace=True)` on a selected column: the chained in-place form
# operates on an intermediate object and, with pandas Copy-on-Write enabled,
# leaves `diabetes` unchanged.
zero_as_missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes[zero_as_missing_columns] = diabetes[zero_as_missing_columns].replace(0, np.nan)


In [4]:
# Count the missing (NaN) entries in every column of `diabetes`
missing_values = diabetes.isna().sum()
print(missing_values)

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [5]:
# Share of rows with a missing Glucose value, printed as a percentage
glucose_missing_fraction = missing_values['Glucose'] / len(diabetes)
print(glucose_missing_fraction * 100)


0.6510416666666667


In [6]:
# Share of rows with a missing BloodPressure value, printed as a percentage
blood_pressure_missing_fraction = missing_values['BloodPressure'] / len(diabetes)
print(blood_pressure_missing_fraction * 100)

4.557291666666666


In [7]:
# Share of rows with a missing SkinThickness value, printed as a percentage
skin_thickness_missing_fraction = missing_values['SkinThickness'] / len(diabetes)
print(skin_thickness_missing_fraction * 100)

29.557291666666668


In [8]:
# Share of rows with a missing Insulin value, printed as a percentage
insulin_missing_fraction = missing_values['Insulin'] / len(diabetes)
print(insulin_missing_fraction * 100)

48.69791666666667


In [9]:
# Share of rows with a missing BMI value, printed as a percentage
bmi_missing_fraction = missing_values['BMI'] / len(diabetes)
print(bmi_missing_fraction * 100)

1.4322916666666665


In [10]:
# Keep only the features whose share of missing values is strictly below 10%.
# The share is computed per column with `isna().mean()` and compared directly
# against the limit; a row-count threshold such as `int(n_rows * .9)` truncates,
# which lets a column with slightly more than 10% missing values slip through.
MAX_MISSING_RATIO = 0.10
diabetes_missing_value_threshold = diabetes.loc[:, diabetes.isna().mean() < MAX_MISSING_RATIO]

# Show the frame that remains after the filter
print(diabetes_missing_value_threshold)

     Pregnancies  Glucose  BloodPressure   BMI  DiabetesPedigreeFunction  Age  \
0              6    148.0           72.0  33.6                     0.627   50   
1              1     85.0           66.0  26.6                     0.351   31   
2              8    183.0           64.0  23.3                     0.672   32   
3              1     89.0           66.0  28.1                     0.167   21   
4              0    137.0           40.0  43.1                     2.288   33   
..           ...      ...            ...   ...                       ...  ...   
763           10    101.0           76.0  32.9                     0.171   63   
764            2    122.0           70.0  36.8                     0.340   27   
765            5    121.0           72.0  26.2                     0.245   30   
766            1    126.0           60.0  30.1                     0.349   47   
767            1     93.0           70.0  30.4                     0.315   23   

     Outcome  
0          1

In [11]:
# Split the filtered frame into the target column and the remaining feature columns
diabetes_missing_value_threshold_label = diabetes_missing_value_threshold['Outcome']
diabetes_missing_value_threshold_features = diabetes_missing_value_threshold.drop(columns='Outcome')
 

In [12]:
# Show the feature columns kept by the missing-value filter (the `Outcome` column has been dropped)
print(diabetes_missing_value_threshold_features)

     Pregnancies  Glucose  BloodPressure   BMI  DiabetesPedigreeFunction  Age
0              6    148.0           72.0  33.6                     0.627   50
1              1     85.0           66.0  26.6                     0.351   31
2              8    183.0           64.0  23.3                     0.672   32
3              1     89.0           66.0  28.1                     0.167   21
4              0    137.0           40.0  43.1                     2.288   33
..           ...      ...            ...   ...                       ...  ...
763           10    101.0           76.0  32.9                     0.171   63
764            2    122.0           70.0  36.8                     0.340   27
765            5    121.0           72.0  26.2                     0.245   30
766            1    126.0           60.0  30.1                     0.349   47
767            1     93.0           70.0  30.4                     0.315   23

[768 rows x 6 columns]


In [13]:
# Show the target column (`Outcome`) taken from the filtered frame
print(diabetes_missing_value_threshold_label)

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64


## (b) Variance Threshold



In [14]:
# Rebind `diabetes` to the contents of diabetes_cleaned.csv; the frame previously
# held under this name is replaced.
# NOTE(review): the absolute "/content/" path looks Colab-specific — confirm before running elsewhere.
diabetes = pd.read_csv("/content/diabetes_cleaned.csv")
# Preview the first rows of the frame
print(diabetes.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness     Insulin   BMI  \
0          6.0    148.0           72.0           35.0  218.937760  33.6   
1          1.0     85.0           66.0           29.0   70.189298  26.6   
2          8.0    183.0           64.0           29.0  269.968908  23.3   
3          1.0     89.0           66.0           23.0   94.000000  28.1   
4          0.0    137.0           40.0           35.0  168.000000  43.1   

   DiabetesPedigreeFunction   Age  Outcome  
0                     0.627  50.0        1  
1                     0.351  31.0        0  
2                     0.672  32.0        1  
3                     0.167  21.0        0  
4                     2.288  33.0        1  


In [15]:
# Separate the target (`Outcome`) from the feature columns
y = diabetes['Outcome']
x = diabetes.drop(columns='Outcome')


In [16]:
# Variance of each feature column of `x` (axis=0 aggregates down the rows), before any scaling
np.var(x,axis=0)

Pregnancies                   11.339272
Glucose                      928.469828
BloodPressure                146.131068
SkinThickness                 77.184935
Insulin                     9471.909972
BMI                           48.750058
DiabetesPedigreeFunction       0.109636
Age                          138.122964
dtype: float64

In [17]:
# import minmax_scale
from sklearn.preprocessing import minmax_scale
# Rescale every feature of `x` to the range (0, 10). The scaled values are first held
# in `x_scaled_df` and then wrapped in the DataFrame `X_scaled`, which reuses the
# column names of `x`.
x_scaled_df = minmax_scale(x,feature_range=(0,10))
X_scaled = pd.DataFrame(x_scaled_df,columns=x.columns)

In [18]:
# Show the scaled feature frame `X_scaled`
print(X_scaled)

     Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0       3.529412  6.709677       4.897959       3.043478  2.740295  3.149284   
1       0.588235  2.645161       4.285714       2.391304  1.018185  1.717791   
2       4.705882  8.967742       4.081633       2.391304  3.331099  1.042945   
3       0.588235  2.903226       4.285714       1.739130  1.293850  2.024540   
4       0.000000  6.000000       1.632653       3.043478  2.150572  5.092025   
..           ...       ...            ...            ...       ...       ...   
763     5.882353  3.677419       5.306122       4.456522  2.289500  3.006135   
764     1.176471  5.032258       4.693878       2.173913  2.044244  3.803681   
765     2.941176  4.967742       4.897959       1.739130  1.502241  1.635992   
766     0.588235  5.290323       3.673469       2.391304  2.217956  2.433538   
767     0.588235  3.161290       4.693878       2.608696  1.215086  2.494888   

     DiabetesPedigreeFunction       Age

In [19]:
# Variance of each column of `X_scaled` (axis=0), to see how the spread looks after min-max scaling
np.var(X_scaled,axis=0)

Pregnancies                 3.923624
Glucose                     3.864599
BloodPressure               1.521565
SkinThickness               0.911920
Insulin                     1.269563
BMI                         2.038719
DiabetesPedigreeFunction    1.998841
Age                         3.836749
dtype: float64

In [20]:
## Remove low-variance features with a variance threshold

# import VarianceThreshold

from sklearn.feature_selection import VarianceThreshold



# Create the selector with threshold=1 and store it in `variance_threshold`
variance_threshold=VarianceThreshold(threshold=1)

# Fit on the scaled features, then show the boolean mask of the columns that are kept
variance_threshold.fit(X_scaled)
variance_threshold.get_support()

array([ True,  True,  True, False,  True,  True,  True,  True])

In [21]:
# Names of the columns rejected by the variance filter, i.e. every column of
# `X_scaled` that is not among the columns flagged True in the support mask.
retained_columns = x.columns[variance_threshold.get_support()]
constant_columns = [name for name in X_scaled.columns if name not in retained_columns]


In [22]:
# Drop the columns listed in `constant_columns` from `X_scaled`.
# The frame is rebound instead of mutated in place, and errors="ignore" makes the
# cell safe to re-run: a second execution no longer raises KeyError for columns
# that are already gone.
X_scaled = X_scaled.drop(columns=constant_columns, errors="ignore")



In [69]:
# Show the first rows and the remaining column names of `X_scaled` after the drop
print(X_scaled.head())
print(X_scaled.columns)

   Pregnancies   Glucose  BloodPressure   Insulin       BMI  \
0     3.529412  6.709677       4.897959  2.740295  3.149284   
1     0.588235  2.645161       4.285714  1.018185  1.717791   
2     4.705882  8.967742       4.081633  3.331099  1.042945   
3     0.588235  2.903226       4.285714  1.293850  2.024540   
4     0.000000  6.000000       1.632653  2.150572  5.092025   

   DiabetesPedigreeFunction       Age  
0                  2.344150  4.833333  
1                  1.165670  1.666667  
2                  2.536294  1.833333  
3                  0.380017  0.000000  
4                  9.436379  2.000000  
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


## (c) Chi-Squared statistical test (SelectKBest)




In [24]:
def generate_feature_scores_df(X,Score):
    """Pair each column of ``X`` with its score in a two-column DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; only its column labels are used.
    Score : sequence
        Must support ``Score[i]`` for every column position ``i`` of ``X``
        (for example the ``scores_`` array of a fitted selector).

    Returns
    -------
    pandas.DataFrame
        Columns ``Features`` and ``Score``, one row per column of ``X`` in
        column order, indexed ``0 .. n_features - 1``. For a frame without
        columns the result is empty but still has both columns.
    """
    n_features = X.shape[1]
    # Build the frame in a single step. Growing it with pd.concat inside a loop
    # re-copies all earlier rows on every iteration, and concatenating onto an
    # empty DataFrame raises a FutureWarning in recent pandas releases.
    return pd.DataFrame(
        {
            "Features": [X.columns[i] for i in range(n_features)],
            "Score": [Score[i] for i in range(n_features)],
        },
        index=range(n_features),
    )

In [51]:
# Rebind `diabetes` to a fresh read of diabetes.csv; the frame previously held
# under this name is replaced.
# NOTE(review): the absolute "/content/" path looks Colab-specific — confirm before running elsewhere.
diabetes = pd.read_csv("/content/diabetes.csv")

In [52]:
# Split `diabetes` into the target `y` (the `Outcome` column) and the feature frame `x`
y = diabetes['Outcome']
x = diabetes.drop(columns='Outcome')

In [53]:
#import chi2 and SelectKBest
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


In [54]:
# Cast every feature column to float; `x` is rebound to the converted frame
x = x.astype(float)

In [55]:
# Set up SelectKBest with the chi-squared score function and k=4 (keep four features)
chi2_test=SelectKBest(chi2,k=4)

# Fit the selector on the features `x` and the target `y`
chi2_model=chi2_test.fit(x,y)



In [56]:
# Show the scores computed by the fitted chi2 selector (one value per feature)
chi2_model.scores_

array([ 111.51969064, 1411.88704064,   17.60537322,   53.10803984,
       2175.56527292,  127.66934333,    5.39268155,  181.30368904])

In [60]:
# Use generate_feature_scores_df to pair each feature name of `x` with its chi2 score
# (`x` and `chi2_model.scores_` are passed as the arguments)
feature_score_df=generate_feature_scores_df(x,chi2_model.scores_)


# Display feature_score_df
feature_score_df


Unnamed: 0,Features,Score
0,Pregnancies,111.519691
1,Glucose,1411.887041
2,BloodPressure,17.605373
3,SkinThickness,53.10804
4,Insulin,2175.565273
5,BMI,127.669343
6,DiabetesPedigreeFunction,5.392682
7,Age,181.303689


In [62]:
# Reduce `x` to the features selected by the fitted chi2 selector using `transform`;
# the result is stored in `X_new`
X_new= chi2_model.transform(x)


[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 ...
 [121.  112.   26.2  30. ]
 [126.    0.   30.1  47. ]
 [ 93.    0.   30.4  23. ]]


In [87]:
# Convert X_new into a DataFrame.
# No column names are passed here, so the frame gets default column labels.

X_new=pd.DataFrame(X_new)


In [88]:
def get_features(raw_df,processed_df):
  """Recover the names of the ``raw_df`` columns that make up ``processed_df``.

  Columns are matched by content: each column of ``processed_df`` is compared
  (with ``Series.equals``) against the columns of ``raw_df`` in order, and the
  label of the first raw column that matches and has not already been matched
  is recorded.

  Parameters
  ----------
  raw_df : pandas.DataFrame
      Frame whose column labels are looked up.
  processed_df : pandas.DataFrame
      Frame whose columns are expected to be copies of columns of ``raw_df``.

  Returns
  -------
  list
      At most one label per column of ``processed_df``, in ``processed_df``
      column order. A processed column with no match contributes nothing.
  """
  selected_features = []
  # Raw column positions already assigned to a processed column. Without this,
  # identical raw columns would each be reported for the same processed column,
  # yielding more names than there are selected features.
  claimed_positions = set()
  for i in range(len(processed_df.columns)):
    processed_column = processed_df.iloc[:,i]
    for j in range(len(raw_df.columns)):
      if j in claimed_positions:
        continue
      if processed_column.equals(raw_df.iloc[:,j]):
        selected_features.append(raw_df.columns[j])
        claimed_positions.add(j)
        break
  return selected_features

In [89]:
# Call get_features with `x` (the original features) and `X_new` (the reduced frame)
# to recover the names of the selected columns
selected_features=get_features(x,X_new)

# Display selected_features
selected_features

['Glucose', 'Insulin', 'BMI', 'Age']

Now subset X to the features listed in `selected_features` and save the resulting DataFrame in the variable `chi2_best_features`.

In [91]:

# Keep only the columns of `x` named in `selected_features`
chi2_best_features = x[selected_features]
# Display the first rows of chi2_best_features
chi2_best_features.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148.0,0.0,33.6,50.0
1,85.0,0.0,26.6,31.0
2,183.0,0.0,23.3,32.0
3,89.0,94.0,28.1,21.0
4,137.0,168.0,43.1,33.0


## (d) ANOVA F-Test


In [94]:
#import libraries
from sklearn.feature_selection import f_classif,SelectPercentile

# Percentile-based selector scored with f_classif, keeping the top 80 percent of features
Anova_test = SelectPercentile(score_func=f_classif, percentile=80)

# Fit the selector on the features `x` and the target `y`
Anova_model = Anova_test.fit(x, y)


In [96]:
# Print the scores computed by the fitted ANOVA selector (one value per feature)
print(Anova_model.scores_)

[ 39.67022739 213.16175218   3.2569504    4.30438091  13.28110753
  71.7720721   23.8713002   46.14061124]


In [98]:
# Use generate_feature_scores_df to pair each feature name of `x` with its ANOVA score
# (`x` and `Anova_model.scores_` are passed as the arguments)

feature_scores_df=generate_feature_scores_df(x,Anova_model.scores_)
# Print the resulting feature/score table
print(feature_scores_df)

                   Features       Score
0               Pregnancies   39.670227
1                   Glucose  213.161752
2             BloodPressure    3.256950
3             SkinThickness    4.304381
4                   Insulin   13.281108
5                       BMI   71.772072
6  DiabetesPedigreeFunction   23.871300
7                       Age   46.140611


In [101]:
# Integer positions of the columns kept by the fitted ANOVA selector (indices=True)
cols = Anova_model.get_support(indices=True)
print(cols)
# Reduce `x` to those columns by positional indexing (`iloc`, not `transform`);
# `X_new` is rebound to the result

X_new = x.iloc[:,cols]



[0 1 4 5 6 7]


In [102]:
# Display the first rows of the ANOVA-selected features
X_new.head()

Unnamed: 0,Pregnancies,Glucose,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6.0,148.0,0.0,33.6,0.627,50.0
1,1.0,85.0,0.0,26.6,0.351,31.0
2,8.0,183.0,0.0,23.3,0.672,32.0
3,1.0,89.0,94.0,28.1,0.167,21.0
4,0.0,137.0,168.0,43.1,2.288,33.0
