In [1]:
#Importing the required libraries
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold

import shap
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor
from boruta import BorutaPy
#from sklearn.preprocessing import OneHotEncoder, LabelEncoder
#from sklearn import tree

In [2]:
class Data:
    def __init__(self,df, pfeatures, target):
        '''
        function to intialise the class object 
        @params
        df = dataframe containing all the data with target variable 
        pfeatures=  list of columns other than target to do feature selection
        target = target variable in list
        '''
        self.df = df
        self.pfeatures = pfeatures
        self.target = target 
        
        

In [36]:
class FeatureSelection(Data):
    
    def __init__(self, Data, n_feature):
        '''
        function to intialise the class object 
        '''
        self.df = Data.df
        self.pfeatures = Data.pfeatures
        self.target = Data.target
        self.feature_results = pd.DataFrame()
        self.n_feature = n_feature
        self.union_feature = []
        self.inter_feature = []
    
    def recursive(self,estimator):
        '''
        function to do recursive feature selection and accepts estimator
        as parameter which is sklearn ML algorithm object 
        '''
        selector = RFE(estimator, 
                       n_features_to_select= self.n_feature, step=1)
        selector = selector.fit(self.df[self.pfeatures], 
                                self.df[self.target].values.ravel())
        temp = pd.DataFrame({"Features":self.pfeatures,
                             "Ranking":selector.ranking_,
                             "Method":'RFE'})
        temp = temp.sort_values('Ranking')
        
        temp1 = temp.head(self.n_feature)
        self.inter_feature = temp1['Features'].tolist()
        self.union_feature = temp1['Features'].tolist()
        self.feature_results=self.feature_results.append(temp)
        #print("Overall Accuracy using RFE: ", selector.score(self.df[self.pfeatures],self.df[self.target].values.ravel()))
        return temp

    def coefficient_based(self,estimator):
        '''
        function to do coefficient based feature selection and accepts estimator
        as parameter which is sklearn ML algorithm object 
        '''
        selector = SelectFromModel(estimator)
        selector = selector.fit(self.df[self.pfeatures], 
                                self.df[self.target].values.ravel())
        temp=pd.DataFrame({"Features":self.pfeatures,
                           "Selected":selector.get_support(),
                           "Selected_coef": selector.estimator_.coef_[0],
                           "Method":'Coefficient_based',
                           "Threshold":selector.threshold_})
        temp = temp.sort_values(by=['Selected'], ascending=False)
        
        temp1 = temp.head(self.n_feature)
        temp1_list = temp1['Features'].tolist()
        self.inter_feature = list(set(self.inter_feature) & set(temp1_list))
        self.union_feature = self.union_feature + temp1_list
        self.feature_results= self.feature_results.append(temp)
        return temp

    def ShAP_XGB(self, estimator):
        '''
        function to consider all possible predictions for an instance using all
        possible combinations of inputs based on elements of game theory and accepts estimator
        as parameter which is sklearn ML algorithm object 
        '''
        #Implemeting the feature selection using ShAP
        model = estimator
        # Fit the Model
        model.fit(self.df[self.pfeatures], 
                  self.df[self.target].values.ravel())

        # load JS visualization code to notebook
        #shap.initjs()

        #Collect the explainer and the shap_values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(self.df[self.pfeatures])

        #Plot a standard bar plot
        #shap.summary_plot(shap_values, features=self.df[self.pfeatures], feature_names=self.df[self.pfeatures].columns, plot_type='bar')
        #shap.summary_plot(shap_values, features=self.df[self.pfeatures], feature_names=self.df[self.pfeatures].columns)
        shap_sum = np.abs(shap_values).mean(axis=0)
        importance_df = pd.DataFrame([self.df[self.pfeatures].columns.tolist(), shap_sum.tolist()]).T
        importance_df.columns = ['Features', 'shap_importance']
        importance_df = importance_df.sort_values('shap_importance', ascending=False)
        importance_df['Method'] = 'ShAP'
        
        temp1 = importance_df.head(self.n_feature)
        temp1_list = temp1['Features'].tolist()
        self.inter_feature = list(set(self.inter_feature) & set(temp1_list))
        self.union_feature = self.union_feature + temp1_list
        self.feature_results= self.feature_results.append(importance_df)
        return importance_df
    
    def Boruta_rf(self, estimator):
        '''
        function selects the features that actually have relationship with outcome variable
        The algorithm reshuffles the data to create shadow features
        It eliminates the features that have significantly worst importance than shadow ones
        There are few limitations in the approach and accepts estimator
        as parameter which is sklearn ML algorithm object 
        '''
        rf_model = estimator
        feat_selector = BorutaPy(rf_model,n_estimators = 'auto', verbose= 0,max_iter= 100)
        feat_selector.fit(self.df[self.pfeatures].values, self.df[self.target].values.ravel())
        temp = pd.DataFrame({"Features":self.pfeatures,
                             "Ranking":feat_selector.ranking_,
                             "Method":'Boruta'})
        temp = temp.sort_values('Ranking')
        
        temp1 = temp.head(self.n_feature)
        temp1_list = temp1['Features'].tolist()
        self.inter_feature = list(set(self.inter_feature) & set(temp1_list))
        self.union_feature = self.union_feature + temp1_list
        self.feature_results=self.feature_results.append(temp)
        return temp
    
    def Var_threshold(self, threshold_value):
        '''
        function selects the features based on the defined threshold taking 
        into consideration the multicollinearity and accepts threshold_value
        as parameter 
        '''
        selector = VarianceThreshold(threshold = threshold_value)
        selector.fit(self.df[self.pfeatures])
        #selector.get_support()
        temp = pd.DataFrame({"Features":self.pfeatures,
                             "Ranking":selector.get_support(),
                             "Method":'Variance Threshold'})
        temp = temp.sort_values(by=['Ranking'], ascending=False)
        
        temp1 = temp.head(self.n_feature)
        temp1_list = temp1['Features'].tolist()
        self.inter_feature = list(set(self.inter_feature) & set(temp1_list))
        self.union_feature = self.union_feature + temp1_list
        self.feature_results=self.feature_results.append(temp)
        return temp


In [37]:
# Read & Display data
data = pd.read_csv("C:/Users/Aman.aggarwal/Documents/Exelon/Chem Sampling/wine.csv")
data.head()

Unnamed: 0,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [38]:
#Assigning the dependent & independent variable
pfeatures = ['Malic.acid', 'Ash', 'Acl', 'Mg', 'Phenols',
             'Flavanoids', 'Nonflavanoid.phenols', 'Proanth', 'Color.int', 'Hue', 'OD', 'Proline']
target=['Alcohol']

#Creating the object for Data function
r= Data(data, pfeatures, target)

In [39]:
#Creating the object for Data function
r= Data(data, pfeatures, target)

In [40]:
#Creating the object for feature selection and specifying the number of features 
m = FeatureSelection(r, 5)

#Assigning the models
estimator1 = DecisionTreeRegressor()
estimator2 = SVR(kernel="linear")
estimator3 = XGBRegressor(n_estimators=1000, max_depth=10, learning_rate=0.001)
estimator4 = RandomForestRegressor(n_jobs= 4, oob_score= True)

In [41]:
#Calling the RFE function
m.recursive(estimator1)

Unnamed: 0,Features,Ranking,Method
1,Ash,1,RFE
4,Phenols,1,RFE
6,Nonflavanoid.phenols,1,RFE
8,Color.int,1,RFE
11,Proline,1,RFE
9,Hue,2,RFE
0,Malic.acid,3,RFE
5,Flavanoids,4,RFE
3,Mg,5,RFE
2,Acl,6,RFE


In [42]:
m.coefficient_based(estimator2)

Unnamed: 0,Features,Selected,Selected_coef,Method,Threshold
0,Malic.acid,True,0.162499,Coefficient_based,0.109256
4,Phenols,True,-0.127438,Coefficient_based,0.109256
5,Flavanoids,True,0.16222,Coefficient_based,0.109256
7,Proanth,True,-0.24807,Coefficient_based,0.109256
8,Color.int,True,0.180297,Coefficient_based,0.109256
10,OD,True,0.292363,Coefficient_based,0.109256
1,Ash,False,0.042729,Coefficient_based,0.109256
2,Acl,False,-0.041879,Coefficient_based,0.109256
3,Mg,False,0.004349,Coefficient_based,0.109256
6,Nonflavanoid.phenols,False,0.02944,Coefficient_based,0.109256


In [43]:
m.ShAP_XGB(estimator3)

Unnamed: 0,Features,shap_importance,Method
8,Color.int,0.165789,ShAP
11,Proline,0.0288032,ShAP
0,Malic.acid,0.0,ShAP
1,Ash,0.0,ShAP
2,Acl,0.0,ShAP
3,Mg,0.0,ShAP
4,Phenols,0.0,ShAP
5,Flavanoids,0.0,ShAP
6,Nonflavanoid.phenols,0.0,ShAP
7,Proanth,0.0,ShAP


In [44]:
m.Boruta_rf(estimator4)

Unnamed: 0,Features,Ranking,Method
1,Ash,1,Boruta
4,Phenols,1,Boruta
7,Proanth,1,Boruta
8,Color.int,1,Boruta
11,Proline,1,Boruta
0,Malic.acid,2,Boruta
2,Acl,3,Boruta
5,Flavanoids,4,Boruta
9,Hue,5,Boruta
10,OD,6,Boruta


In [49]:
m.Var_threshold(0.8)

Unnamed: 0,Features,Ranking,Method
0,Malic.acid,True,Variance Threshold
2,Acl,True,Variance Threshold
3,Mg,True,Variance Threshold
5,Flavanoids,True,Variance Threshold
8,Color.int,True,Variance Threshold
11,Proline,True,Variance Threshold
1,Ash,False,Variance Threshold
4,Phenols,False,Variance Threshold
6,Nonflavanoid.phenols,False,Variance Threshold
7,Proanth,False,Variance Threshold


In [46]:
m.feature_results

Unnamed: 0,Features,Method,Ranking,Selected,Selected_coef,Threshold,shap_importance
1,Ash,RFE,1.0,,,,
4,Phenols,RFE,1.0,,,,
6,Nonflavanoid.phenols,RFE,1.0,,,,
8,Color.int,RFE,1.0,,,,
11,Proline,RFE,1.0,,,,
9,Hue,RFE,2.0,,,,
0,Malic.acid,RFE,3.0,,,,
5,Flavanoids,RFE,4.0,,,,
3,Mg,RFE,5.0,,,,
2,Acl,RFE,6.0,,,,


In [47]:
#Get union of n_features from all feature selection methods
Union_Final_feature = list(set(m.union_feature)) 
Union_Final_feature

['Ash',
 'Acl',
 'Phenols',
 'Proline',
 'Malic.acid',
 'Nonflavanoid.phenols',
 'Flavanoids',
 'Proanth',
 'Color.int',
 'Mg']

In [48]:
#Get intersection of n_features from all feature selection methods
Intersection_Final_feature = m.inter_feature
Intersection_Final_feature

['Color.int']

In [None]:
# Prepare X and Y 
X = data.drop(['Alcohol'], inplace=False, axis=1)
y = data['Alcohol']