In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.head()

Unnamed: 0,Id,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour
0,1,30,0,Non-Travel,Research & Development,2,3,Medical,571,3,...,3,0,12,2,11,7,6,7,4,1
1,2,36,0,Travel_Rarely,Research & Development,12,4,Life Sciences,1614,3,...,3,2,7,2,3,2,1,1,2,1
2,3,55,1,Travel_Rarely,Sales,2,1,Medical,842,3,...,3,0,12,3,9,7,7,3,5,1
3,4,39,0,Travel_Rarely,Research & Development,24,1,Life Sciences,2014,1,...,3,0,18,2,7,7,1,7,4,1
4,5,37,0,Travel_Rarely,Research & Development,3,3,Other,689,3,...,3,1,10,2,10,7,7,8,1,1


In [4]:
df.shape

(1628, 29)

In [5]:
cols_to_drop = ['Department', 'PerformanceRating','Behaviour', 'Education', 'Id']

In [6]:
df_reduced = df.drop(cols_to_drop, axis=1)
df_reduced.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DistanceFromHome,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,...,OverTime,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill
0,30,0,Non-Travel,2,Medical,571,3,Female,3,Laboratory Technician,...,No,14,0,12,2,11,7,6,7,4
1,36,0,Travel_Rarely,12,Life Sciences,1614,3,Female,3,Manufacturing Director,...,Yes,12,2,7,2,3,2,1,1,2
2,55,1,Travel_Rarely,2,Medical,842,3,Male,3,Sales Executive,...,No,16,0,12,3,9,7,7,3,5
3,39,0,Travel_Rarely,24,Life Sciences,2014,1,Male,3,Research Scientist,...,No,13,0,18,2,7,7,1,7,4
4,37,0,Travel_Rarely,3,Other,689,3,Male,3,Manufacturing Director,...,No,15,1,10,2,10,7,7,8,1


In [7]:
df_reduced.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DistanceFromHome',
       'EducationField', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'CommunicationSkill'],
      dtype='object')

In [8]:
obj_cols = df_reduced.select_dtypes(include=['object']).columns.tolist()
obj_cols

['BusinessTravel',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [9]:
y = df_reduced.loc[:,['Attrition']].values.ravel()
df_train = df_reduced.drop(['Attrition'],axis=1)

In [10]:
df_onehot = pd.get_dummies(df_train, drop_first=True, columns=obj_cols)
df_onehot

Unnamed: 0,Age,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Married,MaritalStatus_Single,OverTime_Yes
0,30,2,571,3,3,4,2564,0,14,0,...,1,0,0,0,0,0,0,0,1,0
1,36,12,1614,3,3,3,4663,9,12,2,...,0,0,1,0,0,0,0,1,0,1
2,55,2,842,3,3,4,5160,4,16,0,...,0,0,0,0,0,1,0,0,1,0
3,39,24,2014,1,3,4,4108,7,13,0,...,0,0,0,0,1,0,0,0,1,0
4,37,3,689,3,3,3,9434,1,15,1,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1623,42,19,752,3,4,3,2759,6,12,0,...,0,0,0,0,1,0,0,0,0,1
1624,55,2,842,3,3,4,5160,4,16,0,...,0,0,0,0,0,1,0,0,1,0
1625,25,9,1439,1,2,1,4400,3,12,0,...,0,0,0,0,0,0,1,1,0,0
1626,29,13,1844,1,2,1,2335,4,15,3,...,0,0,0,0,0,0,0,0,0,1


In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
df_train_onehot_scaled = sc.fit_transform(df_onehot)

In [12]:
X = df_train_onehot_scaled

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3)

In [14]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel

### Select From Model Function

In [15]:
def get_support_from_model(model_class, max_features, X_train, y_train):
    sel = SelectFromModel(model_class, max_features=max_features)
    sel.fit(X_train, y_train)
    return sel.get_support()

In [16]:
def get_support_from_rfe(model_class, max_features, X_train, y_train):
    rfe = RFE(model_class, max_features)
    rfe.fit(X_train, y_train)
    return rfe.get_support()

In [17]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.svm import SVC

**Select From Model**

In [29]:
set_m_A = set(df_onehot.columns[get_support_from_model(RandomForestClassifier(n_estimators=100), 20, X_train, y_train)])
set_m_A

{'Age',
 'CommunicationSkill',
 'DistanceFromHome',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'JobSatisfaction',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [30]:
set_m_B = set(df_onehot.columns[get_support_from_model(AdaBoostClassifier(n_estimators=100), 20, X_train, y_train)])
set_m_B

{'Age',
 'DistanceFromHome',
 'EmployeeNumber',
 'JobInvolvement',
 'JobSatisfaction',
 'MonthlyIncome',
 'OverTime_Yes',
 'PercentSalaryHike',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [31]:
set_m_A.intersection(set_m_B)

{'Age',
 'DistanceFromHome',
 'EmployeeNumber',
 'JobSatisfaction',
 'MonthlyIncome',
 'OverTime_Yes',
 'PercentSalaryHike',
 'TotalWorkingYears',
 'YearsAtCompany',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [32]:
model_set = set_m_A.intersection(set_m_B)

**Select From RFE**

In [33]:
set_r_A = set(df_onehot.columns[get_support_from_rfe(RandomForestClassifier(n_estimators=100), 20, X_train, y_train)])
set_r_A

{'Age',
 'BusinessTravel_Travel_Frequently',
 'CommunicationSkill',
 'DistanceFromHome',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobSatisfaction',
 'MaritalStatus_Single',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [34]:
set_r_B = set(df_onehot.columns[get_support_from_rfe(AdaBoostClassifier(n_estimators=100), 20, X_train, y_train)])
set_r_B

{'Age',
 'BusinessTravel_Travel_Frequently',
 'CommunicationSkill',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [35]:
set_r_C = set(df_onehot.columns[get_support_from_rfe(LinearRegression(), 20, X_train, y_train)])
set_r_C

{'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'CommunicationSkill',
 'DistanceFromHome',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobRole_Laboratory Technician',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'StockOptionLevel',
 'TotalWorkingYears',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion'}

In [36]:
set_r_D = set(df_onehot.columns[get_support_from_rfe(SGDClassifier(), 20, X_train, y_train)])
set_r_D

{'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'CommunicationSkill',
 'EducationField_Life Sciences',
 'EducationField_Other',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'MonthlyIncome',
 'OverTime_Yes',
 'YearsAtCompany',
 'YearsSinceLastPromotion'}

In [37]:
set_r_A.intersection(set_r_B).intersection(set_r_C).intersection(set_r_D)

{'BusinessTravel_Travel_Frequently',
 'CommunicationSkill',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobSatisfaction',
 'OverTime_Yes',
 'YearsSinceLastPromotion'}

In [38]:
rfe_set = set_r_A.intersection(set_r_B).intersection(set_r_C).intersection(set_r_D)

In [39]:
model_set.intersection(rfe_set)

{'JobSatisfaction', 'OverTime_Yes', 'YearsSinceLastPromotion'}

In [40]:
master_set = set_m_A.union(set_m_B).union(set_r_A).union(set_r_B).union(set_r_C).union(set_r_D)

In [41]:
master_set

{'Age',
 'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'CommunicationSkill',
 'DistanceFromHome',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'JobInvolvement',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}

In [42]:
set(df_onehot.columns).difference()

{'Age',
 'BusinessTravel_Travel_Frequently',
 'BusinessTravel_Travel_Rarely',
 'CommunicationSkill',
 'DistanceFromHome',
 'EducationField_Life Sciences',
 'EducationField_Marketing',
 'EducationField_Medical',
 'EducationField_Other',
 'EducationField_Technical Degree',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'Gender_Male',
 'JobInvolvement',
 'JobRole_Human Resources',
 'JobRole_Laboratory Technician',
 'JobRole_Manager',
 'JobRole_Manufacturing Director',
 'JobRole_Research Director',
 'JobRole_Research Scientist',
 'JobRole_Sales Executive',
 'JobRole_Sales Representative',
 'JobSatisfaction',
 'MaritalStatus_Married',
 'MaritalStatus_Single',
 'MonthlyIncome',
 'NumCompaniesWorked',
 'OverTime_Yes',
 'PercentSalaryHike',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager'}