In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Load the training table from the CSV under the notebook-relative data/ directory.
df = pd.read_csv(filepath_or_buffer='data/train.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,Id,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeNumber,EnvironmentSatisfaction,...,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill,Behaviour
0,1,30,0,Non-Travel,Research & Development,2,3,Medical,571,3,...,3,0,12,2,11,7,6,7,4,1
1,2,36,0,Travel_Rarely,Research & Development,12,4,Life Sciences,1614,3,...,3,2,7,2,3,2,1,1,2,1
2,3,55,1,Travel_Rarely,Sales,2,1,Medical,842,3,...,3,0,12,3,9,7,7,3,5,1
3,4,39,0,Travel_Rarely,Research & Development,24,1,Life Sciences,2014,1,...,3,0,18,2,7,7,1,7,4,1
4,5,37,0,Travel_Rarely,Research & Development,3,3,Other,689,3,...,3,1,10,2,10,7,7,8,1,1


In [4]:
# (rows, columns) of the raw frame, before any columns are dropped.
df.shape

(1628, 29)

In [5]:
# Columns removed before modelling.
# NOTE(review): the reason for each exclusion is not recorded here — presumably
# 'Id' is a row identifier and the others were judged uninformative; confirm.
# NOTE(review): 'EmployeeNumber' also looks like an identifier but is kept as a
# feature — confirm that is intended.
cols_to_drop = [
    'Department',
    'PerformanceRating',
    'Behaviour',
    'Education',
    'Id',
]

In [6]:
# Remove the excluded columns and preview the result.
# Like the original in-place drop, this raises KeyError if the cell is re-run
# after the columns are already gone.
df = df.drop(columns=cols_to_drop)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DistanceFromHome,EducationField,EmployeeNumber,EnvironmentSatisfaction,Gender,JobInvolvement,JobRole,...,OverTime,PercentSalaryHike,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,CommunicationSkill
0,30,0,Non-Travel,2,Medical,571,3,Female,3,Laboratory Technician,...,No,14,0,12,2,11,7,6,7,4
1,36,0,Travel_Rarely,12,Life Sciences,1614,3,Female,3,Manufacturing Director,...,Yes,12,2,7,2,3,2,1,1,2
2,55,1,Travel_Rarely,2,Medical,842,3,Male,3,Sales Executive,...,No,16,0,12,3,9,7,7,3,5
3,39,0,Travel_Rarely,24,Life Sciences,2014,1,Male,3,Research Scientist,...,No,13,0,18,2,7,7,1,7,4
4,37,0,Travel_Rarely,3,Other,689,3,Male,3,Manufacturing Director,...,No,15,1,10,2,10,7,7,8,1


In [7]:
# Column labels remaining after the drop above.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DistanceFromHome',
       'EducationField', 'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender',
       'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager', 'CommunicationSkill'],
      dtype='object')

In [8]:
# Names of the object-dtype (string) columns; these are the ones one-hot
# encoded later in the notebook.
obj_cols = list(df.select_dtypes(include='object').columns)
obj_cols

['BusinessTravel',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'OverTime']

In [9]:
# Dummy-column prefixes: each categorical column name with a trailing underscore.
# get_dummies later inserts its own separator after the prefix, which is why the
# encoded columns carry a double underscore (e.g. 'OverTime__Yes').
obj_cols_prefix = [f'{name}_' for name in obj_cols]
obj_cols_prefix

['BusinessTravel_',
 'EducationField_',
 'Gender_',
 'JobRole_',
 'MaritalStatus_',
 'OverTime_']

In [10]:
# Split off the target: pop() removes 'Attrition' from the feature frame in
# place and hands back the column, which is kept as a flat 1-D array of labels.
y = df.pop('Attrition').values

In [11]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each column so the dummies are not linearly redundant.
# dtype is pinned so the dummy columns stay 0/1 integers on every pandas
# version (pandas 2.x would otherwise default to bool, which turns the mixed
# int/bool frame into an object array when converted with .values).
# Each prefix already ends in '_' and get_dummies adds its own '_' separator,
# hence names such as 'OverTime__Yes'.
df = pd.get_dummies(
    df,
    columns=obj_cols,
    prefix=obj_cols_prefix,
    drop_first=True,
    dtype=np.uint8,
)
df

Unnamed: 0,Age,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobSatisfaction,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StockOptionLevel,...,JobRole__Laboratory Technician,JobRole__Manager,JobRole__Manufacturing Director,JobRole__Research Director,JobRole__Research Scientist,JobRole__Sales Executive,JobRole__Sales Representative,MaritalStatus__Married,MaritalStatus__Single,OverTime__Yes
0,30,2,571,3,3,4,2564,0,14,0,...,1,0,0,0,0,0,0,0,1,0
1,36,12,1614,3,3,3,4663,9,12,2,...,0,0,1,0,0,0,0,1,0,1
2,55,2,842,3,3,4,5160,4,16,0,...,0,0,0,0,0,1,0,0,1,0
3,39,24,2014,1,3,4,4108,7,13,0,...,0,0,0,0,1,0,0,0,1,0
4,37,3,689,3,3,3,9434,1,15,1,...,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1623,42,19,752,3,4,3,2759,6,12,0,...,0,0,0,0,1,0,0,0,0,1
1624,55,2,842,3,3,4,5160,4,16,0,...,0,0,0,0,0,1,0,0,1,0
1625,25,9,1439,1,2,1,4400,3,12,0,...,0,0,0,0,0,0,1,1,0,0
1626,29,13,1844,1,2,1,2335,4,15,3,...,0,0,0,0,0,0,0,0,0,1


In [12]:
# Feature matrix as a plain NumPy array; column order follows df.columns.
X = df.to_numpy()

In [13]:
# Hold out 30% of the rows for evaluation. random_state fixes the shuffle so the
# split, and everything fitted on it, is reproducible under Restart & Run All.
# NOTE(review): some rows in the displayed frame look identical (e.g. index 2
# and 1624, at least in the visible columns); if these are true duplicates they
# can land on both sides of the split — consider de-duplicating first.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [16]:
# Rank features by random-forest importance and keep the informative ones.
# random_state pins the forest so the selected feature set is reproducible.
# max_features=20 is only an upper bound: the selector's importance threshold
# still applies, which is why the recorded run kept 17 columns rather than 20.
sel = SelectFromModel(
    RandomForestClassifier(n_estimators=50, random_state=42),
    max_features=20,
)
sel.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=50, n_jobs=None,
                                                 oob_score=False,


In [17]:
# Map the selector's boolean support mask back to the encoded column names.
selected_mask = sel.get_support()
df.columns[selected_mask]

Index(['Age', 'DistanceFromHome', 'EmployeeNumber', 'EnvironmentSatisfaction',
       'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'OverTime__Yes'],
      dtype='object')