# Important Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

# Loading Data set

In [2]:
data = pd.read_csv("general_data.csv")

In [3]:
data.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4


In [4]:
data.shape

(4410, 24)

In [5]:
data.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [6]:
data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeID                   int64
Gender                      object
JobLevel                     int64
JobRole                     object
MaritalStatus               object
MonthlyIncome                int64
NumCompaniesWorked         float64
Over18                      object
PercentSalaryHike            int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears          float64
TrainingTimesLastYear        int64
YearsAtCompany               int64
YearsSinceLastPromotion      int64
YearsWithCurrManager         int64
dtype: object

# Handling missing value

In [7]:
data['NumCompaniesWorked'] = data['NumCompaniesWorked'].fillna(method='bfill')

In [8]:
data['TotalWorkingYears'].mean()

11.279936378095888

In [9]:
data['TotalWorkingYears'] = np.where(data['TotalWorkingYears'],11.30,data['TotalWorkingYears'])

# Converting cotegorical variable into numerical variable

In [10]:
le = LabelEncoder()

In [11]:
data['Attrition'] = le.fit_transform(data['Attrition'])
data['BusinessTravel'] = le.fit_transform(data['BusinessTravel'])
data['Department'] = le.fit_transform(data['Department'])
data['EducationField '] = le.fit_transform(data['EducationField'])
data['Gender'] = le.fit_transform(data['Gender'])
data['JobRole'] = le.fit_transform(data['JobRole'])
data['MaritalStatus'] = le.fit_transform(data['MaritalStatus'])
data['Over18'] = le.fit_transform(data['Over18'])


In [14]:
data = data.drop('EducationField',axis=1)

In [15]:
data.head(2)

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EmployeeCount,EmployeeID,Gender,JobLevel,...,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EducationField
0,51,0,2,2,6,2,1,1,0,1,...,0,11,8,0,11.3,6,1,0,0,1
1,31,1,1,1,10,1,1,2,0,1,...,0,23,8,1,11.3,3,5,1,4,1


# Spliting data into train and test

In [16]:
X = data.drop('Attrition',axis=1)
y = data.Attrition

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)

In [20]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(3307, 23) (1103, 23) (3307,) (1103,)


In [21]:
X_train.columns

Index(['Age', 'BusinessTravel', 'Department', 'DistanceFromHome', 'Education',
       'EmployeeCount', 'EmployeeID', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked', 'Over18',
       'PercentSalaryHike', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsAtCompany',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'EducationField '],
      dtype='object')

In [23]:
X_train = X_train.drop('EmployeeID',axis=1)

# Random forest classifier

In [25]:
X_train_features = X_train[['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
                            'Education','EmployeeCount',  'Gender', 
                            'JobLevel', 'JobRole','MaritalStatus', 'MonthlyIncome',
                            'NumCompaniesWorked', 'Over18','PercentSalaryHike', 
                            'StandardHours', 'StockOptionLevel','TotalWorkingYears',
                            'TrainingTimesLastYear', 'YearsAtCompany','YearsSinceLastPromotion', 
                            'YearsWithCurrManager', 'EducationField ']]

In [26]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=2,oob_score=True)

In [27]:
rf_model.fit(X=X_train_features,y=y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=2,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [28]:
print("oob_score",rf_model.oob_score_)

oob_score 0.9903235560931358


In [29]:
for feature , imp in zip(X_train_features,rf_model.feature_importances_):
    print(feature,imp)

Age 0.1113482191217162
BusinessTravel 0.02858262398286124
Department 0.028087259285544435
DistanceFromHome 0.07366006645001301
Education 0.043603560180185866
EmployeeCount 0.0
Gender 0.01839402529990253
JobLevel 0.04179547003347848
JobRole 0.059064820083531346
MaritalStatus 0.04332504176931338
MonthlyIncome 0.09630943711577133
NumCompaniesWorked 0.06204872626587287
Over18 0.0
PercentSalaryHike 0.07023153839211402
StandardHours 0.0
StockOptionLevel 0.03618320681257866
TotalWorkingYears 0.002613473426009229
TrainingTimesLastYear 0.0497371541897752
YearsAtCompany 0.08334366747467452
YearsSinceLastPromotion 0.04749395919443046
YearsWithCurrManager 0.06013322377623048
EducationField  0.04404452714599665


# Decision Tree

In [31]:
train_features = X_train[['Age', 'DistanceFromHome',
                            'Education', 'JobLevel', 'JobRole','MaritalStatus', 'MonthlyIncome',
                            'NumCompaniesWorked','PercentSalaryHike', 'StockOptionLevel',
                            'TrainingTimesLastYear', 'YearsAtCompany','YearsSinceLastPromotion', 
                            'YearsWithCurrManager', 'EducationField ']]

In [32]:
tree_model = tree.DecisionTreeClassifier()

In [33]:
tree_model.fit(X = train_features,y=y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [36]:
print("Accuracy : ",tree_model.score(X = train_features,y=y_train))

Accuracy :  1.0


In [38]:
with open("Decisiontree.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=['Age', 'DistanceFromHome',
                            'Education', 'JobLevel', 'JobRole','MaritalStatus', 'MonthlyIncome',
                            'NumCompaniesWorked','PercentSalaryHike', 'StockOptionLevel',
                            'TrainingTimesLastYear', 'YearsAtCompany','YearsSinceLastPromotion', 
                            'YearsWithCurrManager', 'EducationField '],out_file=f)

# Prediction

In [41]:
X_test_features = X_test[['Age', 'DistanceFromHome',
                            'Education', 'JobLevel', 'JobRole','MaritalStatus', 'MonthlyIncome',
                            'NumCompaniesWorked','PercentSalaryHike', 'StockOptionLevel',
                            'TrainingTimesLastYear', 'YearsAtCompany','YearsSinceLastPromotion', 
                            'YearsWithCurrManager', 'EducationField ']]

In [42]:
test_pred = tree_model.predict(X=X_test_features)

In [43]:
test_output = pd.DataFrame({"EmployeeID":X_test['EmployeeID'],"Attrition":test_pred})

In [44]:
test_output.to_csv("Output.csv",index=False)