In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json
import pickle

In [32]:
df = pd.read_csv('data/test.csv')
df.shape

(470, 28)

In [33]:
df.columns

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

### Feature Engineering

In [34]:
def extract_feature(df_input):
    df = df_input.copy()
    df['OwnStocks'] = df.StockOptionLevel.apply(lambda x: 'No' if x == 0 else 'Yes')
    df['PropWorkLife'] = df.TotalWorkingYears / df.Age
    df['PropExpComp'] = df.NumCompaniesWorked / (df.TotalWorkingYears+1)
    df['PropRoleComp'] = df.YearsInCurrentRole / (df.YearsAtCompany + 1)
    df['AgeBar'] = pd.cut(df.Age, bins = [0,27,45,np.inf], labels=['Young','Mid','Old']).astype('object')
    q_pays = {
        'Laboratory Technician': 2705.0, 
        'Manufacturing Director': 5824.4000000000015, 
        'Sales Executive': 5675.8, 
        'Research Scientist': 2693.4, 
        'Sales Representative': 2325.8, 
        'Healthcare Representative': 6348.6, 
        'Research Director': 15014.600000000002, 
        'Human Resources': 2741.0, 
        'Manager': 16894.0
    }
    df['AboveQPay'] = df.apply(lambda x: 'No' if x.MonthlyIncome < q_pays.get(x.JobRole) else 'Yes', axis = 1)
    df['WorkFactors'] = (df.EnvironmentSatisfaction+df.JobSatisfaction+df.JobInvolvement) / 15
    df['CommunicationSkill'] = 1 / df['CommunicationSkill']
    #df['PropCompYrsHike'] = df.YearsAtCompany/(df.PercentSalaryHike+1)
    cols_to_drop = [
                    'Id',
                    'Behaviour',
                    'PerformanceRating',
                    'Gender',
                    'Education', 
                    'Department',
                    'EmployeeNumber',
                    'PercentSalaryHike',
                    'YearsInCurrentRole',
                    'YearsSinceLastPromotion',
                    'YearsWithCurrManager',
                    'TrainingTimesLastYear',
                    'EducationField',
                    'StockOptionLevel',
                    'TotalWorkingYears',
                    'YearsAtCompany',
                    'NumCompaniesWorked',
                    'JobSatisfaction',
                    'EnvironmentSatisfaction',
                    'JobInvolvement',
                    'Age',
                    'MonthlyIncome',
                    'DistanceFromHome'
                  ]
    df.drop(cols_to_drop, inplace = True, axis = 1)
    print('Columns Dropped : {}'.format(cols_to_drop))
    print('Columns in DataFrame: {}'.format(df.columns.to_list()))
    return df

In [35]:
df_cleaned = extract_feature(df)

Columns Dropped : ['Id', 'Behaviour', 'PerformanceRating', 'Gender', 'Education', 'Department', 'EmployeeNumber', 'PercentSalaryHike', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'TrainingTimesLastYear', 'EducationField', 'StockOptionLevel', 'TotalWorkingYears', 'YearsAtCompany', 'NumCompaniesWorked', 'JobSatisfaction', 'EnvironmentSatisfaction', 'JobInvolvement', 'Age', 'MonthlyIncome', 'DistanceFromHome']
Columns in DataFrame: ['BusinessTravel', 'JobRole', 'MaritalStatus', 'OverTime', 'CommunicationSkill', 'OwnStocks', 'PropWorkLife', 'PropExpComp', 'PropRoleComp', 'AgeBar', 'AboveQPay', 'WorkFactors']


In [36]:
df_cleaned.dtypes

BusinessTravel         object
JobRole                object
MaritalStatus          object
OverTime               object
CommunicationSkill    float64
OwnStocks              object
PropWorkLife          float64
PropExpComp           float64
PropRoleComp          float64
AgeBar                 object
AboveQPay              object
WorkFactors           float64
dtype: object

### Preprocessing and Model Building Init

In [37]:
df_cleaned.shape

(470, 12)

In [38]:
df_X = df_cleaned.copy()

In [39]:
nominal_type = list(df_X.select_dtypes(include='object').columns)
nominal_type

['BusinessTravel',
 'JobRole',
 'MaritalStatus',
 'OverTime',
 'OwnStocks',
 'AgeBar',
 'AboveQPay']

In [40]:
nominal_type_vals = dict()
for ot in nominal_type:
    nominal_type_vals[ot] = list(df_X[ot].unique())
print(nominal_type_vals)

{'BusinessTravel': ['Travel_Rarely', 'Non-Travel', 'Travel_Frequently'], 'JobRole': ['Research Scientist', 'Sales Executive', 'Manufacturing Director', 'Laboratory Technician', 'Healthcare Representative', 'Human Resources', 'Manager', 'Research Director', 'Sales Representative'], 'MaritalStatus': ['Married', 'Divorced', 'Single'], 'OverTime': ['No', 'Yes'], 'OwnStocks': ['Yes', 'No'], 'AgeBar': ['Mid', 'Old', 'Young'], 'AboveQPay': ['No', 'Yes']}


In [41]:
numerical_type = list(df_X.select_dtypes(exclude='object').columns)
numerical_type

['CommunicationSkill',
 'PropWorkLife',
 'PropExpComp',
 'PropRoleComp',
 'WorkFactors']

In [42]:
ordinal_type = list()
ordinal_columns_dataset = ['EnvironmentSatisfaction',
                           'JobInvolvement',
                           'JobSatisfaction',
                           'Education',
                           'Behaviour',
                           'CommunicationSkill',
                           'PerformanceRating',
                           'StockOptionLevel',
                          ]
for col in ordinal_columns_dataset:
    if col in numerical_type:
        numerical_type.remove(col)
        ordinal_type.append(col)
        
ordinal_type

['CommunicationSkill']

In [43]:
final_cols = list(df_X.columns)
final_cols

['BusinessTravel',
 'JobRole',
 'MaritalStatus',
 'OverTime',
 'CommunicationSkill',
 'OwnStocks',
 'PropWorkLife',
 'PropExpComp',
 'PropRoleComp',
 'AgeBar',
 'AboveQPay',
 'WorkFactors']

In [44]:
numerical_index = list()
nominal_index = list()
ordinal_index = list()

for col in numerical_type:
    numerical_index.append(final_cols.index(col))
for col in nominal_type:
    nominal_index.append(final_cols.index(col))
for col in ordinal_type:
    ordinal_index.append(final_cols.index(col))

In [45]:
print('Numerical Columns')
for i,col in zip(numerical_index, numerical_type):
    print(i, col)
print('='*50)
print('Nominal Columns')
for i,col in zip(nominal_index, nominal_type):
    print(i, col)
print('='*50)
print('Ordinal Columns')
for i,col in zip(ordinal_index, ordinal_type):
    print(i, col)

Numerical Columns
6 PropWorkLife
7 PropExpComp
8 PropRoleComp
11 WorkFactors
Nominal Columns
0 BusinessTravel
1 JobRole
2 MaritalStatus
3 OverTime
5 OwnStocks
9 AgeBar
10 AboveQPay
Ordinal Columns
4 CommunicationSkill


In [46]:
df_X.describe()

Unnamed: 0,CommunicationSkill,PropWorkLife,PropExpComp,PropRoleComp,WorkFactors
count,470.0,470.0,470.0,470.0,470.0
mean,0.45,0.290479,0.271368,0.482867,0.542553
std,0.292041,0.152715,0.283311,0.270832,0.114025
min,0.2,0.0,0.0,0.0,0.266667
25%,0.25,0.181818,0.090909,0.333333,0.466667
50%,0.333333,0.27682,0.166667,0.5,0.533333
75%,0.5,0.386913,0.4,0.666667,0.6
max,1.0,0.689655,2.0,0.882353,0.8


In [47]:
from sklearn.preprocessing import OneHotEncoder
encoder_dic = {}
for col in nominal_index:
    enc = OneHotEncoder(categories='auto',drop='first',sparse = False).fit(df_X.iloc[:,[col]])
    encoder_dic[col] = enc

In [48]:
X_test = df_X.copy()

In [49]:
X_test.shape

(470, 12)

In [50]:
for i, enc in encoder_dic.items():
    temp = pd.DataFrame(enc.transform(X_test.iloc[:,[i]]), columns=[ list(df_X.columns)[i]+'_'+colname for colname in list(enc.categories_[0])[1:]]).reset_index()
    X_test = pd.concat( [X_test, temp], axis = 1 )

In [51]:
X_test.drop(nominal_type, inplace=True, axis=1)

In [52]:
final_cols_encoded = X_test.columns.values
final_cols_encoded

array(['CommunicationSkill', 'PropWorkLife', 'PropExpComp',
       'PropRoleComp', 'WorkFactors', 'index',
       'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely',
       'index', 'JobRole_Human Resources',
       'JobRole_Laboratory Technician', 'JobRole_Manager',
       'JobRole_Manufacturing Director', 'JobRole_Research Director',
       'JobRole_Research Scientist', 'JobRole_Sales Executive',
       'JobRole_Sales Representative', 'index', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'index', 'OverTime_Yes', 'index',
       'OwnStocks_Yes', 'index', 'AgeBar_Old', 'AgeBar_Young', 'index',
       'AboveQPay_Yes'], dtype=object)

In [53]:
X_test = X_test.values

In [54]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [55]:
def test_model_perf(model, X):
    predictions_df = pd.DataFrame(model.predict_proba(X),columns=['No_Attrition','Attrition'], index=range(1,471))
    predictions_df.No_Attrition = predictions_df.No_Attrition.apply(lambda x: np.around(x,7))
    predictions_df.Attrition = predictions_df.Attrition.apply(lambda x: np.around(x,7))
    predictions_df.index.name = 'Id'
    final_df = predictions_df[['Attrition']]
    return final_df

In [56]:
!mkdir submissions

mkdir: submissions: File exists


In [27]:
pkl_filename = "models/RandomForest.pkl"
sub_filename = "submissions/RandomForest.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, X_test)
predictions_df.to_csv(sub_filename)

In [57]:
pkl_filename = "models/Final_RandomForestA.pkl"
sub_filename = "submissions/Final_RandomForestA.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, X_test)
predictions_df.to_csv(sub_filename)