In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json
import pickle

In [2]:
# Load the test set that the pickled models score further down.
df = pd.read_csv('data/test.csv')
df.shape

(470, 28)

In [3]:
# Inspect the raw column names before engineering features.
df.columns

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

### Feature Engineering

In [4]:
def extract_feature(df_input):
    """Return a copy of ``df_input`` with engineered features and unused columns removed.

    Three columns are derived before anything is dropped:

    * ``OwnStocks``    -- 0 when ``StockOptionLevel`` equals 0, otherwise 1.
    * ``PropWorkLife`` -- ``TotalWorkingYears`` divided by ``Age``.
    * ``PropExpComp``  -- ``TotalWorkingYears`` divided by ``NumCompaniesWorked + 1``.

    The list of dropped columns and the list of remaining columns are printed.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding every column read or dropped in the body; it is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the surviving input columns followed by the three
        engineered columns.
    """
    dropped = [
        'Id',
        'Behaviour',
        'PerformanceRating',
        'Gender',
        'Education',
        'Department',
        'EmployeeNumber',
        'PercentSalaryHike',
        'YearsInCurrentRole',
        'YearsSinceLastPromotion',
        'YearsWithCurrManager',
        'JobInvolvement',
        'EducationField',
        'YearsAtCompany',
        'Age',
        'StockOptionLevel',
        'TotalWorkingYears',
        'NumCompaniesWorked',
        'MonthlyIncome',
    ]

    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    engineered = df_input.assign(
        OwnStocks=df_input.StockOptionLevel.apply(
            lambda level: 0 if level == 0 else 1
        ),
        PropWorkLife=df_input.TotalWorkingYears / df_input.Age,
        # +1 keeps the denominator non-zero when NumCompaniesWorked is 0.
        PropExpComp=df_input.TotalWorkingYears / (df_input.NumCompaniesWorked + 1),
    )
    remaining = engineered.drop(columns=dropped)

    print('Columns Dropped : {}'.format(dropped))
    print('Columns in DataFrame: {}'.format(remaining.columns.to_list()))
    return remaining

In [5]:
# Build the engineered feature frame; `df` is left untouched because
# extract_feature works on a copy of its input.
df_cleaned = extract_feature(df)

Columns Dropped : ['Id', 'Behaviour', 'PerformanceRating', 'Gender', 'Education', 'Department', 'EmployeeNumber', 'PercentSalaryHike', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'JobInvolvement', 'EducationField', 'YearsAtCompany', 'Age', 'StockOptionLevel', 'TotalWorkingYears', 'NumCompaniesWorked', 'MonthlyIncome']
Columns in DataFrame: ['BusinessTravel', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobRole', 'JobSatisfaction', 'MaritalStatus', 'OverTime', 'TrainingTimesLastYear', 'CommunicationSkill', 'OwnStocks', 'PropWorkLife', 'PropExpComp']


### Preprocessing and Prediction Setup

In [6]:
# Keep every row: each test record needs its own prediction. 'Id' is dropped by
# extract_feature, so de-duplicating on the remaining feature columns would
# remove records whose Ids still have to appear in the submission files.
df_cleaned.shape

(470, 12)

In [7]:
# Work on a separate copy so the encoding steps below do not touch df_cleaned.
df_X = df_cleaned.copy()

In [8]:
# Split the column labels by dtype: object columns are treated as categorical,
# everything else as numeric.
object_type = df_X.select_dtypes(include='object').columns
numerical_type = df_X.select_dtypes(exclude='object').columns

# Positional indices of the categorical columns within df_X.
column_order = df_X.columns.to_list()
cat_cols = [column_order.index(label) for label in object_type]
cat_cols

[0, 3, 5, 6]

In [9]:
# Preview the first rows of the feature frame before encoding.
df_X.head()

Unnamed: 0,BusinessTravel,DistanceFromHome,EnvironmentSatisfaction,JobRole,JobSatisfaction,MaritalStatus,OverTime,TrainingTimesLastYear,CommunicationSkill,OwnStocks,PropWorkLife,PropExpComp
0,Travel_Rarely,9,4,Research Scientist,4,Married,No,3,5,1,0.178571,2.5
1,Travel_Rarely,6,1,Sales Executive,4,Divorced,No,4,3,1,0.419355,2.6
2,Travel_Rarely,6,3,Research Scientist,1,Divorced,Yes,2,4,1,0.351351,2.6
3,Travel_Rarely,1,4,Manufacturing Director,4,Married,Yes,4,5,1,0.190476,0.8
4,Non-Travel,4,3,Laboratory Technician,2,Married,No,5,2,0,0.2,4.5


In [10]:
# Show which columns are object-typed (to be one-hot encoded) and which are numeric.
df_X.dtypes

BusinessTravel              object
DistanceFromHome             int64
EnvironmentSatisfaction      int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
OverTime                    object
TrainingTimesLastYear        int64
CommunicationSkill           int64
OwnStocks                    int64
PropWorkLife               float64
PropExpComp                float64
dtype: object

In [11]:
# One-hot encode the object-typed columns, dropping the first level of each.
# NOTE(review): the dummy columns are derived from the categories present in
# this file only; presumably they must match the columns the pickled models
# were trained on -- confirm against the training notebook.
df_X_cat = pd.get_dummies(df_X[object_type], drop_first=True)

In [12]:
# Numeric columns are passed through unchanged.
df_X_num = df_X[numerical_type]

In [13]:
# Final design matrix layout: numeric columns first, then the dummy columns.
df_X_final = pd.concat([df_X_num, df_X_cat], axis='columns')

In [14]:
# Convert to a plain NumPy array for the models. np.asarray is a no-op on an
# array, so re-running this cell is safe (an ndarray has no `.values`).
df_X_final = np.asarray(df_X_final)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [16]:
def test_model_perf(model, X):
    predictions_df = pd.DataFrame(model.predict_proba(X),columns=['No_Attrition','Attrition'], index=range(1,471))
    predictions_df.No_Attrition = predictions_df.No_Attrition.apply(lambda x: np.around(x,7))
    predictions_df.Attrition = predictions_df.Attrition.apply(lambda x: np.around(x,7))
    predictions_df.index.name = 'Id'
    final_df = predictions_df[['Attrition']]
    return final_df

In [17]:
!mkdir submissions

mkdir: submissions: File exists


In [18]:
# Score the test features with the saved AdaBoost model and write its submission.
pkl_filename = "models/Adaboost.pkl"
sub_filename = "submissions/Adaboost.csv"
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files produced by a trusted training run.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

In [19]:
# Score the test features with the saved decision-tree model and write its submission.
pkl_filename = "models/DecisionTree.pkl"
sub_filename = "submissions/DecisionTree.csv"
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files produced by a trusted training run.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

In [20]:
# Score the test features with the saved random-forest model and write its submission.
pkl_filename = "models/RandomForest.pkl"
sub_filename = "submissions/RandomForest.csv"
# NOTE: pickle.load can execute arbitrary code from the file; only load model
# files produced by a trusted training run.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)