In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json
import pickle

In [98]:
# Read the test split from a path relative to the notebook's working directory;
# the trailing expression displays (rows, columns) as a quick sanity check.
df = pd.read_csv('data/test.csv')
df.shape

(470, 28)

In [99]:
# List the raw column names that extract_feature reads and drops further down.
df.columns

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

### Feature Engineering

In [117]:
def extract_feature(df_input):
    """Return a copy of ``df_input`` with engineered features added and raw columns removed.

    Added columns:
      * ``PropWkngYrsToCompanies`` - TotalWorkingYears / (NumCompaniesWorked + 1),
        bucketed into the ordinal levels 1-5 (int64).
      * ``PropCurrMgrCompYears``   - YearsWithCurrManager / (YearsAtCompany + 1).
      * ``PropCurrRoleCompYears``  - YearsInCurrentRole / (YearsAtCompany + 1).
      * ``AgeCut``                 - Age bucketed into the ordinal levels 1-4 (int64).
      * ``Stocks``                 - 0 when StockOptionLevel is 0, otherwise 1.

    The input frame is left untouched. Every column named in ``cols_to_drop`` must be
    present, otherwise ``DataFrame.drop`` raises ``KeyError``. The dropped and remaining
    column names are printed as a side effect.
    """
    frame = df_input.copy()

    # The "+ 1" in the denominators keeps the ratios finite when the count/tenure is 0.
    years_per_company = frame['TotalWorkingYears'] / (frame['NumCompaniesWorked'] + 1)
    frame['PropWkngYrsToCompanies'] = pd.cut(
        years_per_company,
        [-1, 2, 4, 6, 8, np.inf],
        labels=[1, 2, 3, 4, 5],
    ).astype('int64')

    tenure_plus_one = frame['YearsAtCompany'] + 1
    frame['PropCurrMgrCompYears'] = frame['YearsWithCurrManager'] / tenure_plus_one
    frame['PropCurrRoleCompYears'] = frame['YearsInCurrentRole'] / tenure_plus_one

    frame['AgeCut'] = pd.cut(
        frame['Age'],
        bins=[0, 25, 35, 50, np.inf],
        labels=[1, 2, 3, 4],
    ).astype('int64')

    # Binary flag: any non-zero stock option level counts as "has stocks".
    frame['Stocks'] = frame['StockOptionLevel'].ne(0).astype('int64')

    cols_to_drop = [
        'Id',
        'Behaviour',
        'Gender',
        'Education',
        'Department',
        'StockOptionLevel',
        'Age',
        'PerformanceRating',
        'TrainingTimesLastYear',
        'YearsInCurrentRole',
        'YearsSinceLastPromotion',
        'YearsWithCurrManager',
        'NumCompaniesWorked',
        'TotalWorkingYears',
        'PercentSalaryHike',
        'MaritalStatus',
        'EducationField',
        'YearsAtCompany',
    ]
    frame = frame.drop(columns=cols_to_drop)
    print('Columns Dropped : {}'.format(cols_to_drop))
    print('Columns in DataFrame: {}'.format(frame.columns.to_list()))
    return frame

In [101]:
# Build the engineered feature frame for the test rows (extract_feature works on a copy of df).
# NOTE(review): the recorded output of this cell lists 16 dropped columns and still shows
# EducationField / YearsAtCompany among the remaining ones, whereas the extract_feature
# definition in this notebook drops 18 columns including those two. The output appears to
# predate the current definition - re-run from a fresh kernel to confirm.
df_cleaned = extract_feature(df)

Columns Dropped : ['Id', 'Behaviour', 'Gender', 'Education', 'Department', 'StockOptionLevel', 'Age', 'PerformanceRating', 'TrainingTimesLastYear', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'NumCompaniesWorked', 'TotalWorkingYears', 'PercentSalaryHike', 'MaritalStatus']
Columns in DataFrame: ['BusinessTravel', 'DistanceFromHome', 'EducationField', 'EmployeeNumber', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MonthlyIncome', 'OverTime', 'YearsAtCompany', 'CommunicationSkill', 'PropWkngYrsToCompanies', 'PropCurrMgrCompYears', 'PropCurrRoleCompYears', 'AgeCut', 'Stocks']


### Preprocessing and Model Building Init

In [102]:
# The test set is intentionally NOT de-duplicated: every input row needs its own
# prediction, and removing rows here would break the positional 1..N Id numbering
# applied when the submission frame is built. Skipping the in-place drop also keeps
# this cell idempotent on re-run.
df_cleaned.shape

(470, 17)

In [103]:
# Work on an independent copy so the encoding steps below never touch df_cleaned.
df_X = df_cleaned.copy()

In [104]:
# Column labels split by dtype: object (string) columns vs. everything else.
object_type = df_X.select_dtypes(include='object').columns
numerical_type = df_X.select_dtypes(exclude='object').columns

# Positional indices of the object-typed columns, used later with .iloc.
cat_cols = [df_X.columns.to_list().index(name) for name in object_type]
cat_cols

[0, 2, 6, 9]

In [105]:
# Preview the first rows of the feature frame before encoding.
df_X.head()

Unnamed: 0,BusinessTravel,DistanceFromHome,EducationField,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobRole,JobSatisfaction,MonthlyIncome,OverTime,YearsAtCompany,CommunicationSkill,PropWkngYrsToCompanies,PropCurrMgrCompYears,PropCurrRoleCompYears,AgeCut,Stocks
0,Travel_Rarely,9,Medical,377,4,3,Research Scientist,4,2070,No,5,5,2,0.666667,0.333333,2,1
1,Travel_Rarely,6,Medical,653,1,4,Sales Executive,4,5460,No,7,3,2,0.875,0.875,2,1
2,Travel_Rarely,6,Medical,474,3,4,Research Scientist,1,5974,Yes,7,4,2,0.875,0.875,3,1
3,Travel_Rarely,1,Life Sciences,827,4,2,Manufacturing Director,4,6062,Yes,4,5,1,0.4,0.6,3,1
4,Non-Travel,4,Life Sciences,972,3,3,Laboratory Technician,2,4447,No,9,2,3,0.8,0.7,3,0


In [106]:
# Show per-column dtypes; object columns are the ones routed through one-hot encoding.
df_X.dtypes

BusinessTravel              object
DistanceFromHome             int64
EducationField              object
EmployeeNumber               int64
EnvironmentSatisfaction      int64
JobInvolvement               int64
JobRole                     object
JobSatisfaction              int64
MonthlyIncome                int64
OverTime                    object
YearsAtCompany               int64
CommunicationSkill           int64
PropWkngYrsToCompanies       int64
PropCurrMgrCompYears       float64
PropCurrRoleCompYears      float64
AgeCut                       int64
Stocks                       int64
dtype: object

In [107]:
# One-hot encode the object-typed columns (selected by position), dropping the first
# level of each to avoid a redundant indicator.
# NOTE(review): the indicator columns are derived from the categories present in this
# test file alone, so their number and order must match what each pickled model was
# fitted on. The AdaBoost prediction cell's recorded output reports a mismatch
# (model expects 39 features, input has 29) - verify against the training notebook.
df_X_cat = pd.get_dummies(df_X.iloc[:,cat_cols], drop_first=True)

In [108]:
# Select the non-object columns by label.
# NOTE(review): per the dtypes shown above these are all int64/float64, so get_dummies
# presumably returns them unchanged - confirm and consider dropping the call.
df_X_num = pd.get_dummies(df_X.loc[:, numerical_type])

In [109]:
# Assemble the model input column-wise: numeric columns first, then the one-hot columns.
df_X_final = pd.concat((df_X_num, df_X_cat), axis= 1)

In [110]:
# Convert the feature frame to a plain NumPy array for the models.
# np.asarray is a no-op on an existing ndarray, so re-running this cell is safe;
# the previous `.values` form raised AttributeError on a second run because the
# name had already been rebound from a DataFrame to an ndarray.
df_X_final = np.asarray(df_X_final)

In [111]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [112]:
def test_model_perf(model, X):
    predictions_df = pd.DataFrame(model.predict_proba(X),columns=['No_Attrition','Attrition'], index=range(1,471))
    predictions_df.No_Attrition = predictions_df.No_Attrition.apply(lambda x: np.around(x,7))
    predictions_df.Attrition = predictions_df.Attrition.apply(lambda x: np.around(x,7))
    predictions_df.index.name = 'Id'
    final_df = predictions_df[['Attrition']]
    return final_df

In [113]:
!mkdir submissions

mkdir: submissions: File exists


In [114]:
pkl_filename = "models/AdaboostA.pkl"
sub_filename = "submissions/AdaboostA.csv"
# The fitted estimator comes straight from the pickle; no placeholder classifier is
# created first, since it was immediately overwritten and of the wrong type.
# SECURITY: pickle.load can execute arbitrary code - only load model files you trust.
# NOTE(review): the recorded output of this cell is a ValueError (model expects 39
# features, input has 29); this pickle was presumably fitted on a different feature
# set than the one built above - confirm against the training notebook.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

ValueError: Number of features of the model must match the input. Model n_features is 39 and input n_features is 29 

In [115]:
pkl_filename = "models/DecisionTreeA.pkl"
sub_filename = "submissions/DecisionTreeA.csv"
# The fitted estimator comes straight from the pickle; no placeholder classifier is
# created first, since it was immediately overwritten and of the wrong type.
# SECURITY: pickle.load can execute arbitrary code - only load model files you trust.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

In [116]:
pkl_filename = "models/RandomForestA.pkl"
sub_filename = "submissions/RandomForestA.csv"
# The fitted estimator comes straight from the pickle; the unfitted placeholder
# classifier that used to be created here was immediately overwritten.
# SECURITY: pickle.load can execute arbitrary code - only load model files you trust.
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)