In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json
import pickle

In [4]:
df = pd.read_csv('data/test.csv')
df.shape

(470, 28)

In [5]:
df.columns

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

### Feature Engineering

In [6]:
def extract_feature(df_input):
    df = df_input.copy()
    df['PropWkngYrsToCompanies'] = pd.cut((df.TotalWorkingYears / (df.NumCompaniesWorked+1)),[-1,2,4,6,8,np.inf], labels=[1,2,3,4,5]).astype('int64')
    df['PropCurrMgrCompYears'] = df.YearsWithCurrManager / (df.YearsAtCompany+1)
    df['PropCurrRoleCompYears'] = df.YearsInCurrentRole / (df.YearsAtCompany+1)
    #df['AgeCut'] = pd.cut(df.Age,bins = [0,25,35,50,np.inf], labels= [1,2,3,4]).astype('int64')
    df['Stocks'] = df.StockOptionLevel.apply(lambda x: 0 if x == 0 else 1)
    cols_to_drop = [
                'Id',
                'Behaviour', 
                'Gender',
                'Education',
                'Department',
                'StockOptionLevel',
                #'Age',
                'PerformanceRating',
                'TrainingTimesLastYear', 
                'YearsInCurrentRole', 
                'YearsSinceLastPromotion', 
                'YearsWithCurrManager',
                'NumCompaniesWorked',
                'TotalWorkingYears',
                'PercentSalaryHike',
                'MaritalStatus',
                'EducationField',
                'YearsAtCompany'
                ]
    df.drop(cols_to_drop, inplace = True, axis = 1)
    print('Columns Dropped : {}'.format(cols_to_drop))
    print('Columns in DataFrame: {}'.format(df.columns.to_list()))
    return df

In [7]:
df_cleaned = extract_feature(df)

Columns Dropped : ['Id', 'Behaviour', 'Gender', 'Education', 'Department', 'StockOptionLevel', 'PerformanceRating', 'TrainingTimesLastYear', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'NumCompaniesWorked', 'TotalWorkingYears', 'PercentSalaryHike', 'MaritalStatus', 'EducationField', 'YearsAtCompany']
Columns in DataFrame: ['Age', 'BusinessTravel', 'DistanceFromHome', 'EmployeeNumber', 'EnvironmentSatisfaction', 'JobInvolvement', 'JobRole', 'JobSatisfaction', 'MonthlyIncome', 'OverTime', 'CommunicationSkill', 'PropWkngYrsToCompanies', 'PropCurrMgrCompYears', 'PropCurrRoleCompYears', 'Stocks']


### Preprocessing and Model Building Init

In [8]:
df_cleaned.drop_duplicates(inplace=True)
df_cleaned.shape

(470, 15)

In [9]:
df_X = df_cleaned.copy()

In [10]:
object_type = df_X.select_dtypes(include='object').columns
object_type

numerical_type = df_X.select_dtypes(exclude='object').columns
numerical_type

cat_cols = []
for c in object_type:
    cat_cols.append(df_X.columns.to_list().index(c))
    
cat_cols

[1, 6, 9]

In [11]:
df_X.head()

Unnamed: 0,Age,BusinessTravel,DistanceFromHome,EmployeeNumber,EnvironmentSatisfaction,JobInvolvement,JobRole,JobSatisfaction,MonthlyIncome,OverTime,CommunicationSkill,PropWkngYrsToCompanies,PropCurrMgrCompYears,PropCurrRoleCompYears,Stocks
0,28,Travel_Rarely,9,377,4,3,Research Scientist,4,2070,No,5,2,0.666667,0.333333,1
1,31,Travel_Rarely,6,653,1,4,Sales Executive,4,5460,No,3,2,0.875,0.875,1
2,37,Travel_Rarely,6,474,3,4,Research Scientist,1,5974,Yes,4,2,0.875,0.875,1
3,42,Travel_Rarely,1,827,4,2,Manufacturing Director,4,6062,Yes,5,1,0.4,0.6,1
4,45,Non-Travel,4,972,3,3,Laboratory Technician,2,4447,No,2,3,0.8,0.7,0


In [12]:
df_X.dtypes

Age                          int64
BusinessTravel              object
DistanceFromHome             int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
JobInvolvement               int64
JobRole                     object
JobSatisfaction              int64
MonthlyIncome                int64
OverTime                    object
CommunicationSkill           int64
PropWkngYrsToCompanies       int64
PropCurrMgrCompYears       float64
PropCurrRoleCompYears      float64
Stocks                       int64
dtype: object

In [13]:
df_X_cat = pd.get_dummies(df_X.iloc[:,cat_cols], drop_first=True)

In [14]:
df_X_num = pd.get_dummies(df_X.loc[:, numerical_type])

In [15]:
df_X_final = pd.concat((df_X_num, df_X_cat), axis= 1)

In [16]:
df_X_final = df_X_final.values

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn import metrics

In [18]:
def test_model_perf(model, X):
    predictions_df = pd.DataFrame(model.predict_proba(X),columns=['No_Attrition','Attrition'], index=range(1,471))
    predictions_df.No_Attrition = predictions_df.No_Attrition.apply(lambda x: np.around(x,7))
    predictions_df.Attrition = predictions_df.Attrition.apply(lambda x: np.around(x,7))
    predictions_df.index.name = 'Id'
    final_df = predictions_df[['Attrition']]
    return final_df

In [19]:
!mkdir submissions

mkdir: submissions: File exists


In [23]:
pkl_filename = "models/AdaboostA.pkl"
sub_filename = "submissions/AdaboostA.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

In [22]:
pkl_filename = "models/DecisionTreeA.pkl"
sub_filename = "submissions/DecisionTreeA.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)

In [21]:
pkl_filename = "models/RandomForestA.pkl"
sub_filename = "submissions/RandomForestA.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, df_X_final)
predictions_df.to_csv(sub_filename)