In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import json

In [2]:
df = pd.read_csv('data/test.csv')

In [3]:
df.shape

(470, 28)

In [4]:
df.columns

Index(['Id', 'Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeNumber',
       'EnvironmentSatisfaction', 'Gender', 'JobInvolvement', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'OverTime', 'PercentSalaryHike',
       'PerformanceRating', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'CommunicationSkill',
       'Behaviour'],
      dtype='object')

In [5]:
cols_to_drop = ['Id', 'Gender', 'Behaviour', 'MaritalStatus', 'Education', 'Department', 'PerformanceRating',
                'TrainingTimesLastYear', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']
df_cleaned = df.drop(cols_to_drop, axis = 1)
len(df_cleaned.columns)

17

In [6]:
object_type = df_cleaned.select_dtypes(include='object').columns
with open('data/object_col_unique_vals.json', 'r') as fp:
    object_type_vals = json.load(fp)
object_type_vals

{'BusinessTravel': ['Non-Travel', 'Travel_Rarely', 'Travel_Frequently'],
 'EducationField': ['Medical',
  'Life Sciences',
  'Other',
  'Marketing',
  'Technical Degree',
  'Human Resources'],
 'JobRole': ['Laboratory Technician',
  'Manufacturing Director',
  'Sales Executive',
  'Research Scientist',
  'Sales Representative',
  'Healthcare Representative',
  'Research Director',
  'Human Resources',
  'Manager'],
 'OverTime': ['No', 'Yes']}

In [7]:
df_test = df_cleaned.drop(object_type_vals, axis=1)

In [8]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(handle_unknown='error', sparse=False, drop='first')
encoder = ohe.fit(df_cleaned[object_type])
object_type_onehot_arr = encoder.transform(df_cleaned[object_type])

In [9]:
X = np.hstack((df_test.values,object_type_onehot_arr))

In [10]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

import pickle

In [12]:
def test_model_perf(model, X):
    predictions_df = pd.DataFrame(model.predict_proba(X),columns=['No_Attrition','Attrition'], index=range(1,471))
    predictions_df.No_Attrition = predictions_df.No_Attrition.apply(lambda x: np.around(x,7))
    predictions_df.Attrition = predictions_df.Attrition.apply(lambda x: np.around(x,7))
    predictions_df.index.name = 'Id'
    final_df = predictions_df[['Attrition']]
    return final_df

### Generating Testing files for models

In [41]:
!mkdir submissions

mkdir: submissions: File exists


Model : RandomForest2.0

In [13]:
pkl_filename = "models/RandomForest2.0.pkl"
sub_filename = "submissions/RandomForest2.0.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, X)
predictions_df.to_csv(sub_filename)

Model : RandomForest1.0

In [42]:
pkl_filename = "models/RandomForest1.0.pkl"
sub_filename = "submissions/RandomForest1.0.csv"
model = RandomForestClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, X)
predictions_df.to_csv(sub_filename)

Model: MLP1.0

In [43]:
pkl_filename = "models/MLP1.0.pkl"
sub_filename = "submissions/MLP1.0.csv"
model = MLPClassifier()
with open(pkl_filename, 'rb') as file:
    model = pickle.load(file)
predictions_df = test_model_perf(model, X)
predictions_df.to_csv(sub_filename)