In [1]:
from __future__ import print_function

import numpy as np

import pandas as pd

import pickle

import sklearn

from sklearn.metrics import roc_auc_score

In [2]:
# Restore the fitted pipeline that the previous notebook pickled to disk.
# NOTE: pickle can execute arbitrary code on load -- only open trusted files.
with open('final_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [3]:
# Display the unpickled object's repr to confirm what was loaded.
model

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.33, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=123,
            verbose=0, warm_start=False))])

In [6]:
class EmployeeRetentionModel:
    """Bundle a pickled classifier with the cleaning and feature steps it needs.

    The instance loads a fitted model from disk and exposes ``predict_proba``,
    which can optionally clean raw records and engineer features before
    delegating to the model's own ``predict_proba``.
    """

    def __init__(self, model_location):
        """Load the fitted model stored at ``model_location``.

        Parameters
        ----------
        model_location : str
            Path to a pickle file containing an object with a
            ``predict_proba`` method.

        Notes
        -----
        SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
        file. Only point this at model artifacts from a trusted source.
        """
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, featEngg=True):
        """Preprocess ``X_new`` as requested and score it with the model.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Records to score. The caller's frame is never modified.
        clean : bool, default True
            Run ``clean_data`` first.
        featEngg : bool, default True
            Run ``engineer_features`` after the optional cleaning.

        Returns
        -------
        tuple
            ``(X_new, probabilities)`` where ``X_new`` is the frame actually
            passed to the model and ``probabilities`` is whatever
            ``self.model.predict_proba`` returns for it.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if featEngg:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of ``df``; the input frame is left untouched.

        Steps: drop duplicate rows, drop rows whose department is ``'temp'``,
        fill missing ``filed_complaint`` / ``recently_promoted`` with 0, add a
        ``last_evaluation_missing`` indicator and fill ``last_evaluation`` with
        0, label missing departments ``'Missing'`` and rename
        ``'information_technology'`` to ``'IT'``.
        """
        # Remove duplicates.
        df = df.drop_duplicates()

        # Remove temp employees. The explicit copy makes the column
        # assignments below act on an independent frame instead of a view of
        # the caller's data (no SettingWithCopyWarning).
        df = df[df['department'] != 'temp'].copy()

        # Fix NaN values. Assigning the result back (rather than calling
        # fillna(inplace=True) on an attribute-accessed column) is required for
        # the fill to take effect under pandas copy-on-write.
        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # For last_evaluation, keep an indicator of the missing entry before
        # overwriting the NaN with 0.
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
        df['last_evaluation'] = df['last_evaluation'].fillna(0)

        # Missing departments get their own label; 'information_technology'
        # should be 'IT'.
        df['department'] = (
            df['department']
            .fillna('Missing')
            .replace(['information_technology'], 'IT')
        )

        # Return cleaned dataframe.
        return df

    def engineer_features(self, df):
        """Return a copy of ``df`` with indicator and dummy features added.

        Adds ``underperformers``, ``unhappy`` and ``overachievers`` flags, then
        one-hot encodes ``salary`` and ``department``. Expects the
        ``last_evaluation_missing`` column produced by ``clean_data``.
        """
        # Work on a copy so the caller's frame does not silently gain columns
        # (matters when this is called directly or with clean=False).
        df = df.copy()

        # Create indicator features.
        df['underperformers'] = (
            (df['last_evaluation'] < 0.6) & (df['last_evaluation_missing'] == 0)
        ).astype(int)

        df['unhappy'] = (df['satisfaction'] < 0.2).astype(int)

        df['overachievers'] = (
            (df['last_evaluation'] > 0.8) & (df['satisfaction'] > 0.7)
        ).astype(int)

        # Create new dataframe with dummy features.
        # NOTE(review): get_dummies only emits columns for categories present
        # in ``df``; presumably the model expects the training-time column set,
        # so small batches lacking a category may not line up -- confirm.
        df = pd.get_dummies(df, columns=['salary', 'department'])

        # Return augmented DataFrame.
        return df

In [7]:
# Create an instance of the class above; its constructor unpickles the
# model stored in 'final_model.pkl'.
retention_model = EmployeeRetentionModel('final_model.pkl')

In [8]:
# Read the unseen raw records that will be scored below.
raw_data = pd.read_csv('unseen_raw_data.csv')

# Report the frame's dimensions, then preview its first rows.
print(raw_data.shape)
raw_data.head()

(750, 9)


Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


In [10]:
# Clean, feature-engineer and score the raw records. predict_proba returns
# both the processed frame that was fed to the model and the probabilities.
abt_data, pred1 = retention_model.predict_proba(raw_data, clean=True, featEngg=True)

In [11]:
# Dimensions of the processed frame after cleaning and feature engineering.
abt_data.shape

(740, 25)

In [12]:
# Preview the first rows of the processed frame.
abt_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformers,unhappy,...,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,...,0,0,0,0,1,0,0,0,0,0


In [13]:
# Show the probability output returned by the model for the processed rows.
pred1

array([[ 0.995,  0.005],
       [ 0.955,  0.045],
       [ 1.   ,  0.   ],
       ..., 
       [ 1.   ,  0.   ],
       [ 1.   ,  0.   ],
       [ 1.   ,  0.   ]])

In [18]:
# Keep only the second entry (index 1) of every probability row.
# NOTE(review): presumably index 1 is the positive class -- confirm against
# the model's class ordering.
pred = [row[1] for row in pred1]
print(pred)

[0.0050000000000000001, 0.044999999999999998, 0.0, 0.0, 1.0, 0.035000000000000003, 0.0, 0.014999999999999999, 0.0, 1.0, 0.014999999999999999, 0.0, 0.40999999999999998, 0.014999999999999999, 0.0050000000000000001, 0.97499999999999998, 0.0050000000000000001, 0.10000000000000001, 0.01, 0.0, 0.0, 0.014999999999999999, 0.095000000000000001, 0.014999999999999999, 0.0, 1.0, 0.014999999999999999, 0.035000000000000003, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02, 0.0050000000000000001, 1.0, 0.98499999999999999, 0.0050000000000000001, 0.0050000000000000001, 0.01, 0.014999999999999999, 0.0050000000000000001, 0.074999999999999997, 1.0, 0.095000000000000001, 0.040000000000000001, 0.0, 0.059999999999999998, 0.45500000000000002, 0.014999999999999999, 0.11, 0.0, 0.0, 0.0, 0.025000000000000001, 0.995, 0.029999999999999999, 0.01, 0.0, 0.055, 0.0050000000000000001, 0.02, 0.0, 0.035000000000000003, 0.050000000000000003, 1.0, 0.059999999999999998, 0.0050000000000000001, 0.040000000000000001, 0.98999999999999999, 0.03500

In [19]:
# Preview the last rows of the processed frame.
abt_data.tail()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformers,unhappy,...,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support
745,211,0.0,0.599134,4,0.0,0.94614,3.0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
746,136,0.0,0.0,2,0.0,0.393581,3.0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
747,258,0.0,0.809516,4,0.0,0.913363,2.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
748,197,1.0,0.774142,3,0.0,0.682195,3.0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
749,190,0.0,0.533225,3,0.0,0.552628,3.0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
