### 1. Importing libraries and models

In [1]:
import numpy as np

import pandas as pd
pd.set_option('display.max_columns',100)

import pickle

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
# Restore the fitted model that was serialised to disk.
# NOTE(review): unpickling can run arbitrary code, so only load trusted files.
with open('final_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

### 2. Checking validity of saved model

In [3]:
# Load the analytical base table used below to sanity-check the saved model.
# NOTE(review): presumably the same table the model was trained on -- confirm.
df = pd.read_csv('analytical_base_table.csv')

In [4]:
# Separate the target column from the features, then hold out 20% of the rows
# with a fixed seed, stratifying on the target.
# NOTE(review): presumably mirrors the split used when the model was trained -- confirm.
y = df['status']
X = df.drop(columns='status')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
    stratify=y,
)

In [5]:
# Keep only the second predict_proba column for every held-out row.
pred = [row[1] for row in model.predict_proba(X_test)]

# Score those probabilities against the held-out labels.
print('AUROC:', roc_auc_score(y_test, pred))

AUROC: 0.9915194952019338


  Xt = transform.transform(Xt)


### 3. Writing pre-modeling functions

In [6]:
def clean_data(df):
    """Apply the cleaning steps to a raw employee DataFrame.

    Steps, in order:
      1. drop duplicate rows;
      2. drop rows whose ``department`` equals ``'temp'``;
      3. fill missing ``filed_complaint`` and ``recently_promoted`` with 0;
      4. rename department ``'information_technology'`` to ``'IT'`` and label
         missing departments ``'Missing'``;
      5. add ``last_evaluation_missing`` (1 where ``last_evaluation`` is null)
         and then fill the missing ``last_evaluation`` values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``department``, ``filed_complaint``,
        ``recently_promoted`` and ``last_evaluation`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input frame is left untouched.
    """
    df = df.drop_duplicates()
    # Take an explicit copy of the filtered rows so the column assignments
    # below write to an independent frame rather than to a slice.
    df = df[df.department != 'temp'].copy()

    # Reassign columns instead of calling fillna/replace(inplace=True) on a
    # column accessed from the frame: that chained in-place form raises
    # warnings and does not update the frame under pandas Copy-on-Write.
    df['filed_complaint'] = df['filed_complaint'].fillna(0)
    df['recently_promoted'] = df['recently_promoted'].fillna(0)
    df['department'] = (
        df['department']
        .replace(['information_technology'], 'IT')
        .fillna('Missing')
    )

    # The missingness flag has to be captured before the nulls are filled.
    df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
    df['last_evaluation'] = df['last_evaluation'].fillna(0)
    return df

In [7]:
# Read the unseen raw records and run them through the cleaning function.
raw_data = pd.read_csv('unseen_raw_data.csv')
cleaned_data = raw_data.pipe(clean_data)
cleaned_data.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,high,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,low,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,low,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,high,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,low,0.441219,3.0,0


In [8]:
def engineer_features(df):
    """Add the engineered indicator features and one-hot encode categoricals.

    New 0/1 columns:
      * ``underperformer`` -- ``last_evaluation`` < 0.6 and the evaluation was
        not missing (``last_evaluation_missing == 0``);
      * ``unhappy`` -- ``satisfaction`` < 0.2;
      * ``overachiever`` -- ``last_evaluation`` > 0.8 and ``satisfaction`` > 0.7.

    ``department`` and ``salary`` are then replaced by dummy columns via
    ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``last_evaluation``, ``last_evaluation_missing``,
        ``satisfaction``, ``department`` and ``salary`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra features; the input frame is not modified.

    Notes
    -----
    The dummy columns depend on the categories present in ``df``.
    NOTE(review): data lacking a category will yield fewer columns; confirm
    the downstream model tolerates that or align the columns before
    predicting.
    """
    was_evaluated = df.last_evaluation_missing == 0
    # assign() builds a new frame, so the caller's DataFrame is not mutated
    # (the original added these columns to the argument in place).
    with_flags = df.assign(
        underperformer=((df.last_evaluation < 0.6) & was_evaluated).astype(int),
        unhappy=(df.satisfaction < 0.2).astype(int),
        overachiever=((df.last_evaluation > 0.8) & (df.satisfaction > 0.7)).astype(int),
    )
    return pd.get_dummies(with_flags, columns=['department', 'salary'])

In [9]:
# Derive the engineered features and dummy columns from the cleaned records.
augmented_data = cleaned_data.pipe(engineer_features)
augmented_data.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0


In [10]:
# Score the engineered unseen data with the loaded model and show the
# first five rows of the returned probability array.
pred = model.predict_proba(augmented_data)
print(pred[:5])

[[1.   0.  ]
 [0.98 0.02]
 [1.   0.  ]
 [1.   0.  ]
 [0.   1.  ]]


  Xt = transform.transform(Xt)


### 4. Constructing model class

In [11]:
class EmployeeStatus:
    """Bundle the pickled model with its cleaning and feature-engineering steps.

    ``predict_proba`` optionally cleans and augments raw input before handing
    it to the loaded model, so raw records can be scored in one call.
    """

    def __init__(self, model_location):
        """Load the pickled model stored at ``model_location``.

        Parameters
        ----------
        model_location : str or path-like
            Path of the pickle file; opened in binary read mode.
        """
        # NOTE(review): pickle.load can execute arbitrary code while
        # deserialising -- only point this at model files you trust.
        with open(model_location, 'rb') as f:
            self.model = pickle.load(f)

    def predict_proba(self, X_new, clean=True, augment=True):
        """Prepare ``X_new`` and score it with the loaded model.

        Parameters
        ----------
        X_new : pandas.DataFrame
            Input records.
        clean : bool, default True
            Run ``clean_data`` on the input first.
        augment : bool, default True
            Run ``engineer_features`` on the (possibly cleaned) input.

        Returns
        -------
        tuple
            ``(X_new, probabilities)`` -- the frame actually passed to the
            model, and whatever ``self.model.predict_proba`` returned for it.
        """
        if clean:
            X_new = self.clean_data(X_new)

        if augment:
            X_new = self.engineer_features(X_new)

        return X_new, self.model.predict_proba(X_new)

    def clean_data(self, df):
        """Return a cleaned copy of ``df``.

        Drops duplicate rows and rows whose ``department`` is ``'temp'``,
        fills missing ``filed_complaint`` / ``recently_promoted`` with 0,
        renames department ``'information_technology'`` to ``'IT'``, labels
        missing departments ``'Missing'``, adds ``last_evaluation_missing``
        (1 where ``last_evaluation`` is null) and fills missing
        ``last_evaluation`` with 0. The input frame is left untouched.
        """
        df = df.drop_duplicates()
        # Explicit copy so the assignments below write to an independent
        # frame rather than to a slice of the filtered data.
        df = df[df.department != 'temp'].copy()

        # Column reassignment replaces the original chained
        # ``df.col.fillna(..., inplace=True)`` calls, which raise warnings and
        # do not update the frame under pandas Copy-on-Write.
        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)
        df['department'] = (
            df['department']
            .replace(['information_technology'], 'IT')
            .fillna('Missing')
        )

        # Capture the missingness flag before the nulls are filled.
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
        df['last_evaluation'] = df['last_evaluation'].fillna(0)
        return df

    def engineer_features(self, df):
        """Return a new frame with indicator features and dummy columns.

        Adds 0/1 columns ``underperformer`` (``last_evaluation`` < 0.6 and
        evaluation not missing), ``unhappy`` (``satisfaction`` < 0.2) and
        ``overachiever`` (``last_evaluation`` > 0.8 and ``satisfaction`` >
        0.7), then one-hot encodes ``department`` and ``salary`` with
        ``pd.get_dummies``. The input frame is not modified.

        NOTE(review): the dummy columns depend on the categories present in
        ``df``; confirm the model tolerates inputs lacking a category.
        """
        was_evaluated = df.last_evaluation_missing == 0
        # assign() returns a new frame, so the caller's data is not mutated.
        with_flags = df.assign(
            underperformer=((df.last_evaluation < 0.6) & was_evaluated).astype(int),
            unhappy=(df.satisfaction < 0.2).astype(int),
            overachiever=((df.last_evaluation > 0.8) & (df.satisfaction > 0.7)).astype(int),
        )
        return pd.get_dummies(with_flags, columns=['department', 'salary'])

### 5. Making an executable Python script that can be run from the command line

The script takes three positional arguments (`data_location`, `output_location` (e.g. `predictions.csv`) and `model_location`) and two keyword arguments.

### 6. Checking the predictions in output dataframe

In [12]:
# Load the predictions file so its contents can be inspected.
# NOTE(review): presumably written by the command-line script from section 5 -- confirm.
predictions = pd.read_csv('predictions.csv')

predictions.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,satisfaction,tenure,last_evaluation_missing,underperformer,unhappy,overachiever,department_IT,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,salary_high,salary_low,salary_medium,prediction
0,228,0.0,0.735618,2,0.0,0.805661,3.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0.0
1,229,0.0,1.0,4,0.0,0.719961,4.0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0.02
2,196,1.0,0.557426,4,0.0,0.749835,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0.0
3,207,0.0,0.715171,3,0.0,0.987447,3.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.0
4,129,0.0,0.484818,2,0.0,0.441219,3.0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1.0
