In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
data = pd.read_excel("data/Attrition Data Exercise.xlsx")

# Initial analysis for key findings

In [None]:
data

In [None]:
data.shape

In [None]:
sns.histplot(data=data, hue='Attrition', x='BusinessTravel', stat='count', multiple='dodge')

In [None]:
numerical_features = ["Age","DailyRate","DistanceFromHome","Education","EnvironmentSatisfaction","JobInvolvement","JobLevel","JobSatisfaction","MonthlyIncome","NumCompaniesWorked","PercentSalaryHike","PerformanceRating",
                      "RelationshipSatisfaction", "StandardHours", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear",
                      "WorkLifeBalance", "Company Tenure (yrs)", "YearsInCurrentRole", "YearsSinceLastPromotion", "YearsWithCurrManager"]

In [None]:
data[['DailyRate','HourlyRate','MonthlyRate','MonthlyIncome']].corr()

In [None]:
# Income potential
Income_related_features = ['DailyRate','HourlyRate','MonthlyRate','MonthlyIncome'] 

"""
1. No correlation among them.
2. Only MonthlyIncome is interesting. People with lower income have more attrition -> Get statistic
"""

# Only related to current job role work
Current_job_role_related = ['JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',"OverTime",'StandardHours',]

"""
1. More prob of attrition when worked overtime, 2-3x
2. Job dissatisfaction + overtime = recipe for disaster
3. JobInvolvement is a better predictor compared to JobSatisfaction when looking at churn. 3-4x
"""

# Other than income how they are rewarded
Reward_and_career_advancement_related = ['StockOptionLevel',"YearsSinceLastPromotion"]

"""
1. Low and high stock option level options are 2-3x more likely to attrite.
2. When job involvement is low, people with low stock option are 5x more likely to attrite.
3. People with lower RSU and monthly income are more likely to attrite
"""

# Overeall career
Career_related_fields = ["Company Tenure (yrs)_Bucket","TotalWorkingYears", "NumCompaniesWorked"] # Removed Company Tenure (yrs)

"""
1. People who have switched more than 4 companies, are generally 2x more probably to attrite.
2. Max. prob. of attrition is people who have less TWY and less NumCompaniesWorked
3. At lower job inv levels, people between 0-5 years company tenure are much more probable, 2-3x to attrite. Next wave is after 15 years
"""

# Personal factors
Personal_factors = ["Age","DistanceFromHome","Gender","MaritalStatus","RelationshipSatisfaction","Location"] # REmoved RelationshipSatisfaction_bucket

"""
1. Younger, single people are more likely to attrite
2. Gender, DistanceFromHome, Location have less impact individually
3. Beyond 10 years in company, the level of attrition in USA is more than double that of other two regions - India and China
"""

Other_Job_related_features = ["YearsWithCurrManager","YearsInCurrentRole","PercentSalaryHike","PerformanceRating","WorkLifeBalance"] # Removed JobInvolvement_bucket, JobSatisfaction_bucket and WorkLifeBalance_bucket

"""
1. Younger people with poor WLB resign more often.
2. Not much more
"""


# Education
Education_related_features = ["Education","EducationField"] # Removed Educ_bucket

"""
No such pattern found
"""

# To do with company culture and outlook
Company_related = ["TrainingTimesLastYear",'Department',"BusinessTravel",]

"""
1. People who travel frequently are 3 times more likely to attrite than non travellers.
2. Overtime and job involvement not so impactful in attriting.
"""


In [None]:
Irrelevant_features = ["Over18", "EmployeeNumber", "EmployeeCount"]

In [None]:
data[Company_related]

In [None]:
i=-1
data[data['Attrition']=='Yes'][Other_Job_related_features[i]].value_counts() /data[Other_Job_related_features[i]].value_counts() * 100

In [None]:
i=3
j=2
data[data['Attrition']=='Yes'][[Other_Job_related_features[i],Other_Job_related_features[j]]].value_counts() /data[[Other_Job_related_features[i],Other_Job_related_features[j]]].value_counts() * 100

In [None]:
twy=pd.qcut(data['TotalWorkingYears'], q=5, labels=np.arange(5))
data['twy']=twy

In [None]:
data[data['Attrition']=='Yes'][['WorkLifeBalance','twy']].value_counts() /data[['WorkLifeBalance','twy']].value_counts() * 100

In [None]:
plt.figure(figsize=(20,10))
sns.boxplot(data=data, x='RelationshipSatisfaction', y='Attrition')

# Trying PCA
2 components
1. RelationshipSatisfaction - 69% var
2. TrainingTimesLastYear, Company Tenure (yrs), HourlyRate - 30% var

In [None]:
data

In [None]:
feats=['Age', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField',
        'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance',
       'Company Tenure (yrs)','YearsInCurrentRole', 'YearsSinceLastPromotion', 
       'YearsWithCurrManager','Location']

In [None]:
data[feats]

In [None]:
categorical_feats = ["BusinessTravel","Department","EducationField","JobRole","MaritalStatus","OverTime","Gender","Location"]

In [None]:
numerical_feats = list(set(feats).difference(set(categorical_feats)))

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

ct=ColumnTransformer([('encoder',OrdinalEncoder(),categorical_feats)], remainder='passthrough')

In [None]:
nm_data=ct.fit_transform(data[numerical_feats+categorical_feats])

In [None]:
len(numerical_feats)

In [None]:
nm_data.shape

In [None]:
from sklearn.decomposition import PCA
pca=PCA(n_components=0.9, svd_solver='full')

In [None]:
transformed_data = pca.fit_transform(nm_data)

In [None]:
transformed_data.shape

In [None]:
pca.explained_variance_ratio_

In [None]:
for i in np.arange(31):
    corr_value=np.corrcoef(nm_data[:,i], transformed_data[:,1])[0][1]
    if corr_value>0.7:
        print("high",i)
    if corr_value<-0.7:
        print("low",i)


In [None]:
numerical_feats[22]

In [None]:
numerical_feats[8], numerical_feats[13], numerical_feats[15]

# Feature importance

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_classif

In [None]:
cat_feat_selector=SelectKBest(k=5,score_func=chi2)

In [None]:
cat_feat_selector.fit_transform(nm_data[:,23:], data['Attrition'])

In [None]:
pd.Series(index=categorical_feats, data=cat_feat_selector.scores_).sort_values(ascending=False)

In [None]:
num_feat_selector=SelectKBest(k=5,score_func=f_classif)
num_feat_selector.fit_transform(nm_data[:,:23], data['Attrition'])

In [None]:
pd.Series(index=numerical_feats, data=num_feat_selector.scores_).sort_values(ascending=False)

# Modelling

In [5]:
categorical_feats = ["BusinessTravel","Department","EducationField","JobRole","MaritalStatus","OverTime","Gender","Location"]
numerical_feats = ['JobLevel', 'StockOptionLevel', 'PercentSalaryHike', 'EnvironmentSatisfaction',
 'PerformanceRating', 'MonthlyIncome', 'JobSatisfaction', 'TotalWorkingYears', 'TrainingTimesLastYear', 'YearsWithCurrManager',
 'DistanceFromHome', 'DailyRate', 'YearsSinceLastPromotion', 'Company Tenure (yrs)', 'MonthlyRate', 'HourlyRate', 'YearsInCurrentRole',
 'WorkLifeBalance', 'Age', 'Education', 'JobInvolvement', 'NumCompaniesWorked', 'RelationshipSatisfaction']

In [6]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [72]:
X_train, X_test, y_train, y_test = train_test_split(data[numerical_feats+categorical_feats], data['Attrition'], stratify=data['Attrition'], random_state=42)

In [73]:
data_pipeline = ColumnTransformer([('encoder',OrdinalEncoder(),categorical_feats), ('scaler',StandardScaler(),numerical_feats)])
data_pipeline.fit(X_train)
X_train = data_pipeline.transform(X_train)
X_test = data_pipeline.transform(X_test)
label_enc = LabelEncoder().fit(y_train)
y_train = label_enc.transform(y_train)
y_test = label_enc.transform(y_test)

In [74]:
label_enc = LabelEncoder().fit(y_train)
y_train = label_enc.transform(y_train)
y_test = label_enc.transform(y_test)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report, roc_curve, precision_score, recall_score, roc_auc_score

In [76]:
models = [GaussianNB(), # Can't be helped
          BernoulliNB(), # Can't be helped
          LogisticRegression(), # Improved F1 to 0.49
          SVC(kernel='poly', degree=3), # Does not seem to be a good prospect
          DecisionTreeClassifier(class_weight={0:0.84,1:0.16}), 
          RandomForestClassifier(class_weight={0:0.84,1:0.16}), 
          AdaBoostClassifier(), # Improved f1 and recall. Recall not upto the mark of LR
          GradientBoostingClassifier(), # Improved. But not good enough
          XGBClassifier(class_weight={0:0.84,1:0.16})]

In [77]:
for model in models:
    model.fit(X_train,y_train)
    y_h=model.predict(X_test)
    print(f1_score(y_test, y_h))

0.46835443037974683
0.40707964601769914
0.4137931034482759
0.29333333333333333
0.32000000000000006
0.28571428571428575
0.4666666666666666
0.3037974683544304
Parameters: { "class_weight" } are not used.

0.3902439024390244


In [78]:
m=GaussianNB()
m.fit(X_train, y_train)
y_pred=m.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred.astype('int')))
print(recall_score(y_test, y_pred.astype('int')))

              precision    recall  f1-score   support

           0       0.92      0.80      0.85       309
           1       0.37      0.63      0.47        59

    accuracy                           0.77       368
   macro avg       0.65      0.71      0.66       368
weighted avg       0.83      0.77      0.79       368

0.46835443037974683
0.6271186440677966


### Trying to improve Logistic Regression

In [79]:
data['Attrition'].value_counts()/data.shape[0]

Attrition
No     0.838776
Yes    0.161224
Name: count, dtype: float64

In [81]:
m=LogisticRegression(class_weight={0:0.84,1:0.16}, penalty='elasticnet', solver='saga', l1_ratio=0.9, C=0.96).fit(X_train,y_train)
fpr,tpr,thresh=roc_curve(y_train, m.predict_proba(X_train)[:,1], )
pos=np.argmax(tpr-fpr)
thr=thresh[pos]
y_pred = m.predict_proba(X_test)[:,1] > thr
print(classification_report(y_test, y_pred.astype('int')))
print(recall_score(y_test, y_pred.astype('int')))
print(roc_auc_score(y_test, y_pred.astype('int')))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       309
           1       0.39      0.64      0.49        59

    accuracy                           0.78       368
   macro avg       0.66      0.73      0.67       368
weighted avg       0.84      0.78      0.80       368

0.6440677966101694
0.7265646426416543


In [None]:
precision = tp/tp+fp
recall = tp/tp+fn

I am okay with high fp but not high fn
So I want high recall and low precision

## Hyper parameter tuning

In [135]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from functools import partial
from hyperopt.base import STATUS_OK
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

In [9]:
import warnings
warnings.filterwarnings('ignore')

In [42]:
X=data[numerical_feats+categorical_feats]
y=data['Attrition']

In [19]:
X = data_pipeline.fit_transform(data)
y = label_enc.fit_transform(data['Attrition'])

In [204]:
def optimize(params):

    model=GradientBoostingClassifier(random_state=42)
    k=KFold()
    mapes=[]
    
    for idx in k.split(X=X,y=y):
        train_idx,test_idx=idx[0],idx[1]
        
        xtrain=X[train_idx]
        ytrain=y[train_idx]

        xtest=X[test_idx]
        ytest=y[test_idx]

        model.fit(xtrain,ytrain)
        preds=model.predict(xtest)
        
        fold_acc = recall_score(ytest, preds)
        mapes.append(fold_acc)
    
    return -np.mean(mapes)
    
#3
X = data_pipeline.fit_transform(data)
y = label_enc.fit_transform(data['Attrition'])

#4
rf_space = {
        "max_features": scope.float(hp.uniform('max_features',0.001,1)),
        "learning_rate": scope.float(hp.uniform('learning_rate',0.001,1)),
        "n_estimators": scope.int(hp.uniform('n_estimators',1,100)),
        "max_depth": scope.int(hp.uniform('max_depth',1,5)),
    }

#5
def score_hyperparams(params):
    scor = optimize(params)
    return {'loss':scor, 'status':STATUS_OK}

#6
trials = Trials()

result = fmin(
            fn=score_hyperparams,
            space=rf_space,
            max_evals=500,
            trials=trials,
            algo=tpe.suggest
        ) 
print(result)

100%|██████████| 500/500 [16:49<00:00,  2.02s/trial, best loss: -0.3184362685213916]  
{'learning_rate': 0.224924058953612, 'max_depth': 2.5653219096919924, 'max_features': 0.6886608621156498, 'n_estimators': 86.64905471492598}


In [10]:
from sklearn.model_selection import GridSearchCV

In [199]:
param_grid = {'learning_rate': np.arange(0.01,1,0.01),  
              'n_estimators':np.arange(1,100,1),
              'max_depth':np.arange(1,5,1),
              'max_features':np.arange(0.1,1,0.1)} 

In [200]:
model=GradientBoostingClassifier(random_state=42)
grid = GridSearchCV(model, param_grid, refit = True, n_jobs=-1, scoring='recall')

In [273]:
m=GradientBoostingClassifier(random_state=42, learning_rate=0.9, n_estimators=6000, max_features=0.5, max_depth=1)
m.fit(X_train, y_train)
y_pred=m.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred.astype('int')))
print(recall_score(y_test, y_pred.astype('int')))

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       309
           1       0.52      0.44      0.48        59

    accuracy                           0.85       368
   macro avg       0.71      0.68      0.69       368
weighted avg       0.84      0.85      0.84       368

0.47706422018348627
0.4406779661016949


In [252]:
m=AdaBoostClassifier(random_state=42, learning_rate=0.6, n_estimators=5000)
m.fit(X_train, y_train)
y_pred=m.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred.astype('int')))
print(recall_score(y_test, y_pred.astype('int')))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       309
           1       0.57      0.46      0.51        59

    accuracy                           0.86       368
   macro avg       0.74      0.70      0.71       368
weighted avg       0.85      0.86      0.85       368

0.509433962264151
0.4576271186440678


In [279]:
m=XGBClassifier(random_state=42, learning_rate=0.09, n_estimators=10000, max_depth=2)
m.fit(X_train, y_train)
y_pred=m.predict(X_test)
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred.astype('int')))
print(recall_score(y_test, y_pred.astype('int')))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92       309
           1       0.59      0.27      0.37        59

    accuracy                           0.85       368
   macro avg       0.73      0.62      0.64       368
weighted avg       0.83      0.85      0.83       368

0.37209302325581395
0.2711864406779661


## With sampling/splitting

In [2]:
X,y=data[numerical_feats+categorical_feats],data['Attrition']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,test_size=0.1, random_state=42)

data_pipeline = ColumnTransformer([('encoder',OrdinalEncoder(),categorical_feats), ('scaler',StandardScaler(),numerical_feats)])
data_pipeline.fit(X_train)
X_train = data_pipeline.transform(X_train)
X_val = data_pipeline.transform(X_val)
X_test = data_pipeline.transform(X_test)
label_enc = LabelEncoder().fit(y_train)
y_train = label_enc.transform(y_train)
y_val = label_enc.transform(y_val)
y_test = label_enc.transform(y_test)

NameError: name 'data' is not defined

In [48]:
nos=np.where(y_train==0)[0]
yes=np.where(y_train==1)[0]

print(len(nos) / len(yes))
print(len(yes)*5 - len(nos))

size=4+len(yes)
size

5.102564102564102
-20


199

In [121]:
indices1 = np.random.choice(nos, size=size, replace=False).tolist()
indices2=np.random.choice(list(set(nos).difference(set(indices1))), size=size, replace=False).tolist()
indices3=np.random.choice(list(set(nos).difference(set(indices1).union(set(indices2)))), size=size, replace=False).tolist()
indices4=np.random.choice(list(set(nos).difference(set(indices1).union(set(indices2).union(set(indices3))))), size=size, replace=False).tolist()
indices5=list(set(nos).difference(set(indices1).union(set(indices2).union(set(indices3).union(set(indices4))))))

X_train1,y_train1=X_train[list(indices1)+list(yes)], y_train[list(indices1)+list(yes)]
X_train2,y_train2=X_train[list(indices2)+list(yes)], y_train[list(indices2)+list(yes)]
X_train3,y_train3=X_train[list(indices3)+list(yes)], y_train[list(indices3)+list(yes)]
X_train4,y_train4=X_train[list(indices4)+list(yes)], y_train[list(indices4)+list(yes)]
X_train5,y_train5=X_train[list(indices5)+list(yes)], y_train[list(indices5)+list(yes)]


In [None]:
lr1=LogisticRegression(random_state=42).fit(X_train1,y_train1)
lr2=LogisticRegression(random_state=42).fit(X_train2,y_train2)
lr3=LogisticRegression(random_state=42).fit(X_train3,y_train3)
lr4=LogisticRegression(random_state=42).fit(X_train4,y_train4)
lr5=LogisticRegression(random_state=42).fit(X_train5,y_train5)

In [51]:
thrs=[]
for m in [lr1,lr2, lr3, lr4, lr5]:
    fpr,tpr,thresh=roc_curve(y_val, m.predict_proba(X_val)[:,1], )
    pos=np.argmax(tpr-fpr)
    thr=thresh[pos]
    print(thr)
    thrs.append(thr)

0.6303990816125558
0.8754494040192364
0.5136446930074011
0.6299445454037033
0.6321921591563339


In [123]:
lrs=[lr1,lr2,lr3, lr4, lr5]
thresholds=thrs

In [184]:
def pred_LR(df):

    if df.ndim==1:
        df = df.reshape(1,-1)

    probs=[]
    for i in np.arange(5):
        pred=lrs[i].predict_proba(df)[:,1]
        probs.append(pred)

    if (probs[0]>thrs[0]) | (probs[1]>thrs[1]) or (probs[2]>thrs[2]) or (probs[3]>thrs[3]) or (probs[4]>thrs[4]):
        idx = probs.index(max(probs))
        flag='positive'
        [prediction]=1
    else:
        idx = probs.index(min(probs))
        flag='negative'
        prediction=0
    model = [lr1,lr2,lr3,lr4,lr5][idx]
    train = [X_train1, X_train2, X_train3, X_train4, X_train5][idx]
    explanation=explain_causes(model, train, df, flag)
    explanation

    return prediction, explanation

In [185]:
def explain_causes(model,train,data, flag):

    cols= [numerical_feats+categorical_feats]
    explainer = shap.Explainer(model, pd.DataFrame(train,columns=cols), feature_names=cols)
    shap_values = explainer(data)
    print(shap_values.values)
    causes=pd.Series(shap_values.values[0], index=[numerical_feats+categorical_feats])
    if flag=='positive':
        causes=causes[causes>0]/sum(causes[causes>0]) *100
    else:
        causes=causes[causes<0]/sum(causes[causes<0]) *100

    return causes.sort_values(ascending=False)[:5]

In [186]:
sample=pd.DataFrame(X_test).loc[0,:].values

In [187]:
type(sample)

numpy.ndarray

In [188]:
sample.ndim

1

In [182]:
sample=sample.reshape(1,-1)

In [183]:
sample

array([[ 2.        ,  2.        ,  3.        ,  7.        ,  2.        ,
         0.        ,  1.        ,  2.        , -0.04705706, -0.93469745,
         0.74493761,  1.18662634, -0.43452409,  0.42361976, -1.56832827,
        -0.66122025,  0.91086309, -0.29375983, -0.51307701,  0.17846294,
        -0.35719981, -0.31525084,  1.29392532,  0.86904454, -0.05268588,
         0.33406546, -0.9625914 ,  0.0869617 ,  0.36413614, -1.0615199 ,
         1.19868536]])

In [169]:
cols= [numerical_feats+categorical_feats]
explainer = shap.Explainer(lr1, pd.DataFrame(X_train1), feature_names=cols)
shap_values = explainer(sample)

In [189]:
pred_LR(sample)

[[ 0.02253754  0.63235836  0.11109151 -0.220322    0.31007945 -0.74403044
   0.05074084 -0.03787706  0.03346022  0.26357062 -0.22557209 -0.36820801
  -0.06111127 -0.1120337   0.42978635  0.01388441 -0.30976681  0.14202147
  -0.09864275 -0.07362509 -0.10886968 -0.24914817  0.1083467   0.03313364
   0.06109977 -0.06954012  0.25929337 -0.00940343 -0.25062276 -0.52597587
  -0.15210617]]


(0,
 MonthlyIncome         20.571197
 Gender                14.542353
 DailyRate             10.180335
 YearsInCurrentRole     8.564534
 OverTime               6.929300
 dtype: float64)

In [126]:
y_pred, exp=pred_LR(X_test)

[[ 0.02253754  0.63235836  0.11109151 -0.220322    0.31007945 -0.74403044
   0.05074084 -0.03787706  0.03346022  0.26357062 -0.22557209 -0.36820801
  -0.06111127 -0.1120337   0.42978635  0.01388441 -0.30976681  0.14202147
  -0.09864275 -0.07362509 -0.10886968 -0.24914817  0.1083467   0.03313364
   0.06109977 -0.06954012  0.25929337 -0.00940343 -0.25062276 -0.52597587
  -0.15210617]]
[[ 0.02253754 -0.28410303  0.11109151  0.07607531 -0.45554882 -0.74403044
  -0.07611126  0.01865586  0.03346022 -0.52320735  0.30429523 -0.36820801
  -0.06111127  0.14383893  0.42978635  0.01662295 -0.09904109  0.14202147
   0.0991372  -0.16896945 -0.10886968 -0.36893094  0.14558953 -0.02154902
   0.44297333 -0.06954012 -0.42090959  0.10813948 -0.89324522 -0.36658925
  -0.04647688]]
[[ 0.0178051  -0.41606135 -0.35698952 -0.21163801 -0.13989724 -0.6148587
   0.20822268 -0.1014847   0.47375838 -0.10649477  0.40670364 -0.24287586
  -0.088576   -0.1349623  -0.15065864  0.05435307  0.15159001  0.97150324
   0.58

In [117]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.94      0.68      0.79       124
           1       0.31      0.78      0.44        23

    accuracy                           0.69       147
   macro avg       0.63      0.73      0.62       147
weighted avg       0.84      0.69      0.73       147



In [118]:
print(roc_auc_score(y_test,y_pred))
print(recall_score(y_test,y_pred))

0.7300140252454419
0.782608695652174


In [119]:
exp


[MonthlyIncome         18.788907
 Gender                14.760461
 DailyRate             12.161657
 YearsInCurrentRole     8.121094
 NumCompaniesWorked     6.683259
 dtype: float64,
 OverTime                17.724575
 MonthlyIncome           13.446972
 PerformanceRating        9.691884
 YearsWithCurrManager     9.473778
 JobRole                  9.240545
 dtype: float64,
 WorkLifeBalance    18.297764
 Age                12.986888
 JobRole            12.143324
 OverTime           11.483243
 Education           9.834227
 dtype: float64,
 NumCompaniesWorked       13.593486
 MonthlyIncome            11.513985
 PerformanceRating        11.487094
 HourlyRate                9.692419
 TrainingTimesLastYear     8.552825
 dtype: float64,
 Department         25.181939
 MonthlyIncome      15.150208
 WorkLifeBalance    11.144382
 DailyRate           8.222449
 EducationField      8.143841
 dtype: float64,
 MonthlyIncome            27.443990
 Education                12.469786
 PercentSalaryHike     

In [120]:
y_pred

[0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0]

# SHAP

In [58]:
import shap

In [65]:
shap.initjs()

In [86]:
ind=1
causes=pd.Series(shap_values[ind].values, index=[numerical_feats+categorical_feats])
causes=causes[causes>0]/sum(causes[causes>0]) *100
causes.sort_values(ascending=False)[:5]

MonthlyRate                 25.036045
Department                  23.084010
HourlyRate                  10.109678
PercentSalaryHike            7.895731
RelationshipSatisfaction     7.196938
dtype: float64

In [195]:
for row in pd.DataFrame(X_test).iterrows():

    pred_LR(row[1].values)
    break

[[ 0.02253754  0.63235836  0.11109151 -0.220322    0.31007945 -0.74403044
   0.05074084 -0.03787706  0.03346022  0.26357062 -0.22557209 -0.36820801
  -0.06111127 -0.1120337   0.42978635  0.01388441 -0.30976681  0.14202147
  -0.09864275 -0.07362509 -0.10886968 -0.24914817  0.1083467   0.03313364
   0.06109977 -0.06954012  0.25929337 -0.00940343 -0.25062276 -0.52597587
  -0.15210617]]


In [197]:
with open('../assets/thresholds.txt','r') as f:
    thresholds=f.read()

In [200]:
thresholds.split(',')

['0.6479366421161046',
 '0.6412177301860313',
 '0.8482939828741717',
 '0.5532917009883962',
 '0.7277587049845653']

In [3]:
import json
with open("config.json","r") as f:
    CONFIG=json.load(f)
NUMERICAL_FEATS=CONFIG["numerical_feats"]
CATEGORICAL_FEATS=CONFIG["categorical_feats"]

sample=data.loc[14,NUMERICAL_FEATS+CATEGORICAL_FEATS].values

In [4]:
from src.inference import predict

predict(sample)

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


ValueError: Specifying the columns using strings is only supported for pandas DataFrames