In [2]:
import pandas as pd

In [29]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

In [5]:
from sklearn import set_config
set_config(print_changed_only=False)

In [98]:
data = pd.read_csv('cleanOS.csv')

In [144]:
data['resign'] = data.STATUS.map({"ACTIVE" : 0, 'TERMINATED' : 1})

In [145]:
x = data.drop(columns=['STATUS','resign'])
y = data['resign']

In [146]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,train_size = 0.8, random_state = 42, stratify = y)

In [147]:
xtrain.head()

Unnamed: 0,length_of_service,department_name,job_title,gender_full,BUSINESS_UNIT
95245,5,Dairy,Dairy Person,Male,STORES
28301,5,Dairy,Dairy Person,Female,STORES
6214,23,Produce,Produce Clerk,Male,STORES
64228,13,Produce,Produce Clerk,Female,STORES
17580,10,Customer Service,Cashier,Female,STORES


In [148]:
cat_one = ['department_name','job_title','BUSINESS_UNIT','gender_full']
rest = ['length_of_service']

In [149]:
one_pipe = Pipeline([('onehot', OneHotEncoder())])

preprocessor = ColumnTransformer([
    ('onehoten', one_pipe, cat_one),
    ('defa', 'passthrough', rest)])

# Logistic Regression

In [150]:
pipeLR = Pipeline([
    ("prep", preprocessor),
    ("model", LogisticRegression())
])

In [151]:
pipeLR.fit(xtrain,ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehoten',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                                                                 sparse=True))],
                                                    

In [152]:
ybaseLR = pipeLR.predict(xtest)

In [156]:
label = [0,1]

In [157]:
print(classification_report(ytest, ybaseLR))

              precision    recall  f1-score   support

           0       0.67      0.63      0.65      9634
           1       0.65      0.68      0.67      9634

    accuracy                           0.66     19268
   macro avg       0.66      0.66      0.66     19268
weighted avg       0.66      0.66      0.66     19268



In [158]:
pd.DataFrame(confusion_matrix(ytest,ybaseLR,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,6055,3579
1,3046,6588


In [159]:
pipeLR.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('onehoten',
                                    Pipeline(memory=None,
                                             steps=[('onehot',
                                                     OneHotEncoder(categories='auto',
                                                                   drop=None,
                                                                   dtype=<class 'numpy.float64'>,
                                                                   handle_unknown='error',
                                                                   sparse=True))],
                                             verbose=False),
                                    ['department_name', 'job_title',
                                     'BUSINESS_UNIT', 'gender_full']),
                                   (

In [None]:
LogisticRegression()

In [192]:
paramLR = {
    'model__C': [1.0, 0.1,0.08],
    'model__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    'model__penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'model__max_iter': [100, 600, 1000],
}

In [193]:
gridLR = GridSearchCV(estimator=pipeLR, param_grid=paramLR, verbose=1, n_jobs=-1, cv=4,scoring='recall')

In [194]:
gridLR.fit(xtrain,ytrain)

Fitting 4 folds for each of 180 candidates, totalling 720 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:  8.0min
[Parallel(n_jobs=-1)]: Done 720 out of 720 | elapsed: 14.4min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('prep',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehoten',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('onehot',
                                                                                          OneHotEncoder(categories='auto',
                                                                                                        drop=None,
                                          

In [195]:
gridLR.best_params_

{'model__C': 1.0,
 'model__max_iter': 100,
 'model__penalty': 'l1',
 'model__solver': 'liblinear'}

In [196]:
bestLR = gridLR.best_estimator_

In [197]:
ygridLR = bestLR.predict(xtest)

In [198]:
print(classification_report(ytest, ygridLR))

              precision    recall  f1-score   support

           0       0.66      0.62      0.64      9634
           1       0.65      0.68      0.66      9634

    accuracy                           0.65     19268
   macro avg       0.65      0.65      0.65     19268
weighted avg       0.65      0.65      0.65     19268



In [199]:
pd.DataFrame(confusion_matrix(ytest,ygridLR,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,6019,3615
1,3046,6588


# DECISION TREE

In [248]:
pipeDT = Pipeline([
    ("prep", preprocessor),
    ("model", DecisionTreeClassifier())
])

In [204]:
pipeDT.fit(xtrain,ytrain)

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehoten',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                                                                 sparse=True))],
                                                    

In [205]:
ybaseDT = pipeDT.predict(xtest)

In [206]:
print(classification_report(ytest, ybaseDT))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      9634
           1       0.90      0.84      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



In [208]:
pd.DataFrame(confusion_matrix(ytest,ybaseDT,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,8686,948
1,1498,8136


In [209]:
pipeDT.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('onehoten',
                                    Pipeline(memory=None,
                                             steps=[('onehot',
                                                     OneHotEncoder(categories='auto',
                                                                   drop=None,
                                                                   dtype=<class 'numpy.float64'>,
                                                                   handle_unknown='error',
                                                                   sparse=True))],
                                             verbose=False),
                                    ['department_name', 'job_title',
                                     'BUSINESS_UNIT', 'gender_full']),
                                   (

In [286]:
paramDT = {
    'model__max_depth': [None,100,10],
    'model__max_features': [None,100,10],
    'model__min_samples_leaf': [1],
    'model__min_samples_split': [2]
}

In [287]:
gridDT = GridSearchCV(estimator=pipeDT, param_grid=paramDT, verbose=1, n_jobs=-1, cv=4,scoring='recall')

In [288]:
gridDT.fit(xtrain,ytrain)

Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    2.6s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('prep',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehoten',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('onehot',
                                                                                          OneHotEncoder(categories='auto',
                                                                                                        drop=None,
                                          

In [289]:
gridDT.best_params_

{'model__max_depth': 10,
 'model__max_features': None,
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2}

In [290]:
bestDT = gridDT.best_estimator_

In [291]:
ygridDT = bestDT.predict(xtest)

In [292]:
print(classification_report(ytest, ygridDT))

              precision    recall  f1-score   support

           0       0.85      0.89      0.87      9634
           1       0.89      0.85      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



In [285]:
pd.DataFrame(confusion_matrix(ytest,ygridDT,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,8591,1043
1,1460,8174


# XGBoost

In [247]:
pipeXG = Pipeline([
    ("prep", preprocessor),
    ("model", XGBClassifier())
])

In [249]:
pipeXG.fit(xtrain,ytrain)

Pipeline(memory=None,
         steps=[('prep',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehoten',
                                                  Pipeline(memory=None,
                                                           steps=[('onehot',
                                                                   OneHotEncoder(categories='auto',
                                                                                 drop=None,
                                                                                 dtype=<class 'numpy.float64'>,
                                                                                 handle_unknown='error',
                                                                                 sparse=True))],
                                                    

In [251]:
ybaseXG = pipeXG.predict(xtest)

In [252]:
print(classification_report(ytest, ybaseXG))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      9634
           1       0.90      0.84      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



In [253]:
pd.DataFrame(confusion_matrix(ytest,ybaseXG,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,8709,925
1,1535,8099


In [254]:
pipeXG.get_params()

{'memory': None,
 'steps': [('prep',
   ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                     transformer_weights=None,
                     transformers=[('onehoten',
                                    Pipeline(memory=None,
                                             steps=[('onehot',
                                                     OneHotEncoder(categories='auto',
                                                                   drop=None,
                                                                   dtype=<class 'numpy.float64'>,
                                                                   handle_unknown='error',
                                                                   sparse=True))],
                                             verbose=False),
                                    ['department_name', 'job_title',
                                     'BUSINESS_UNIT', 'gender_full']),
                                   (

In [272]:
paramXG = {
    'model__gamma': [10,13],
    'model__max_depth': [9,10,11],
    'model__learning_rate': [0.300000012,0.2,0.4],
}

In [273]:
gridXG = GridSearchCV(estimator=pipeXG, param_grid=paramXG, verbose=1, n_jobs=-1, cv=4,scoring='recall')

In [274]:
gridXG.fit(xtrain,ytrain)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   51.1s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('prep',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('onehoten',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('onehot',
                                                                                          OneHotEncoder(categories='auto',
                                                                                                        drop=None,
                                          

In [275]:
gridXG.best_params_

{'model__gamma': 10,
 'model__learning_rate': 0.300000012,
 'model__max_depth': 9}

In [276]:
bestXG = gridXG.best_estimator_

In [277]:
ygridXG = bestXG.predict(xtest)

In [278]:
print(classification_report(ytest, ygridXG))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      9634
           1       0.89      0.85      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



In [279]:
pd.DataFrame(confusion_matrix(ytest,ygridXG,labels=label),columns=label,index=label)

Unnamed: 0,0,1
0,8649,985
1,1477,8157


# Perbandingan Model

In [300]:
print('Logistic Regression')
pd.DataFrame(confusion_matrix(ytest,ygridLR,labels=label),columns=label,index=label)

Logistic Regression


Unnamed: 0,0,1
0,6019,3615
1,3046,6588


In [301]:
print('Decision Tree')
pd.DataFrame(confusion_matrix(ytest,ygridDT,labels=label),columns=label,index=label)

Decision Tree


Unnamed: 0,0,1
0,8591,1043
1,1460,8174


In [302]:
print('XGBoost')
pd.DataFrame(confusion_matrix(ytest,ygridXG,labels=label),columns=label,index=label)

XGBoost


Unnamed: 0,0,1
0,8649,985
1,1477,8157


In [297]:
print('Logistic Regression')
print(classification_report(ytest, ygridLR))

Logistic Regression
              precision    recall  f1-score   support

           0       0.66      0.62      0.64      9634
           1       0.65      0.68      0.66      9634

    accuracy                           0.65     19268
   macro avg       0.65      0.65      0.65     19268
weighted avg       0.65      0.65      0.65     19268



In [298]:
print('Decision Tree')
print(classification_report(ytest, ygridDT))

Decision Tree
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      9634
           1       0.89      0.85      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



In [299]:
print('XGBoost')
print(classification_report(ytest, ygridXG))

XGBoost
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      9634
           1       0.89      0.85      0.87      9634

    accuracy                           0.87     19268
   macro avg       0.87      0.87      0.87     19268
weighted avg       0.87      0.87      0.87     19268



# Kesimpulan

1. Kesimpulan Model:

- Tujuan dari model adalah untuk memprediksi karyawan apakah dia akan mengajukan resign atau tidak.
- Setelah dilakukan evaluasi terhadap ketiga model diatas, dapat dilihat bahwa model Decision Tree lebih cocok untuk permasalahan ini
- Score yang digunakan dalam menentukan best model adalah recall karena fokus untuk mendapatkan tebakan yang benar TP dibandingkan dengan seluruh data Positif yang ada.
- Meskipun score model Decision Tree sama dengan XGBoost, tetapi dapat dilihat pada confussion matrix, False Negatif untuk Decision Tree lebih rendah
- False Negatif adalah karyawan yang aktualnya akan mengajukan resign namun di prediksi tidak mengajikan resign, jika nilai FN besar maka perusahaan akan mendapatkan kerugian karena tidak dapat mengantisipasi karyawan yang akan mengajukan resign tersebut.

2. Kesimpulan EDA:

- Karyawan paling banyak mengajukan resign pada posisi Cashier dan Shelf Stocker
- Karyawan yang berada di 98% karyawannya HEADOFFICE cenderung berhenti karena sudah memasuki pensiun, sedangkan di STORES 30% karyawan berhenti karena mengajukan resign
- Karyawan dalam department Meats paling banyak mengajukan pemberhentian dibandingkan departmen lain

# Saran

- Perusahaan memperketat peraturan resign untuk posisi tertentu, seperti Cashier dan Shelf Stocker
- Karyawan HEADOFFICE cenderung lebih sedikit yang mengajukan resign dibanding STORES bisa sehingga dibutuhkan perhatian lebih kepada karyawan yang bekerja di unit STORES
- Karyawan di department Meats paling banyak mengajukan resign dengan alasan retirement. Sehingga perlu diperhatikan usia karyawan pada departmen Meats untuk mengantisipasi karyawan lain yang akan mengajikan resign - retirement.

In [303]:
import joblib

In [305]:
joblib.dump(bestDT, "DecTreeEmployee")

['DecTreeEmployee']