In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import sys
import os

# Put the parent of the notebook's working directory on the import path so the
# project-local `src` package imported below can be resolved. The membership
# check keeps re-running this cell from appending the same entry twice.
root_path = os.path.abspath("..")
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)

from src.data_preprocessing import clean_data
from src.feature_engineering import add_features
from src.modeling import train_model
from src.evaluation import evaluate_classification
import joblib


1. Load Cleaned Dataset

In [2]:
# Read the HR capstone CSV; the path is relative to the notebook's working
# directory. NOTE(review): the preview shows an "Unnamed: 0" column, which
# looks like a row index that was written into the file when it was saved.
df = pd.read_csv(filepath_or_buffer="../data/HR_capstone_dataset.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


2. Preprocessing

In [3]:
# Run the project's cleaning step on the loaded frame, then derive additional
# features from the cleaned result. Both helpers come from the local `src`
# package; their exact transformations are defined there.
df_clean = clean_data(df)
df_fe = add_features(df_clean)

# Preview the engineered frame. NOTE(review): the "Unnamed: 0" column from the
# CSV is still present at this stage (see the preview output).
df_fe.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,hours_per_project,long_tenure,...,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium,satisfaction_bucket_medium,satisfaction_bucket_high
0,0.38,0.53,2,157,3,0,1,0,78.5,0,...,False,False,False,True,False,False,True,False,True,False
1,0.8,0.86,5,262,6,0,1,0,52.4,1,...,False,False,False,True,False,False,False,True,False,True
2,0.11,0.88,7,272,4,0,1,0,38.857143,1,...,False,False,False,True,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,44.6,1,...,False,False,False,True,False,False,True,False,False,True
4,0.37,0.52,2,159,3,0,1,0,79.5,0,...,False,False,False,True,False,False,True,False,True,False


3. Train-test split

In [4]:
# Columns excluded from the feature matrix:
#   - "left": the target label.
#   - "Unnamed: 0": the row index that came in with the CSV. It identifies a
#     row, not an employee attribute, so training on it risks leaking file
#     order into the model and yields a feature that has no meaning at
#     prediction time.
# errors="ignore" keeps this cell working if the index column is absent (for
# example when the CSV is read with index_col=0); a missing "left" column
# still fails loudly on the next line.
X = df_fe.drop(columns=["left", "Unnamed: 0"], errors="ignore")
y = df_fe["left"].astype(int).values  # integer NumPy array of labels

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [5]:
# Sanity-check the training labels: container type, shape, and the first ten
# values, one item per output line.
print(type(y_train), y_train.shape, y_train[:10], sep="\n")

<class 'numpy.ndarray'>
(9592,)
[0 0 0 0 0 0 0 0 0 0]


4. Train model

In [6]:
# Fit the model on the training partition through the project's `train_model`
# helper (imported from src.modeling). No estimator type or hyperparameters are
# passed here, so whatever is used is decided inside that helper.
model = train_model(X_train, y_train)


5. Evaluate

In [7]:
# Score the fitted model on the held-out test partition with the project's
# `evaluate_classification` helper. Per the cell output, it prints accuracy,
# recall, and a per-class classification report.
evaluate_classification(model, X_test, y_test)

Model Performance:
Accuracy: 0.9850
Recall:   0.9246

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2001
           1       0.98      0.92      0.95       398

    accuracy                           0.98      2399
   macro avg       0.98      0.96      0.97      2399
weighted avg       0.98      0.98      0.98      2399



6. Save model

In [8]:
# Persist the trained model with joblib. joblib.dump does not create missing
# parent directories, so make sure the target folder exists first; otherwise a
# fresh checkout without a models/ directory fails on this cell.
model_path = "../models/attrition_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Last expression of the cell: joblib.dump returns the list of written file
# paths, which the notebook displays as the cell output.
joblib.dump(model, model_path)

['../models/attrition_model.pkl']