In [1]:
"""
hr_attrition_clean_pipeline.py

Clean pipeline:
 - Loads hr_attrition.csv
 - Drops unwanted columns
 - Protects against leakage (keep EmployeeNumber aside, drop any feature identical to target)
 - Preprocess (impute, scale, one-hot) inside a Pipeline
 - Train/Test split (stratified)
 - Train Logistic Regression & RandomForest (class_weight='balanced')
 - Evaluate on test set
 - Retrain models on full data for final predictions for all employees
 - KMeans clustering on preprocessed full data
 - Export CSVs for Tableau and save joblib models

Requirements:
 pip install pandas scikit-learn joblib matplotlib
"""

"\nhr_attrition_clean_pipeline.py\n\nClean pipeline:\n - Loads hr_attrition.csv\n - Drops unwanted columns\n - Protects against leakage (keep EmployeeNumber aside, drop any feature identical to target)\n - Preprocess (impute, scale, one-hot) inside a Pipeline\n - Train/Test split (stratified)\n - Train Logistic Regression & RandomForest (class_weight='balanced')\n - Evaluate on test set\n - Retrain models on full data for final predictions for all employees\n - KMeans clustering on preprocessed full data\n - Export CSVs for Tableau and save joblib models\n\nRequirements:\n pip install pandas scikit-learn joblib matplotlib\n"

In [1]:
import warnings
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, average_precision_score
)
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# ---------- USER SETTINGS ----------
# Source file and the folder that receives every exported artifact.
INPUT_CSV = "hr_attrition.csv"
OUTPUT_DIR = Path("model_outputs_clean")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

RANDOM_STATE = 442  # seed passed to the split, both classifiers and KMeans
TEST_SIZE = 0.30    # share of rows held out for evaluation
N_CLUSTERS = 4      # number of KMeans clusters
# -----------------------------------

# ---------- Columns (use/drop) ----------
# Feature columns requested for modelling; the order here is the order kept downstream.
use_cols = [
    "Age",
    "Department",
    "JobRole",
    "JobLevel",
    "DistanceFromHome",
    "BusinessTravel",
    "OverTime",
    "MonthlyIncome",
    "PercentSalaryHike",
    "PerformanceRating",
    "TotalWorkingYears",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    "JobSatisfaction",
    "EnvironmentSatisfaction",
    "RelationshipSatisfaction",
    "WorkLifeBalance",
    "EducationField",
    "Education",
    "Gender",
    "MaritalStatus",
    "StockOptionLevel",
    # extra, potentially useful fields
    "TrainingTimesLastYear",
    "NumCompaniesWorked",
    "JobInvolvement",
]

# Columns removed from the raw frame right after loading.
# EmployeeNumber is deliberately NOT listed: it is kept aside as the row identifier.
drop_cols = [
    "Random Number",
    "EmployeeCount",
    "DailyRate",
    "HourlyRate",
    "MonthlyRate",
    "Over18",
    "StandardHours",
    "attrition date",
]

# ---------- Load ----------
df_raw = pd.read_csv(INPUT_CSV)
print(f"Loaded {len(df_raw)} rows from {INPUT_CSV}")

# Pick the first known identifier column that exists in the file. It is carried
# along so predictions can be re-attached to employees, but it is never a feature.
id_candidates = ["EmployeeNumber", "EmployeeId", "Employee_ID", "EmpID"]
employee_id_col = next((name for name in id_candidates if name in df_raw.columns), None)

if not employee_id_col:
    print("No employee ID column found. Predictions will not include an ID column.")
else:
    print(f"Found employee ID column: {employee_id_col}")
    # Normalise IDs to trimmed strings.
    df_raw[employee_id_col] = df_raw[employee_id_col].astype(str).str.strip()

# Drop the explicitly unwanted columns, skipping any that are absent from the file.
unwanted_present = [col for col in drop_cols if col in df_raw.columns]
df_raw.drop(columns=unwanted_present, inplace=True)

Loaded 1470 rows from hr_attrition.csv
Found employee ID column: EmployeeNumber


In [3]:
# Keep only the use_cols that exist + Attrition + employee id (if present)

# Fail fast with a clear message when the target is missing: without this guard the
# column selection below raises an opaque KeyError before any later check can run.
if "Attrition" not in df_raw.columns:
    raise ValueError("Attrition column not found.")

present_use = [c for c in use_cols if c in df_raw.columns]
cols_to_keep = ["Attrition"] + present_use
if employee_id_col:
    # keep employee id in the master dataframe (first column)
    cols_to_keep = [employee_id_col] + cols_to_keep

df = df_raw[cols_to_keep].copy()
print("Columns kept for processing:", df.columns.tolist())

Columns kept for processing: ['EmployeeNumber', 'Attrition', 'Age', 'Department', 'JobRole', 'JobLevel', 'DistanceFromHome', 'BusinessTravel', 'OverTime', 'MonthlyIncome', 'PercentSalaryHike', 'PerformanceRating', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance', 'EducationField', 'Education', 'Gender', 'MaritalStatus', 'StockOptionLevel', 'TrainingTimesLastYear', 'NumCompaniesWorked', 'JobInvolvement']


In [4]:
# ---------- Target encoding ----------
if "Attrition" not in df.columns:
    raise ValueError("Attrition column not found.")

# Drop unlabeled rows BEFORE casting to str: astype(str) turns NaN into the literal
# "nan", after which notna() is always True and missing labels would silently be
# encoded as class 0.
attrition_text = df["Attrition"].astype(str).str.strip()
has_label = df["Attrition"].notna() & attrition_text.ne("")
df = df[has_label].copy()

df["Attrition"] = df["Attrition"].astype(str).str.strip()
# 1 when the label starts with "y"/"Y" (e.g. "Yes"), otherwise 0.
df["Attrition_bin"] = df["Attrition"].str.lower().str.startswith("y").astype("int64")
print("Attrition distribution:", df["Attrition_bin"].value_counts().to_dict())


Attrition distribution: {0: 1233, 1: 237}


In [5]:
# ---------- Prevent obvious leakage ----------
# 1) Check features identical to target -> drop them
# The comparison is by VALUE, not dtype: Series.equals() also requires matching
# dtypes, so a float (0.0/1.0) or boolean copy of the target would slip through.
non_feature_cols = {"Attrition", "Attrition_bin", employee_id_col}
candidate_features = [c for c in df.columns if c not in non_feature_cols]

target_values = df["Attrition_bin"].to_numpy(dtype=float)
identical_to_target = []
for c in candidate_features:
    column = df[c]
    # Only numeric/boolean columns can reproduce the 0/1 target; text columns are skipped.
    if not pd.api.types.is_numeric_dtype(column):
        continue
    column_values = column.to_numpy(dtype=float, na_value=np.nan)
    # NaN never compares equal, so a column with missing values is not flagged.
    # The size guard keeps an empty frame from flagging every numeric column.
    if target_values.size > 0 and np.array_equal(column_values, target_values):
        identical_to_target.append(c)

if identical_to_target:
    print("Dropping features identical to the target (leakage):", identical_to_target)
    df.drop(columns=identical_to_target, inplace=True)

In [6]:
# ---------- Feature lists: adaptive detection ----------
# Each candidate list is filtered on the spot to the columns that exist in df.
numeric_candidates = [
    c for c in (
        "Age", "DistanceFromHome", "MonthlyIncome", "PercentSalaryHike",
        "TotalWorkingYears", "YearsAtCompany", "YearsInCurrentRole",
        "YearsSinceLastPromotion", "YearsWithCurrManager", "StockOptionLevel",
        "TrainingTimesLastYear", "NumCompaniesWorked",
    )
    if c in df.columns
]
ordinal_candidates = [
    c for c in (
        "PerformanceRating", "JobLevel", "Education", "JobSatisfaction",
        "EnvironmentSatisfaction", "RelationshipSatisfaction", "WorkLifeBalance",
        "JobInvolvement",
    )
    if c in df.columns
]
cat_candidates = [
    c for c in (
        "Department", "JobRole", "BusinessTravel", "OverTime",
        "EducationField", "Gender", "MaritalStatus",
    )
    if c in df.columns
]

# helper: numeric-like detection
def is_numeric_like(series, thresh=0.9):
    """Tell whether at least `thresh` of the values in `series` parse as numbers.

    Values are converted with ``pd.to_numeric(..., errors="coerce")``; anything
    that fails to parse (and any value that was already missing) counts as
    non-numeric.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.
    thresh : float, default 0.9
        Minimum fraction of parseable values required.

    Returns
    -------
    bool
        True when the parseable fraction is >= `thresh`.
    """
    parsed = pd.to_numeric(series, errors="coerce")
    parseable_share = parsed.notna().mean()
    return parseable_share >= thresh

# Run the numeric-like check once per numeric/ordinal candidate column.
numeric_like = {
    c: is_numeric_like(df[c]) for c in numeric_candidates + ordinal_candidates
}

numeric_features = [c for c in numeric_candidates if numeric_like[c]]
ordinal_features = [c for c in ordinal_candidates if numeric_like[c]]

# Candidates that do not look numeric fall back to categorical handling
# (e.g. a text-valued Education column; map it manually beforehand if an
# ordinal encoding is desired). They come first, followed by the declared
# categorical candidates.
cat_features = (
    [c for c in numeric_candidates if not numeric_like[c]]
    + [c for c in ordinal_candidates if not numeric_like[c]]
    + [
        c for c in cat_candidates
        if c not in numeric_features + ordinal_features and c in df.columns
    ]
)

# final feature list
feature_columns = numeric_features + ordinal_features + cat_features
print("Final feature sets:")
for label, members in (
    ("  numeric:", numeric_features),
    ("  ordinal:", ordinal_features),
    ("  categorical:", cat_features),
):
    print(label, members)
print("  total features used:", len(feature_columns))

Final feature sets:
  numeric: ['Age', 'DistanceFromHome', 'MonthlyIncome', 'PercentSalaryHike', 'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager', 'StockOptionLevel', 'TrainingTimesLastYear', 'NumCompaniesWorked']
  ordinal: ['PerformanceRating', 'JobLevel', 'JobSatisfaction', 'EnvironmentSatisfaction', 'RelationshipSatisfaction', 'WorkLifeBalance', 'JobInvolvement']
  categorical: ['Education', 'Department', 'JobRole', 'BusinessTravel', 'OverTime', 'EducationField', 'Gender', 'MaritalStatus']
  total features used: 27


In [7]:
# ---------- Prepare X and y and keep employee ids aligned ----------
# `ids` is positionally aligned with the rows of df; it is a placeholder Series of
# None when the file has no identifier column.
if not employee_id_col:
    ids = pd.Series([None] * len(df))
else:
    df = df.reset_index(drop=True)
    ids = df[employee_id_col].astype(str).reset_index(drop=True)

X = df.loc[:, feature_columns].copy()
y = df["Attrition_bin"].copy()

# ---------- Train/Test split (stratified) ----------
# Splitting ids together with X and y keeps all three row-aligned.
split_parts = train_test_split(
    X,
    y,
    ids,
    stratify=y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test, ids_train, ids_test = split_parts
print("Train/test sizes:", X_train.shape[0], X_test.shape[0])

Train/test sizes: 1029 441


In [9]:
# Display the ordered feature columns of X as a quick sanity check (no state is changed).
X.columns

Index(['Age', 'DistanceFromHome', 'MonthlyIncome', 'PercentSalaryHike',
       'TotalWorkingYears', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager', 'StockOptionLevel',
       'TrainingTimesLastYear', 'NumCompaniesWorked', 'PerformanceRating',
       'JobLevel', 'JobSatisfaction', 'EnvironmentSatisfaction',
       'RelationshipSatisfaction', 'WorkLifeBalance', 'JobInvolvement',
       'Education', 'Department', 'JobRole', 'BusinessTravel', 'OverTime',
       'EducationField', 'Gender', 'MaritalStatus'],
      dtype='object')

In [12]:
# ---------- Preprocessing pipelines (fit only on train inside pipeline) ----------
# Numeric columns: median imputation, then standardisation.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Ordinal (integer-coded) columns: most-frequent imputation, then standardisation.
ordinal_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the literal "Missing", then dense one-hot
# encoding; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Route each feature group to its transformer; unlisted columns are dropped and
# the output is always a dense array (sparse_threshold=0).
column_blocks = [
    ("num", numeric_transformer, numeric_features),
    ("ord", ordinal_transformer, ordinal_features),
    ("cat", categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(column_blocks, remainder="drop", sparse_threshold=0)


In [13]:

# helper to get feature names after transformation
def get_feature_names_from_column_transformer(ct: ColumnTransformer):
    """Return the output feature names of a fitted ColumnTransformer, in output order.

    For a sub-pipeline containing a step named ``"ohe"`` the names are the expanded
    one-hot names; for every other transformer the input column names are passed
    through unchanged (i.e. the transformer is assumed to keep one output column
    per input column).

    Parameters
    ----------
    ct : ColumnTransformer
        A fitted transformer (``transformers_`` must exist).

    Returns
    -------
    list
        One name per output column.

    Raises
    ------
    AttributeError
        If ``ct`` or its one-hot encoder has not been fitted. Errors are allowed to
        propagate so the caller can fall back to generic names, instead of
        receiving a list whose length does not match the transformed matrix.
    """
    names = []
    for name, transformer, cols in ct.transformers_:
        # The remainder entry and explicitly dropped groups are not named here.
        if name == "remainder":
            continue
        if isinstance(transformer, str) and transformer == "drop":
            continue

        # A bare string selects a single column; list("Age") would split it into characters.
        cols_list = [cols] if isinstance(cols, str) else list(cols)

        if isinstance(transformer, Pipeline) and "ohe" in transformer.named_steps:
            ohe = transformer.named_steps["ohe"]
            try:
                names.extend(list(ohe.get_feature_names_out(cols_list)))
            except AttributeError:
                # Encoder without get_feature_names_out: rebuild "<column>_<category>"
                # names from the fitted categories so the length still matches.
                for col, categories in zip(cols_list, ohe.categories_):
                    names.extend(f"{col}_{category}" for category in categories)
        else:
            names.extend(cols_list)
    return names

In [14]:
# ---------- Build pipelines ----------
# Each pipeline receives its OWN unfitted copy of the preprocessor. Pipeline does
# not copy its steps, so sharing one ColumnTransformer instance would make fitting
# the second pipeline silently refit the preprocessor used by the first.
pipe_lr = Pipeline(steps=[
    ("preproc", clone(preprocessor)),
    ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE))
])

pipe_rf = Pipeline(steps=[
    ("preproc", clone(preprocessor)),
    ("clf", RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=RANDOM_STATE))
])

# ---------- Train on train set ----------
# Fit both pipelines (preprocessing + classifier) on the training rows only.
for model_label, model_pipeline in (
    ("Logistic Regression", pipe_lr),
    ("Random Forest", pipe_rf),
):
    print(f"Training {model_label} on train set...")
    model_pipeline.fit(X_train, y_train)

# ---------- Evaluate on test set ----------
def _score_on_test(pipeline):
    """Score a fitted pipeline on the held-out test rows.

    Returns a tuple ``(predictions, positive_class_probabilities, metrics)`` where
    ``metrics`` maps metric name to value. Threshold metrics use the hard
    predictions; roc_auc and pr_auc use the positive-class probabilities.
    """
    predicted = pipeline.predict(X_test)
    positive_proba = pipeline.predict_proba(X_test)[:, 1]
    metrics = {
        "accuracy": accuracy_score(y_test, predicted),
        "precision": precision_score(y_test, predicted, zero_division=0),
        "recall": recall_score(y_test, predicted, zero_division=0),
        "f1": f1_score(y_test, predicted, zero_division=0),
        "roc_auc": roc_auc_score(y_test, positive_proba),
        "pr_auc": average_precision_score(y_test, positive_proba),
    }
    return predicted, positive_proba, metrics


pred_lr, proba_lr, lr_metrics = _score_on_test(pipe_lr)
pred_rf, proba_rf, rf_metrics = _score_on_test(pipe_rf)

print("Logistic Regression (test):", lr_metrics)
print("Random Forest (test):", rf_metrics)


Training Logistic Regression on train set...
Training Random Forest on train set...
Logistic Regression (test): {'accuracy': 0.7437641723356009, 'precision': 0.3618421052631579, 'recall': 0.7746478873239436, 'f1': 0.49327354260089684, 'roc_auc': 0.8044156832889227, 'pr_auc': 0.6470101455104612}
Random Forest (test): {'accuracy': 0.8616780045351474, 'precision': 0.8125, 'recall': 0.18309859154929578, 'f1': 0.2988505747126437, 'roc_auc': 0.8061857632280167, 'pr_auc': 0.5665101897464231}


In [15]:

# save test evaluation
metrics_table = pd.DataFrame([lr_metrics, rf_metrics], index=["LogisticRegression", "RandomForest"])
metrics_table.to_csv(OUTPUT_DIR / "test_metrics.csv")

# ---------- Feature importances & coefficients mapping (use preprocessor fitted on train) ----------
# Nothing is refitted here: the preprocessor step of the already-trained RF pipeline
# supplies the post-transformation feature names.
preprocessor_fitted = pipe_rf.named_steps["preproc"]
try:
    feat_names_post = get_feature_names_from_column_transformer(preprocessor_fitted)
except Exception:
    # Fall back to generic names f0, f1, ... sized to the classifier's input width.
    n_transformed = pipe_rf.named_steps["clf"].n_features_in_
    feat_names_post = [f"f{i}" for i in range(n_transformed)]

# Logistic coefficients, ordered by absolute magnitude (largest first)
try:
    lr_coefs = pipe_lr.named_steps["clf"].coef_[0]
    df_lr_coef = (
        pd.DataFrame({"feature": feat_names_post, "coefficient": lr_coefs})
        .assign(abs_coef=lambda frame: frame["coefficient"].abs())
        .sort_values(by="abs_coef", ascending=False)
        .drop(columns="abs_coef")
    )
    df_lr_coef.to_csv(OUTPUT_DIR / "logistic_coefficients.csv", index=False)
    print("Wrote logistic_coefficients.csv")
except Exception as e:
    print("Could not map LR coefficients:", e)

# Random Forest feature importance, largest first
try:
    rf_importances = pipe_rf.named_steps["clf"].feature_importances_
    df_rf_feat = pd.DataFrame(
        {"feature": feat_names_post, "importance": rf_importances}
    ).sort_values(by="importance", ascending=False)
    df_rf_feat.to_csv(OUTPUT_DIR / "rf_feature_importance.csv", index=False)
    print("Wrote rf_feature_importance.csv")
except Exception as e:
    print("Could not map RF importances:", e)

# Save models trained on train data (optional)
joblib.dump(pipe_lr, OUTPUT_DIR / "logistic_model_train.joblib")
joblib.dump(pipe_rf, OUTPUT_DIR / "random_forest_model_train.joblib")

Wrote logistic_coefficients.csv
Wrote rf_feature_importance.csv


['model_outputs_clean/random_forest_model_train.joblib']

In [16]:


# ---------- Retrain on FULL data for final predictions for all employees ----------
print("Retraining models on full dataset for final scoring...")

# Fresh, unfitted preprocessor template for the full-data models (same column
# routing as the train-time preprocessor, under its own variable name).
preprocessor_full = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("ord", ordinal_transformer, ordinal_features),
    ("cat", categorical_transformer, cat_features)
], remainder="drop", sparse_threshold=0)

# Each pipeline gets its own clone of the template: Pipeline does not copy its
# steps, so a shared instance would be refitted by whichever pipeline is fitted last.
pipe_lr_full = Pipeline(steps=[("preproc", clone(preprocessor_full)),
                               ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE))])
pipe_rf_full = Pipeline(steps=[("preproc", clone(preprocessor_full)),
                               ("clf", RandomForestClassifier(n_estimators=200, class_weight="balanced", random_state=RANDOM_STATE))])

pipe_lr_full.fit(X, y)
pipe_rf_full.fit(X, y)

# Score every employee with the full-data models.
# NOTE(review): these are in-sample scores (the pipelines were fitted on this same X),
# so they are not an estimate of out-of-sample performance.
pred_lr_all = pipe_lr_full.predict(X)
proba_lr_all = pipe_lr_full.predict_proba(X)[:, 1]

pred_rf_all = pipe_rf_full.predict(X)
proba_rf_all = pipe_rf_full.predict_proba(X)[:, 1]

# Attach predictions positionally to a copy of the master frame.
df_all = df.reset_index(drop=True).copy()
if employee_id_col:
    df_all[employee_id_col] = ids.values

prediction_columns = {
    "Predicted_Attrition_LR": pred_lr_all,
    "Attrition_Probability_LR": proba_lr_all,
    "Predicted_Attrition_RF": pred_rf_all,
    "Attrition_Probability_RF": proba_rf_all,
}
for column_name, column_values in prediction_columns.items():
    df_all[column_name] = column_values

# Human-readable Yes/No versions of the hard predictions.
yes_no = {1: "Yes", 0: "No"}
for model_tag in ("LR", "RF"):
    df_all[f"Predicted_Attrition_{model_tag}_label"] = df_all[f"Predicted_Attrition_{model_tag}"].map(yes_no)

out_all = OUTPUT_DIR / "attrition_predictions_all_employees.csv"
df_all.to_csv(out_all, index=False)
print("Wrote full predictions to:", out_all.resolve())

# Save final models (pipelines fitted on full data)
joblib.dump(pipe_lr_full, OUTPUT_DIR / "logistic_model_full.joblib")
joblib.dump(pipe_rf_full, OUTPUT_DIR / "random_forest_model_full.joblib")



Retraining models on full dataset for final scoring...
Wrote full predictions to: /Users/ghazalayobi/portfolio_projects/hr_attrition/model_outputs_clean/attrition_predictions_all_employees.csv


['model_outputs_clean/random_forest_model_full.joblib']

In [17]:
# ---------- Clustering on full preprocessed data ----------
print("Performing KMeans clustering on all employees (preprocessed features)...")
# Reuse the preprocessor fitted inside the full-data RF pipeline; the result is a dense array.
fitted_preproc = pipe_rf_full.named_steps["preproc"]
X_all_prep = fitted_preproc.transform(X)
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=20, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(X_all_prep)

df_all["Cluster"] = clusters
df_all.to_csv(OUTPUT_DIR / "attrition_predictions_with_clusters.csv", index=False)
print("Wrote attrition_predictions_with_clusters.csv with cluster labels")

# Cluster profiles: per-cluster mean of the numeric/ordinal features plus the cluster size
profile_cols = numeric_features + ordinal_features
existing_profile_cols = [col for col in profile_cols if col in df_all.columns]
by_cluster = df_all.groupby("Cluster")
cluster_profile = by_cluster[existing_profile_cols].mean().round(3)
cluster_profile["count"] = by_cluster.size()
cluster_profile.to_csv(OUTPUT_DIR / "cluster_profiles.csv")
print("Wrote cluster_profiles.csv")

# ---------- Confusion/Classification report on test set ----------
# Both exports describe the Random Forest trained on the train split.
rf_report = classification_report(y_test, pred_rf, output_dict=True)
pd.DataFrame(rf_report).transpose().to_csv(OUTPUT_DIR / "rf_classification_report_test.csv")

rf_confusion = pd.DataFrame(
    confusion_matrix(y_test, pred_rf),
    index=["Actual_0", "Actual_1"],
    columns=["Pred_0", "Pred_1"],
)
rf_confusion.to_csv(OUTPUT_DIR / "rf_confusion_matrix_test.csv")
print("Wrote rf classification report and confusion matrix for test set")

print("All outputs written to", OUTPUT_DIR.resolve())

Performing KMeans clustering on all employees (preprocessed features)...
Wrote attrition_predictions_with_clusters.csv with cluster labels
Wrote cluster_profiles.csv
Wrote rf classification report and confusion matrix for test set
All outputs written to /Users/ghazalayobi/portfolio_projects/hr_attrition/model_outputs_clean


In [19]:
# Create a final dataframe with all features and predictions
df_final = df.reset_index(drop=True).copy()

# Re-attach the employee identifier from `ids`, which is row-aligned with df.
# Using the detected column name (instead of a hard-coded "EmployeeNumber" looked up
# in df_raw) avoids a KeyError when the ID column has another name or is absent, and
# avoids misaligned IDs whenever df holds fewer rows than df_raw.
if employee_id_col:
    df_final[employee_id_col] = ids.to_numpy()

# Add model predictions
df_final["Predicted_Attrition_LR"] = pred_lr_all
df_final["Attrition_Probability_LR"] = proba_lr_all
df_final["Predicted_Attrition_RF"] = pred_rf_all
df_final["Attrition_Probability_RF"] = proba_rf_all

# Add cluster labels if you did clustering
df_final["Cluster"] = clusters

# Optional: create human-readable labels
df_final["Predicted_Attrition_LR_label"] = df_final["Predicted_Attrition_LR"].map({1: "Yes", 0: "No"})
df_final["Predicted_Attrition_RF_label"] = df_final["Predicted_Attrition_RF"].map({1: "Yes", 0: "No"})

# Save for Tableau (same path as the earlier all-employees export, so that file is overwritten)
output_all_path = OUTPUT_DIR / "attrition_predictions_all_employees.csv"
df_final.to_csv(output_all_path, index=False)
print(f"✅ Final predictions saved: {output_all_path.resolve()} (rows: {len(df_final)})")

✅ Final predictions saved: /Users/ghazalayobi/portfolio_projects/hr_attrition/model_outputs_clean/attrition_predictions_all_employees.csv (rows: 1470)
