In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, classification_report
)
from xgboost import XGBRegressor, XGBClassifier
import joblib
from sklearn.preprocessing import LabelEncoder

In [2]:
# ======== LOAD DATA ========
df = pd.read_csv("dataset/employee_data.csv")

In [3]:
# drop embedding if it exists but is blank
if "communication_embedding_vector" in df.columns:
    df = df.drop(columns=["communication_embedding_vector"])

In [4]:
# ======== CONFIG ========
target_cols = [col for col in df.columns if col in [
    "performance_score", "burnout_risk_score",
    "employee_engagement_score", "attrition_risk"
]]

In [5]:
feature_cols = [c for c in df.columns if c not in target_cols]
X = df[feature_cols]

In [6]:
# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
from math import sqrt

def train_regression_model(y, y_name):
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )

    model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        tree_method="hist"
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    rmse = sqrt(mean_squared_error(y_test, y_pred))  # ✅ manual RMSE computation
    r2 = r2_score(y_test, y_pred)
    print(f"\n--- {y_name.upper()} ---")
    print(f"MAE: {mae:.3f}  RMSE: {rmse:.3f}  R²: {r2:.3f}")

    joblib.dump(model, f"model_{y_name}.pkl")
    return model

In [8]:
def train_classification_model(y, y_name):
    # if numeric continuous probabilities, binarize into categories
    if y.dtype in [np.float64, np.float32]:
        y = pd.cut(
            y, bins=[-0.01, 0.33, 0.66, 1.0],
            labels=["low", "medium", "high"]
        )

    # Encode categorical labels as integers for XGBoost
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y_encoded, test_size=0.2, random_state=42
    )

    model = XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        tree_method="hist"
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Decode for human-readable classification report
    print(f"\n--- {y_name.upper()} ---")
    print(classification_report(
        le.inverse_transform(y_test),
        le.inverse_transform(y_pred)
    ))

    # Save both model and label encoder
    joblib.dump(model, f"model_{y_name}.pkl")
    joblib.dump(le, f"label_encoder_{y_name}.pkl")
    return model

In [9]:
# ======== TRAIN MODELS FOR EACH Y ========
trained_models = {}

for target in target_cols:
    y = df[target]
    if y.nunique() <= 5 or y.dtype == 'object':  # likely categorical
        trained_models[target] = train_classification_model(y, target)
    elif 'risk' in target.lower():  # burnout_risk or attrition_risk → classification
        trained_models[target] = train_classification_model(y, target)
    else:
        trained_models[target] = train_regression_model(y, target)


--- PERFORMANCE_SCORE ---
MAE: 0.028  RMSE: 0.034  R²: 0.757

--- BURNOUT_RISK_SCORE ---
              precision    recall  f1-score   support

        high       0.00      0.00      0.00         2
         low       0.72      0.67      0.69       117
      medium       0.86      0.89      0.87       281

    accuracy                           0.82       400
   macro avg       0.52      0.52      0.52       400
weighted avg       0.81      0.82      0.82       400



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [10]:
# ======== SAVE SCALER ========
joblib.dump(scaler, "feature_scaler.pkl")
print("\n✅ All models trained and saved successfully.")


✅ All models trained and saved successfully.
