In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.preprocessing import LabelEncoder
# import pickle

# # ────────────────────────── Load Dataset ──────────────────────────
# df = pd.read_csv("turnover.csv", encoding="ISO-8859-1")
# print("Initial Shape:", df.shape)

# # ────────────────────────── Drop/Replace Columns ──────────────────────────
# # Drop irrelevant or high-NaN columns (customize based on EDA)
# df.dropna(axis=1, thresh=len(df) * 0.5, inplace=True)  # Drop cols with >50% NaNs

# # ────────────────────────── Fill Missing Values ──────────────────────────
# for col in df.select_dtypes(include="number").columns:
#     df[col].fillna(df[col].median(), inplace=True)

# for col in df.select_dtypes(include="object").columns:
#     df[col].fillna(df[col].mode()[0], inplace=True)

# # ────────────────────────── Label Encoding ──────────────────────────
# categorical_cols = df.select_dtypes(include="object").columns
# label_encoders = {}

# for col in categorical_cols:
#     le = LabelEncoder()
#     df[col] = le.fit_transform(df[col])
#     label_encoders[col] = le

# # ────────────────────────── Feature Engineering (Optional) ──────────────────────────
# if 'stag' in df.columns and 'age' in df.columns:
#     df["tenure_years"] = df["stag"] / 12
#     df["tenure_age_ratio"] = df["stag"] / (df["age"] + 1)

# # ────────────────────────── Target & Features ──────────────────────────
# target_col = "event"  # Assuming this is your target
# if target_col not in df.columns:
#     raise ValueError("❌ 'event' column not found in dataset!")

# X = df.drop(columns=[target_col])
# y = df[target_col]

# # ────────────────────────── Train/Test Split ──────────────────────────
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # ────────────────────────── Model Training ──────────────────────────
# model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
# model.fit(X_train, y_train)

# # ────────────────────────── Evaluation ──────────────────────────
# y_pred = model.predict(X_test)
# print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# print("Classification Report:\n", classification_report(y_test, y_pred))

# # ────────────────────────── Save Model ──────────────────────────
# with open("employee_turnover_optimized.pkl", "wb") as f:
#     pickle.dump(model, f)

# # ────────────────────────── Save Label Encoders (if needed) ──────────────────────────
# with open("label_encoders.pkl", "wb") as f:
#     pickle.dump(label_encoders, f)

# print("✅ Model and encoders saved successfully.")

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pickle

# ───────────────────── Load & Clean ─────────────────────
df = pd.read_csv("turnover.csv", encoding="latin1")
df.drop(columns=["greywage"], inplace=True)  # Remove bad wage column

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include="object").columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# ───────────────────── Feature Engineering ─────────────────────
df["tenure_years"] = df["stag"] / 12
df["tenure_age_ratio"] = df["stag"] / (df["age"] + 1)
df["age_group"] = pd.cut(df["age"], bins=[18,25,35,45,60,100], labels=[0,1,2,3,4])

# ───────────────────── Prepare Data ─────────────────────
X = df.drop(columns=["event"])
y = df["event"]

# Handle missing age_group
X["age_group"] = X["age_group"].astype("float").fillna(0).astype("int")

# ───────────────────── Balance Classes ─────────────────────
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# ───────────────────── Train Model ─────────────────────
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# ───────────────────── Evaluate ─────────────────────
print("Classification Report:\n", classification_report(y_test, model.predict(X_test)))

# ───────────────────── Save ─────────────────────
with open("employee_turnover_optimized.pkl", "wb") as f:
    pickle.dump(model, f)

print("✅ Model saved successfully.")

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.71      0.68       108
           1       0.72      0.65      0.68       121

    accuracy                           0.68       229
   macro avg       0.68      0.68      0.68       229
weighted avg       0.68      0.68      0.68       229

✅ Model saved successfully.
