In [19]:
# Colab-only: mount Google Drive so the /content/drive/... paths used in this
# cell (split CSVs and the model output directory) resolve.
from google.colab import drive
drive.mount('/content/drive')

import joblib, pandas as pd, numpy as np, json
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
import os

# ---- target column ----
# Name of the label column; it is removed from the feature list further down
# if it happens to be present among the X columns.
target_col = "Attrition"

# Optional winsorizer for outliers
class Winsorizer(BaseEstimator, TransformerMixin):
    """Clip numeric DataFrame columns to quantile bounds learned in ``fit``.

    Parameters
    ----------
    cols : iterable of column labels or None, default None
        Columns to consider. ``None`` means "every column of the frame
        passed to ``fit``". Non-numeric columns are skipped.
    lower, upper : float, default 0.01 / 0.99
        Quantiles (as passed to ``Series.quantile``) used as the lower and
        upper clipping limits.

    Attributes
    ----------
    bounds_ : dict
        Maps each fitted numeric column to its ``(low, high)`` clip limits.
    """

    def __init__(self, cols=None, lower=0.01, upper=0.99):
        self.cols = cols
        self.lower = lower
        self.upper = upper
        # Kept so that transform() before fit() stays a plain pass-through,
        # as it was in the original implementation.
        self.bounds_ = {}

    def fit(self, X, y=None):
        """Learn per-column clip limits from ``X`` and return ``self``.

        ``y`` is accepted for pipeline compatibility and ignored.
        """
        # Resolve the column list locally instead of writing it back to
        # self.cols: overwriting the hyperparameter would freeze the column
        # set of the first fit and break a later refit on a different frame.
        cols = X.columns if self.cols is None else self.cols

        # Build into a fresh dict so a refit never keeps stale bounds for
        # columns that are no longer selected.
        bounds = {}
        for c in cols:
            if pd.api.types.is_numeric_dtype(X[c]):
                bounds[c] = (X[c].quantile(self.lower), X[c].quantile(self.upper))
        self.bounds_ = bounds
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the fitted columns clipped to their bounds.

        Columns missing from ``X`` or no longer numeric are left untouched.
        """
        X = X.copy()
        for c, (lo, hi) in self.bounds_.items():
            if c in X and pd.api.types.is_numeric_dtype(X[c]):
                X[c] = X[c].clip(lo, hi)
        return X

# ---------------------------
# Read the TRAIN and VAL splits and stack them
# ---------------------------
SPLITS_DIR = "/content/drive/My Drive/FDM Project Files/splits"

# Read the four CSVs in a fixed order and unpack them into their usual names.
X_train, y_train, X_val, y_val = (
    pd.read_csv(os.path.join(SPLITS_DIR, csv_name))
    for csv_name in ("X_train.csv", "y_train.csv", "X_val.csv", "y_val.csv")
)

# Stack train on top of val; ignore_index gives a fresh 0..n-1 row index.
X_full = pd.concat([X_train, X_val], axis=0, ignore_index=True)
y_full = pd.concat([y_train, y_val], axis=0, ignore_index=True)

print("Combined TRAIN+VAL shape:", X_full.shape, y_full.shape)
print("Columns before fix:", X_full.columns.tolist())

# ✅ Fix Overtime column: convert Yes/No to 1/0
if "Overtime" in X_full.columns:
    # Only map raw string labels. If the column is already numeric (e.g. this
    # cell is re-run, or the CSV was pre-encoded), comparing against "Yes"
    # would silently turn every value into 0.
    if not pd.api.types.is_numeric_dtype(X_full["Overtime"]):
        X_full["Overtime"] = (X_full["Overtime"] == "Yes").astype(int)
    # Report inside the guard so a missing column does not raise KeyError.
    print("Unique Overtime values:", X_full["Overtime"].unique())
else:
    print("Overtime column not found; skipping Yes/No conversion")

# ---------------------------
# Every column of X_full is used as a numeric feature,
# except the target column should it appear among them
# ---------------------------
numeric_features = X_full.columns.tolist()
try:
    numeric_features.remove(target_col)
except ValueError:
    # Target column is not part of X_full; nothing to drop.
    pass

# ---------------------------
# Assemble the pipeline: winsorize -> scale -> gradient boosting
# ---------------------------
# All listed features go through StandardScaler; any column not listed
# is discarded by remainder="drop".
preproc = ColumnTransformer(
    [("num", StandardScaler(), numeric_features)],
    remainder="drop",
)

gbm = GradientBoostingClassifier(
    random_state=42,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=200,
)

# Step order matters: clipping happens on the raw DataFrame before scaling.
pipe = Pipeline(
    steps=[
        ("winsor", Winsorizer(cols=numeric_features, lower=0.01, upper=0.99)),
        ("prep", preproc),
        ("model", gbm),
    ]
)

# ---------------------------
# Fit pipeline & save
# ---------------------------
# y_full comes from read_csv as a single-column DataFrame, i.e. shape (n, 1).
# scikit-learn classifiers expect a 1-D target and emit a
# DataConversionWarning for a column vector, so flatten it explicitly.
pipe.fit(X_full, np.ravel(y_full))

GBM_DIR = "/content/drive/My Drive/FDM Project Files/models/gradiant_boosting"
os.makedirs(GBM_DIR, exist_ok=True)

# Save pipeline
# NOTE: the pickle stores the custom Winsorizer step by reference, so code
# that loads this file must have the same Winsorizer class defined/importable.
joblib.dump(pipe, os.path.join(GBM_DIR, "gbm_pipeline.pkl"))

# Save training columns (names and order of the columns the pipeline was fit on)
columns_path = os.path.join(GBM_DIR, "gbm_columns.json")
with open(columns_path, "w") as f:
    json.dump(list(X_full.columns), f)

print("✅ Saved gbm_pipeline.pkl and gbm_columns.json")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Combined TRAIN+VAL shape: (59598, 31) (59598, 1)
Columns before fix: ['Work-Life Balance', 'Job Satisfaction', 'Performance Rating', 'Company Reputation', 'Employee Recognition', 'Gender_Male', 'Job Role_Finance', 'Job Role_Healthcare', 'Job Role_Media', 'Job Role_Technology', 'Education Level_Bachelor’s Degree', 'Education Level_High School', 'Education Level_Master’s Degree', 'Education Level_PhD', 'Marital Status_Married', 'Marital Status_Single', 'Job Level_Mid', 'Job Level_Senior', 'Company Size_Medium', 'Company Size_Small', 'Remote Work_Yes', 'Leadership Opportunities_Yes', 'Innovation Opportunities_Yes', 'Age', 'Years at Company', 'Monthly Income', 'Number of Promotions', 'Distance from Home', 'Company Tenure', 'Overtime', 'Number of Dependents']
Unique Overtime values: [0 1]


  y = column_or_1d(y, warn=True)


✅ Saved gbm_pipeline.pkl and gbm_columns.json
