# In [None]:
"""
Subscription Churn Forecasting (Jan 2025 - May 2025)
Author: Dr. Neha Sardana

Pipeline:
 - Load and clean telecom dataset (200K+ customers)
 - Handle missing values & categorical encoding
 - Balance data with SMOTE
 - Train Gradient Boosting model
 - Evaluate (precision, recall, F1, ROC-AUC)
 - Export feature importance for retention strategies
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# imbalanced-learn is treated as an optional dependency: IMB_AVAILABLE records
# whether SMOTE could be imported so later code can check before using it.
try:
    from imblearn.over_sampling import SMOTE
    IMB_AVAILABLE = True
except ImportError:
    IMB_AVAILABLE = False


# ---------------- Data Cleaning ----------------
def load_and_clean(file, target_col="churn"):
    """Load a customer CSV and return model-ready features and labels.

    Steps, in order: drop exact duplicate rows; drop rows whose target is
    missing; impute numeric features with the column median; fill missing
    text features with ``"Unknown"`` and integer-encode them; convert a
    textual yes/no target to 1/0.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        target_col: Name of the label column.

    Returns:
        Tuple ``(X, y)``. ``X`` is a DataFrame holding every column except
        ``target_col``; ``y`` is the label Series. Both keep the row index
        that remains after filtering (it is not reset).

    Raises:
        KeyError: If ``target_col`` is not a column of the file.
        ValueError: If a non-numeric target contains values other than
            yes/no (compared case-insensitively, ignoring surrounding spaces).
    """
    df = pd.read_csv(file).drop_duplicates()

    if target_col not in df.columns:
        raise KeyError(f"Target column {target_col!r} not found in input data")

    # A row without a label cannot be learned from, and imputing the target
    # (median / "Unknown") would fabricate or corrupt labels, so drop it.
    df = df.dropna(subset=[target_col])

    X = df.drop(columns=[target_col])
    raw_y = df[target_col]

    # "number" covers every numeric width, not only float64/int64.
    for col in X.select_dtypes(include="number").columns:
        X[col] = X[col].fillna(X[col].median())

    # Encode text features; the dtype checks cover both classic object
    # columns and pandas string dtypes.
    for col in X.columns:
        column = X[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # astype(str) keeps the encoder from failing on mixed-type cells.
            filled = column.fillna("Unknown").astype(str)
            X[col] = LabelEncoder().fit_transform(filled)

    # Numeric (or boolean) targets are already usable as labels.
    if pd.api.types.is_numeric_dtype(raw_y):
        return X, raw_y

    normalized = raw_y.astype(str).str.strip().str.lower()
    y = normalized.map({"yes": 1, "no": 0})

    unmapped = y.isna()
    if unmapped.any():
        # Fail loudly instead of letting NaN labels reach the model.
        bad_values = sorted(set(raw_y[unmapped].astype(str)))
        raise ValueError(
            f"Unrecognized values in target column {target_col!r}: {bad_values}; "
            "expected 'Yes'/'No'"
        )

    return X, y.astype(int)


# ---------------- Train/Test Split + SMOTE ----------------
def prepare_data(X, y, test_size=0.2, scale=True, balance=True):
    """Split into train/test sets, then optionally scale and balance.

    The split is stratified on ``y`` with a fixed seed. Scaling is fit on the
    real training rows only and applied before oversampling, so SMOTE's
    nearest-neighbour search works in a space where features are comparable
    and the scaler statistics are not shifted by synthetic rows. The test
    set is never resampled.

    Args:
        X: Feature matrix.
        y: Labels aligned with ``X``.
        test_size: Fraction of rows held out for testing.
        scale: If true, standardize features with ``StandardScaler``.
        balance: If true, oversample the training set with SMOTE. When
            imbalanced-learn is not installed a ``RuntimeWarning`` is issued
            and the training set is left unbalanced.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``. With ``scale=True`` the
        feature sets are the scaler's output rather than the input objects.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=test_size, random_state=42
    )

    if scale:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    if balance:
        if IMB_AVAILABLE:
            sm = SMOTE(random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)
        else:
            # Do not skip silently: the caller asked for balanced data.
            import warnings

            warnings.warn(
                "balance=True but imbalanced-learn is not installed; "
                "training data was NOT resampled with SMOTE.",
                RuntimeWarning,
                stacklevel=2,
            )

    return X_train, X_test, y_train, y_test


# ---------------- Modeling ----------------
def train_and_evaluate(X_train, X_test, y_train, y_test, feature_names):
    """Fit a gradient boosting classifier, report metrics, export importances.

    Prints a classification report and ROC-AUC for the test set, shows a
    confusion-matrix heatmap, writes all feature importances (sorted
    descending) to ``reports/feature_importances.csv`` and shows a bar chart
    of the top 15.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        feature_names: Names for the feature columns, in column order; must
            have one entry per model feature.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    from pathlib import Path

    model = GradientBoostingClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=5, random_state=42
    )
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    # Column 1 holds the probability of the second class in model.classes_.
    probs = model.predict_proba(X_test)[:, 1]

    print("\n=== Gradient Boosting Evaluation ===")
    print(classification_report(y_test, preds, digits=4))
    print("ROC-AUC:", round(roc_auc_score(y_test, probs), 4))

    # Confusion matrix
    cm = confusion_matrix(y_test, preds)
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

    # Feature importance
    importances = model.feature_importances_
    feat_imp = pd.DataFrame({"feature": feature_names, "importance": importances})
    feat_imp = feat_imp.sort_values("importance", ascending=False)

    # Create the output directory first so a missing "reports/" folder does
    # not discard an already-trained model with an OSError.
    report_path = Path("reports/feature_importances.csv")
    report_path.parent.mkdir(parents=True, exist_ok=True)
    feat_imp.to_csv(report_path, index=False)

    plt.figure(figsize=(8, 6))
    sns.barplot(x="importance", y="feature", data=feat_imp.head(15), palette="viridis")
    plt.title("Top 15 Churn Drivers")
    plt.tight_layout()
    plt.show()

    return model


# ---------------- Main ----------------
def main():
    """Run the whole pipeline: load the CSV, prepare splits, train and report."""
    features, labels = load_and_clean("data/customers.csv", target_col="churn")
    splits = prepare_data(features, labels)

    # splits is (X_train, X_test, y_train, y_test), in positional order.
    column_names = list(features.columns)
    train_and_evaluate(*splits, feature_names=column_names)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
