<a href="https://colab.research.google.com/github/dadakys/telecom-churn-prediction/blob/main/thesis_telecom_churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import necessary libraries
from google.colab import drive
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import tensorflow as tf

In [None]:
# Load the dataset from Google Drive.
# The Drive mount must succeed first: the CSV path below lives under the mount point.
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/SXOLH2/thesis/cell2celltrain.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Quick structural overview of the raw data: dimensions, column names,
# dtypes, the first rows and summary statistics of the numeric columns.
for overview in (df.shape, df.columns, df.dtypes, df.head(), df.describe()):
    print(overview)


In [None]:
#VISUALIZATIONS
# Churn distribution: one bar per class, annotated with its customer count.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(data=df, x='Churn', ax=ax)
ax.set_title('Churn Distribution')
ax.set_xlabel('Churned')
ax.set_ylabel('Customers')

# Write each bar's count slightly above the bar, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.text(label_x, bar_height + 500, int(bar_height), ha="center", fontsize=12)

plt.show()


In [None]:
#VISUALIZE BOXPLOTS

# Store the target as strings so seaborn groups the boxes by class label.
df['Churn'] = df['Churn'].astype(str)

# Numeric features only; CustomerID is an identifier, not a feature.
numeric_cols = (
    df.drop(columns=['CustomerID'])
      .select_dtypes(include=['int64', 'float64'])
      .columns
)

# One figure per numeric feature, with one box per churn class.
for col in numeric_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.boxplot(data=df, x='Churn', y=col, ax=ax)
    ax.set_title(f'Boxplot of {col} ')
    ax.set_xlabel('Churn')
    ax.set_ylabel(col)
    fig.tight_layout()
    plt.show()



In [None]:
#VISUALIZE HISTOGRAMS

# Select numeric features (excluding CustomerID and target)
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.drop(['CustomerID'])

# One figure per feature, overlaying a normalised histogram for each churn class.
for col in numeric_cols:
    plt.figure(figsize=(8, 4))

    for label in df['Churn'].unique():
        # This cell runs before imputation, so drop missing values explicitly
        # instead of relying on the histogram routine to discard them.
        values = df.loc[df['Churn'] == label, col].dropna()
        plt.hist(values, bins=30, alpha=0.6, label=f'Churn = {label}', density=True, edgecolor='black')

    plt.title(f'Histogram of {col} by Churn')
    plt.xlabel(col)
    # density=True normalises each histogram to unit area, so the y-axis is a
    # probability density rather than a raw frequency.
    plt.ylabel('Density')
    plt.legend()
    plt.tight_layout()
    plt.show()


In [None]:
# DATA PREPARATION / PREPROCESSING

# 1. Duplicate check: count fully duplicated rows.
duplicate_rows = df.duplicated()
num_duplicates = duplicate_rows.sum()
print(f"Number of duplicate rows: {num_duplicates}")

# 2. Missing value check: absolute count and share of rows, per column.
missing_count = df.isnull().sum()
missing_percent = (missing_count / len(df)) * 100

# Report only the columns that actually contain missing values.
has_missing = missing_count > 0
missing_data = pd.DataFrame({
    'Missing Values': missing_count[has_missing],
    'Percent Missing (%)': missing_percent[has_missing].round(2)
})

if missing_data.empty:
    print("No missing values found in the dataset.")
else:
    print("There are missing values in the dataset.")
    print("Here is the count and percentage of missing values for each column:\n")
    print(missing_data)

    # Columns to impute: numeric ones with the median, categorical ones with the mode.
    num_cols_to_fill = [
        'MonthlyRevenue', 'MonthlyMinutes', 'TotalRecurringCharge',
        'DirectorAssistedCalls', 'OverageMinutes', 'RoamingCalls',
        'PercChangeMinutes', 'PercChangeRevenues',
        'Handsets', 'HandsetModels', 'CurrentEquipmentDays',
        'AgeHH1', 'AgeHH2'
    ]
    cat_cols_to_fill = ['ServiceArea']

    # Numeric columns are processed first, then the categorical ones.
    fill_plan = [(col, 'median') for col in num_cols_to_fill]
    fill_plan += [(col, 'mode') for col in cat_cols_to_fill]

    for col, strategy in fill_plan:
        if strategy == 'median':
            fill_value = df[col].median()
        else:
            fill_value = df[col].mode()[0]
        df[col] = df[col].fillna(fill_value)
        print(f"Filled missing values in '{col}' with {strategy}: {fill_value}")

    print("\n✅ Missing values handled.")


In [None]:
# Detect outliers with a 3*IQR fence and clip them to the fence.
numeric_cols = (
    df.drop(columns=['CustomerID'])
      .select_dtypes(include=['int64', 'float64'])
      .columns
)

print("🔍 Outlier Clipping Report:\n")
outlier_summary = []
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 3 * IQR, Q3 + 3 * IQR

    # Count the values outside the fence BEFORE clipping.
    below = df[col] < lower_bound
    above = df[col] > upper_bound
    outliers_before = (below | above).sum()

    # Columns without outliers are left untouched and not reported.
    if outliers_before <= 0:
        continue

    min_before, max_before = df[col].min(), df[col].max()

    # Clip the values to the fence.
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    min_after, max_after = df[col].min(), df[col].max()

    outlier_summary.append({
        "Feature Name": col,
        "Outliers Clipped": outliers_before
    })
    print(f"🧾 {col}:")
    print(f"   Outliers clipped: {outliers_before}")
    print(f"   Min/Max before: {min_before} / {max_before}")
    print(f"   Min/Max after:  {min_after} / {max_after}\n")

# Persist the per-feature clipping counts next to the notebook.
outlier_df = pd.DataFrame(outlier_summary)
outlier_df.to_csv("outliers.csv", index=False)
print("✅ Outliers handled.")





In [None]:
#FEATURE SELECTION
from sklearn.feature_selection import VarianceThreshold

# Flag numeric features whose variance does not exceed 0.01.
numeric_frame = df.select_dtypes(include=['number'])  # only numeric columns
selector = VarianceThreshold(threshold=0.01)
selector.fit(numeric_frame)

# get_support() marks the columns that pass the threshold; invert it to list the rest.
kept_mask = selector.get_support()
low_variance_cols = numeric_frame.columns[~kept_mask]
print("Low-variance features to consider removing:", list(low_variance_cols))


In [None]:
# Remove the features that have extremely low variance or were clipped entirely
# to 0 by the outlier handling. They carry minimal signal for modelling and
# would only add noise or unnecessary complexity.
columns_to_drop = [
    'RetentionCalls',
    'RetentionOffersAccepted',
    'ReferralsMadeBySubscriber',
    'CallForwardingCalls',
    'AdjustmentsToCreditRating'
]
df = df.drop(columns=columns_to_drop)
print("Dropped columns:", columns_to_drop)


In [None]:
# Absolute pairwise correlations between the numeric columns.
corr_matrix = df.select_dtypes(include=['number']).corr().abs()

# Keep only the strict upper triangle so every pair is examined once
# and the diagonal (self-correlation) is ignored.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# A column is a drop candidate when it correlates above 0.8 with any earlier column.
exceeds_threshold = (upper > 0.8).any(axis=0).to_numpy()
to_drop = upper.columns[exceeds_threshold].tolist()

print("Highly correlated features to consider dropping:", to_drop)


In [None]:
  from sklearn.ensemble import RandomForestClassifier

  # Features (everything except the target and the ID) and the 0/1 target.
  X = df.drop(columns=['Churn', 'CustomerID'])
  y = df['Churn'].map({'Yes': 1, 'No': 0})

  # One-hot encode the categorical features so the forest can consume them.
  X_encoded = pd.get_dummies(X)

  # Fit a random forest purely to rank the features by importance.
  model = RandomForestClassifier(n_estimators=100, random_state=42)
  model.fit(X_encoded, y)

  # Importances indexed by encoded column name, highest first.
  importances = pd.Series(model.feature_importances_, index=X_encoded.columns)
  important_features = importances.sort_values(ascending=False)
  top_20 = important_features.head(20)
  top_20_features = top_20.index.tolist()

  # Horizontal bar chart of the top 20; invert the y-axis so the most important is on top.
  importance_ax = top_20.plot(kind='barh', figsize=(10, 8), title='Top 20 Important Features')
  importance_ax.invert_yaxis()
  plt.show()


In [None]:
# Correlation between the two call-failure counters and their combined
# counterpart, to decide whether one of them can be dropped.
call_failure_cols = ['DroppedCalls', 'BlockedCalls', 'DroppedBlockedCalls']
print(df[call_failure_cols].corr())


In [None]:
# 'DroppedBlockedCalls' is a derived feature (sum of DroppedCalls and BlockedCalls)
# and highly correlated with both, so it is removed. 'ActiveSubs' and 'HandsetModels'
# are removed as well: high correlation and low importance in the feature ranking.
redundant_cols = ['DroppedBlockedCalls', 'ActiveSubs', 'HandsetModels']
df = df.drop(columns=redundant_cols)


In [None]:
# Sanity check: print (rows, columns) to confirm the column count went down
# after the feature drops performed in the cells above.
print(df.shape)

In [None]:
# List the distinct values of every object-typed column before encoding,
# so each encoding map can be checked against the categories actually present.
object_cols = [col for col in df.columns if df[col].dtype == 'object']
for col in object_cols:
    print(f"\nColumn: {col}")
    print(df[col].unique())


In [None]:
#  One-plot summary: “Unknown” vs numeric values in HandsetPrice
import os

out_dir = "/content/drive/MyDrive/plots"
os.makedirs(out_dir, exist_ok=True)

# ── detect rows that are the literal "Unknown" (checked before numeric conversion) ──
mask_unknown = df["HandsetPrice"].astype(str).str.lower() == "unknown"

# Keep the status labels in a standalone Series instead of writing them into df.
# A string column added to df here is never encoded or dropped afterwards, so it
# would end up in the feature matrix X and break scaling and every model fit.
handset_price_status = pd.Series(
    np.where(mask_unknown, "Unknown", "Numeric"), index=df.index)

# reindex fixes the bar order (and yields NaN for a category that is absent).
counts = handset_price_status.value_counts().reindex(["Numeric", "Unknown"])

# ── plot & save ────────────────────────────────────────────────
plt.figure(figsize=(5, 4))
sns.barplot(x=counts.index, y=counts.values, palette="Blues")
plt.title("HandsetPrice Distribution")
plt.ylabel("Count"); plt.xlabel("")
plt.tight_layout()
plt.savefig(os.path.join(out_dir, "handset_unknown_vs_numeric.png"), dpi=300)
plt.show()



In [None]:
#  Final Preprocessing Cell
#  Encodes every remaining categorical column as numbers, then builds the
#  feature matrix X (standardised) and the target y.
#  NOTE: this cell is not idempotent — the .map() calls turn already-encoded
#  values into NaN if the cell is executed a second time on the same df.

from sklearn.preprocessing import StandardScaler

# 1. Group rare ServiceAreas: areas with at most 100 rows are merged into 'Other'
area_counts = df['ServiceArea'].value_counts()
top_areas = area_counts[area_counts > 100].index
df['ServiceArea'] = df['ServiceArea'].where(df['ServiceArea'].isin(top_areas), other='Other')

# 2. Encode target
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 3. Encode binary "Yes"/"No" features
binary_cols = [
    'HandsetRefurbished', 'HandsetWebCapable', 'TruckOwner', 'RVOwner',
    'BuysViaMailOrder', 'RespondsToMailOffers',
    'OptOutMailings', 'NonUSTravel', 'OwnsComputer', 'HasCreditCard',
    'NewCellphoneUser', 'NotNewCellphoneUser', 'OwnsMotorcycle',
    'ChildrenInHH', 'MadeCallToRetentionTeam'
]
for col in binary_cols:
    df[col] = df[col].map({'Yes': 1, 'No': 0})

# 4. Encode 'Homeownership' and 'MaritalStatus'
df['Homeownership'] = df['Homeownership'].map({'Known': 1, 'Unknown': 0})
df['MaritalStatus'] = df['MaritalStatus'].map({'Yes': 1, 'No': 0, 'Unknown': 2})

# 5. Handle 'HandsetPrice': non-numeric entries (e.g. "Unknown") become NaN,
#    then NaN is replaced by the median of the numeric prices.
#    Assign the result back instead of calling fillna(inplace=True) on the column
#    selection: the in-place form is chained assignment, which is deprecated in
#    pandas 2.x and does not update df under Copy-on-Write.
df['HandsetPrice'] = pd.to_numeric(df['HandsetPrice'], errors='coerce')
df['HandsetPrice'] = df['HandsetPrice'].fillna(df['HandsetPrice'].median())

# 6. Ordinal encode 'CreditRating' (1 = best, 7 = worst); unmapped values get the median
credit_rating_map = {
    '1-Highest': 1,
    '2-High': 2,
    '3-Good': 3,
    '4-Medium': 4,
    '5-Low': 5,
    '6-Very Low': 6,
    '7-Lowest': 7
}
df['CreditRating'] = df['CreditRating'].map(credit_rating_map)
df['CreditRating'] = df['CreditRating'].fillna(df['CreditRating'].median())

# 7. Encode ServiceArea (after grouping): integer code by alphabetical order
service_area_mapping = {area: idx for idx, area in enumerate(sorted(df['ServiceArea'].unique()))}
df['ServiceArea'] = df['ServiceArea'].map(service_area_mapping)

# 8. Encode PrizmCode
prizm_code_map = {
    'Suburban': 0,
    'Town': 1,
    'Other': 2,
    'Rural': 3
}
df['PrizmCode'] = df['PrizmCode'].map(prizm_code_map)

# 9. Encode Occupation
occupation_map = {
    'Professional': 0,
    'Crafts': 1,
    'Other': 2,
    'Self': 3,
    'Retired': 4,
    'Homemaker': 5,
    'Clerical': 6,
    'Student': 7
}
df['Occupation'] = df['Occupation'].map(occupation_map)

# 10. Separate features and target
X = df.drop(columns=['Churn', 'CustomerID'])
y = df['Churn']

# 11. Scale numeric features (zero mean, unit variance)
# NOTE(review): the scaler is fitted on all rows before any train/test split,
# so held-out folds contribute to the scaling statistics.
scaler = StandardScaler()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
X[num_cols] = scaler.fit_transform(X[num_cols])

# 12. Final check: no NaN left and no non-numeric column in X
print("\n✅ Final Check:")
print(f"Remaining missing values: {X.isnull().sum().sum()}")
print(f"All columns numeric: {all(dtype in ['int64', 'float64'] for dtype in X.dtypes)}")


In [None]:
#Check for remaining columns that need encoding
non_numeric_cols = df.select_dtypes(include=['object', 'category']).columns
print("Columns that still need encoding:", list(non_numeric_cols))


In [None]:
# Save features barplots/histograms

# Target folder on Google Drive for the exported figures.
out_dir = "/content/drive/MyDrive/plots"
os.makedirs(out_dir, exist_ok=True)

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# ── inverse look-ups: integer code -> original category label ----------------
prizm_inv = dict(enumerate(["Suburban", "Town", "Other", "Rural"]))
occupation_inv = dict(enumerate([
    "Professional", "Crafts", "Other", "Self",
    "Retired", "Homemaker", "Clerical", "Student",
]))
marital_inv = {1: "Yes", 0: "No", 2: "Unknown"}
credit_inv = dict(enumerate([
    "1-Highest", "2-High", "3-Good", "4-Medium",
    "5-Low", "6-Very Low", "7-Lowest",
], start=1))

# ── helper to save bar plots --------------------------------------------------
def barplot_and_save(counts: pd.Series, title: str, filename: str, rot=0):
    """Draw `counts` as a bar chart and write it as a PNG into `out_dir`.

    Parameters
    ----------
    counts : pd.Series
        Bar heights; the index supplies the x-axis categories.
    title : str
        Figure title.
    filename : str
        File name of the PNG, joined onto the module-level `out_dir`.
    rot : int, default 0
        Rotation of the x tick labels in degrees.

    The figure is closed after saving, so nothing is displayed inline.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(x=counts.index, y=counts.values, ax=ax)
    ax.set_title(title)
    ax.set_ylabel("Count")
    ax.set_xlabel("")          # leave x-label blank; title says enough
    ax.tick_params(axis="x", rotation=rot)
    fig.tight_layout()
    fig.savefig(os.path.join(out_dir, filename), dpi=300)
    plt.close(fig)

# 1. HandsetPrice histogram ----------------------------------------------------
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["HandsetPrice"], bins=30, kde=True, ax=ax)
ax.set_title("HandsetPrice Distribution")
ax.set_xlabel("HandsetPrice")
ax.set_ylabel("Count")
fig.tight_layout()
fig.savefig(os.path.join(out_dir, "handsetprice_dist.png"), dpi=300)
plt.close(fig)


def _decoded_counts(column, inverse_map):
    """Category counts of an encoded df column, relabelled via `inverse_map` and sorted by label."""
    return df[column].map(inverse_map).value_counts().sort_index()


# 2.-5. Counts per category for the four encoded categorical columns -----------
prizm_counts = _decoded_counts("PrizmCode", prizm_inv)
occ_counts = _decoded_counts("Occupation", occupation_inv)
mar_counts = _decoded_counts("MaritalStatus", marital_inv)
cred_counts = _decoded_counts("CreditRating", credit_inv)

# (counts, title, file name, x tick rotation) for each bar plot, in save order.
bar_plot_specs = [
    (prizm_counts, "PrizmCode Distribution", "prizmcode_dist.png", 0),
    (occ_counts, "Occupation Distribution", "occupation_dist.png", 35),
    (mar_counts, "Marital Status Distribution", "maritalstatus_dist.png", 0),
    (cred_counts, "Credit Rating Distribution", "creditrating_dist.png", 0),
]
for spec_counts, spec_title, spec_file, spec_rot in bar_plot_specs:
    barplot_and_save(spec_counts, spec_title, spec_file, rot=spec_rot)

print(f"🎉  Saved five separate PNG files to {out_dir}")


In [None]:
# Distribution plots for key variables (HandsetPrice, PrizmCode, Occupation,
#    MaritalStatus, CreditRating), shown together in a single figure.


# ── inverse look-ups to restore human-readable category names ────────────────
prizm_inv = dict(enumerate(["Suburban", "Town", "Other", "Rural"]))
occupation_inv = dict(enumerate([
    "Professional", "Crafts", "Other", "Self",
    "Retired", "Homemaker", "Clerical", "Student",
]))
marital_inv = {1: "Yes", 0: "No", 2: "Unknown"}
credit_inv = dict(enumerate([
    "1-Highest", "2-High", "3-Good", "4-Medium",
    "5-Low", "6-Very Low", "7-Lowest",
], start=1))

# ── value counts for categorical columns ─────────────────────────────────────
prizm_counts = df["PrizmCode"].map(prizm_inv).value_counts().sort_index()
occupation_counts = df["Occupation"].map(occupation_inv).value_counts().sort_index()
marital_counts = df["MaritalStatus"].map(marital_inv).value_counts().sort_index()
credit_counts = df["CreditRating"].map(credit_inv).value_counts().sort_index()

# ── create a 3 × 2 subplot grid (5 plots, last cell empty) ──────────────────
fig, axes = plt.subplots(3, 2, figsize=(15, 14))
axes = axes.flatten()              # easier indexing (total 6 axes)

# Panel 0: HandsetPrice (numeric histogram)
sns.histplot(df["HandsetPrice"], bins=30, kde=True, ax=axes[0])
axes[0].set_title("HandsetPrice Distribution")
axes[0].set_xlabel("HandsetPrice")

# Panels 1-4: one bar chart per categorical column -> (counts, title, x-label)
bar_panels = [
    (prizm_counts, "PrizmCode Distribution", "PrizmCode"),
    (occupation_counts, "Occupation Distribution", "Occupation"),
    (marital_counts, "Marital Status Distribution", "Marital Status"),
    (credit_counts, "Credit Rating Distribution", "Credit Rating"),
]
for panel_ax, (panel_counts, panel_title, panel_xlabel) in zip(axes[1:5], bar_panels):
    sns.barplot(x=panel_counts.index, y=panel_counts.values, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel("Count")

# Occupation labels are long; tilt them so they do not overlap.
axes[2].tick_params(axis="x", rotation=35)

# Hide unused 6th subplot (axes[5])
axes[5].axis("off")

plt.tight_layout()
plt.show()


In [None]:
# Split dataset with Stratified k fold and report the class balance of each fold

from sklearn.model_selection import StratifiedKFold

# X (features) and y (churn target) come from the preprocessing cell.


def _print_class_balance(header, labels):
    """Print the count and percentage of each churn class (0 / 1) in `labels`."""
    class_counts = labels.value_counts()
    class_shares = labels.value_counts(normalize=True) * 100
    print(header)
    print(f"  Not Churned (0): {class_counts[0]} ({class_shares[0]:.2f}%)")
    print(f"  Churned (1): {class_counts[1]} ({class_shares[1]:.2f}%)")


# 4 stratified, shuffled folds with a fixed seed for reproducibility.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print(f"Fold {fold + 1}:")
    _print_class_balance("Training set:", y_train)
    _print_class_balance("Testing set:", y_test)
    print("-" * 40)



In [None]:
#Plot class distribution across folds (train/test)
from sklearn.model_selection import StratifiedKFold

# One record per (fold, set) holding the class percentages.
fold_distributions = []

# Same splitter configuration as the fold report above.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Train record first, then Test, for every fold.
    for set_name, set_labels in (('Train', y_train), ('Test', y_test)):
        class_share = set_labels.value_counts(normalize=True) * 100
        fold_distributions.append({
            'Fold': fold,
            'Set': set_name,
            'Churned (%)': class_share[1],
            'Not Churned (%)': class_share[0]
        })

# Convert to DataFrame
dist_df = pd.DataFrame(fold_distributions)

# Long format (one row per fold/set/class) for seaborn.
df_melted = pd.melt(
    dist_df,
    id_vars=['Fold', 'Set'],
    value_vars=['Churned (%)', 'Not Churned (%)'],
    var_name='Class',
    value_name='Percentage',
)

# Grouped bars: fold on the x-axis, one bar per class.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=df_melted, x='Fold', y='Percentage', hue='Class', ci=None, palette='Set2', ax=ax)
ax.set_title('Class Distribution in Train/Test Sets Across Folds')
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Fold Number')
ax.legend(title='Class Label')
fig.tight_layout()
plt.show()


In [None]:
# Experiment 1: Baseline (Default)

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

from scipy.stats import randint, uniform

# Initialize logs
# results_df: one row per (classifier, fold, split) with confusion-matrix counts.
result_columns = [
    "Classifier Name", "Fold", "TrainOrTest",
    "Num Train Samples", "Num Churned in Train",
    "TP", "TN", "FP", "FN", "ROC-AUC",
]
results_df = pd.DataFrame(columns=result_columns)

# fold_metrics: parallel lists, one entry per (classifier, fold), test split only.
fold_metric_names = (
    "Classifier", "Fold",
    "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
)
fold_metrics = {metric_name: [] for metric_name in fold_metric_names}

def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return a copy of the results log `df` with one evaluation row appended.

    Parameters
    ----------
    df : pd.DataFrame
        Existing log; it is not modified.
    classifier_name, fold_number, dataset_type
        Stored as-is in "Classifier Name", "Fold" and "TrainOrTest".
    y_true, y_pred
        True and predicted 0/1 labels used for the confusion matrix.
    X_data, y_data
        The split being logged; its row count and number of positives are stored
        in "Num Train Samples" and "Num Churned in Train" (for either split).
    y_prob
        Positive-class scores for ROC-AUC, or None when unavailable.

    Returns
    -------
    pd.DataFrame
        The log with the new row at the end.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when a split (or the predictions)
    # contains a single class, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # ROC-AUC is undefined without scores or when y_true holds a single class.
    if y_prob is not None and len(np.unique(y_true)) > 1:
        roc_auc = roc_auc_score(y_true, y_prob)
    else:
        roc_auc = None

    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    row_df = pd.DataFrame([row])

    # Concatenating onto an empty frame is deprecated in pandas; start the log
    # from the row itself, keeping the column order the empty log declared.
    if df.empty:
        ordered = list(df.columns) + [c for c in row_df.columns if c not in df.columns]
        return row_df.reindex(columns=ordered)
    return pd.concat([df, row_df], ignore_index=True)

# Classifier list: name -> (base estimator, randomized-search space)

# Search space shared by the two tree-based models.
tree_search_space = {
    "max_depth": randint(1, 20),
    "min_samples_split": randint(2, 10),
    "min_samples_leaf": randint(1, 10),
    "criterion": ["gini", "entropy"],
}

# LDA needs two sub-spaces: 'svd' does not accept shrinkage, 'lsqr'/'eigen' do.
lda_search_space = [
    {"solver": ["svd"], "shrinkage": [None], "tol": uniform(1e-5, 1e-2)},
    {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto"], "tol": uniform(1e-5, 1e-2)},
]

classifiers_and_params = {
    "kNN": (
        KNeighborsClassifier(),
        {"n_neighbors": randint(1, 50), "weights": ["uniform", "distance"], "p": [1, 2]},
    ),
    "LDA": (LinearDiscriminantAnalysis(), lda_search_space),
    "LogReg": (
        LogisticRegression(random_state=42, max_iter=1000),
        {"C": uniform(0.01, 10), "penalty": ["l1", "l2"], "solver": ["liblinear", "saga"]},
    ),
    "DecisionTree": (
        DecisionTreeClassifier(random_state=42),
        dict(tree_search_space),
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {"n_estimators": randint(50, 300), **tree_search_space},
    ),
    "NaiveBayes": (
        GaussianNB(),
        {"var_smoothing": uniform(1e-9, 1e-5)},
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {"n_estimators": randint(50, 300), "learning_rate": uniform(0.01, 0.5)},
    ),
}

# Begin loop
# Outer CV: each classifier is tuned on the training fold with an inner
# randomized search (3-fold, F1) and then scored on the held-out fold.
fold_number = 1
for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in classifiers_and_params.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        search_settings = dict(n_iter=10, scoring="f1", cv=3, random_state=42, n_jobs=-1)
        random_search = RandomizedSearchCV(clf_base, param_dist, **search_settings)
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities, when the estimator exposes them.
        if hasattr(best_model, "predict_proba"):
            y_train_prob = best_model.predict_proba(X_train_fold)[:, 1]
            y_test_prob = best_model.predict_proba(X_test_fold)[:, 1]
        else:
            y_train_prob = y_test_prob = None

        # Evaluation on the held-out fold (AUC falls back to 0.0 without probabilities).
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": 0.0 if y_test_prob is None else roc_auc_score(y_test_fold, y_test_prob),
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # (split name, features, true labels, predictions, scores): Train first, then Test.
        split_views = (
            ("Train", X_train_fold, y_train_fold, y_train_pred, y_train_prob),
            ("Test", X_test_fold, y_test_fold, y_test_pred, y_test_prob),
        )

        # Confusion Matrices, train (left) and test (right)
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        for panel, (split_name, X_split, y_split, y_split_pred, y_split_prob) in zip(ax, split_views):
            sns.heatmap(confusion_matrix(y_split, y_split_pred), annot=True, fmt='d', cmap='Blues', ax=panel)
            panel.set_title(f'{clf_name} - Fold {fold_number} - {split_name}')
        plt.tight_layout()
        plt.show()

        # ROC Curve, drawn only for splits that have probability scores
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        for panel, (split_name, X_split, y_split, y_split_pred, y_split_prob) in zip(ax, split_views):
            if y_split_prob is None:
                continue
            RocCurveDisplay.from_estimator(best_model, X_split, y_split, ax=panel)
            panel.set_title(f"{clf_name} - ROC ({split_name}) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Save metrics (test split only)
        metric_record = {
            "Classifier": clf_name,
            "Fold": fold_number,
            "Accuracy": test_metrics["acc"],
            "Precision": test_metrics["prec"],
            "Recall": test_metrics["rec"],
            "F1 Score": test_metrics["f1"],
            "AUC ROC": test_metrics["auc"],
        }
        for metric_name, metric_value in metric_record.items():
            fold_metrics[metric_name].append(metric_value)

        # Detailed logs: one row per split
        for split_name, X_split, y_split, y_split_pred, y_split_prob in split_views:
            results_df = log_metrics_to_df(
                results_df, clf_name, fold_number, split_name,
                y_split, y_split_pred, X_split, y_split, y_split_prob,
            )

    fold_number += 1

# Summary: mean of every test metric per classifier, averaged over the folds
print("\n📊 Final Summary by Classifier (Average Across Folds):")
summary_df = pd.DataFrame(fold_metrics)
grouped = summary_df.groupby("Classifier")
# "Fold" is an identifier, not a metric; averaging it would only add a
# meaningless column to the summary table.
metric_columns = [c for c in summary_df.columns if c not in ("Classifier", "Fold")]
print(grouped[metric_columns].mean(numeric_only=True).round(3))

# Save the detailed per-split log next to the notebook
results_df.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results saved to churn_classifiers_output.csv")


In [None]:
# Experiment 1: Adding 2 classifiers (boosters)
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)


from scipy.stats import randint, uniform

# Initialize logs
# results_df: one row per (classifier, fold, split) with confusion-matrix counts.
booster_result_columns = [
    "Classifier Name", "Fold", "TrainOrTest",
    "Num Train Samples", "Num Churned in Train",
    "TP", "TN", "FP", "FN", "ROC-AUC",
]
results_df = pd.DataFrame(columns=booster_result_columns)

# fold_metrics: parallel lists, one entry per (classifier, fold), test split only.
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return a copy of the results log `df` with one evaluation row appended.

    Parameters
    ----------
    df : pd.DataFrame
        Existing log; it is not modified.
    classifier_name, fold_number, dataset_type
        Stored as-is in "Classifier Name", "Fold" and "TrainOrTest".
    y_true, y_pred
        True and predicted 0/1 labels used for the confusion matrix.
    X_data, y_data
        The split being logged; its row count and number of positives are stored
        in "Num Train Samples" and "Num Churned in Train" (for either split).
    y_prob
        Positive-class scores for ROC-AUC, or None when unavailable.

    Returns
    -------
    pd.DataFrame
        The log with the new row at the end.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when a split (or the predictions)
    # contains a single class, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # ROC-AUC is undefined without scores or when y_true holds a single class.
    if y_prob is not None and len(np.unique(y_true)) > 1:
        roc_auc = roc_auc_score(y_true, y_prob)
    else:
        roc_auc = None

    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    row_df = pd.DataFrame([row])

    # Concatenating onto an empty frame is deprecated in pandas; start the log
    # from the row itself, keeping the column order the empty log declared.
    if df.empty:
        ordered = list(df.columns) + [c for c in row_df.columns if c not in df.columns]
        return row_df.reindex(columns=ordered)
    return pd.concat([df, row_df], ignore_index=True)

# Boosting models: name -> (base estimator, randomized-search space)

# Hyper-parameters that both boosters are tuned over.
boosting_search_space = {
    "n_estimators": randint(50, 300),
    "max_depth": randint(3, 10),
    "learning_rate": uniform(0.01, 0.3),
    "subsample": uniform(0.6, 0.4),
}

# XGBoost is additionally tuned over column subsampling and L1/L2 regularisation.
xgboost_search_space = {
    **boosting_search_space,
    "colsample_bytree": uniform(0.6, 0.4),
    "reg_alpha": uniform(0, 1),
    "reg_lambda": uniform(1, 3),
}

extra_classifiers = {
    "XGBoost": (
        XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, n_jobs=-1),
        xgboost_search_space,
    ),
    "GradientBoosting": (
        GradientBoostingClassifier(random_state=42),
        dict(boosting_search_space),
    ),
}
fold_number = 1
# New experiments loop: same protocol as the baseline run. Each booster is tuned
# on the training fold (inner randomized search, 3-fold, F1) and scored on the
# held-out fold.
for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in extra_classifiers.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        search_settings = dict(n_iter=10, scoring="f1", cv=3, random_state=42, n_jobs=-1)
        random_search = RandomizedSearchCV(clf_base, param_dist, **search_settings)
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities (both boosters provide predict_proba).
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1]
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1]

        # Metrics on the held-out fold
        test_metrics = dict(
            acc=accuracy_score(y_test_fold, y_test_pred),
            prec=precision_score(y_test_fold, y_test_pred, zero_division=0),
            rec=recall_score(y_test_fold, y_test_pred, zero_division=0),
            f1=f1_score(y_test_fold, y_test_pred, zero_division=0),
            auc=roc_auc_score(y_test_fold, y_test_prob),
        )

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # (split name, features, true labels, predictions, scores): Train first, then Test.
        split_views = (
            ("Train", X_train_fold, y_train_fold, y_train_pred, y_train_prob),
            ("Test", X_test_fold, y_test_fold, y_test_pred, y_test_prob),
        )

        # Confusion matrices, train (left) and test (right)
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        for panel, (split_name, X_split, y_split, y_split_pred, y_split_prob) in zip(ax, split_views):
            sns.heatmap(confusion_matrix(y_split, y_split_pred), annot=True, fmt='d', cmap='Blues', ax=panel)
            panel.set_title(f'{clf_name} - Fold {fold_number} - {split_name}')
        plt.tight_layout()
        plt.show()

        # ROC Curves, train (left) and test (right)
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        for panel, (split_name, X_split, y_split, y_split_pred, y_split_prob) in zip(ax, split_views):
            RocCurveDisplay.from_estimator(best_model, X_split, y_split, ax=panel)
            panel.set_title(f"{clf_name} - ROC ({split_name}) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Record the test metrics for this classifier/fold
        metric_record = {
            "Classifier": clf_name,
            "Fold": fold_number,
            "Accuracy": test_metrics["acc"],
            "Precision": test_metrics["prec"],
            "Recall": test_metrics["rec"],
            "F1 Score": test_metrics["f1"],
            "AUC ROC": test_metrics["auc"],
        }
        for metric_name, metric_value in metric_record.items():
            fold_metrics[metric_name].append(metric_value)

        # Detailed log: one row per split
        for split_name, X_split, y_split, y_split_pred, y_split_prob in split_views:
            results_df = log_metrics_to_df(
                results_df, clf_name, fold_number, split_name,
                y_split, y_split_pred, X_split, y_split, y_split_prob,
            )

    fold_number += 1

# Save combined results
# results_df was re-initialised at the top of this cell, so it only holds the
# rows logged by this run. Writing it directly would overwrite the earlier
# classifiers' results in the CSV, so merge with what is already on disk first.
output_path = "churn_classifiers_output.csv"
try:
    previous_results = pd.read_csv(output_path)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Nothing usable on disk yet: save this run on its own.
    combined_results = results_df
else:
    # Drop stale rows of the classifiers logged by this run, so re-executing
    # the cell replaces their results instead of duplicating them.
    rerun_mask = previous_results["Classifier Name"].isin(results_df["Classifier Name"].unique())
    combined_results = pd.concat([previous_results[~rerun_mask], results_df], ignore_index=True)

combined_results.to_csv(output_path, index=False)
print("\n✅ All results (including XGBoost and GBM) saved to churn_classifiers_output.csv")


In [None]:
# Experiment 1: Added neural network
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, RocCurveDisplay
from sklearn.preprocessing import StandardScaler

# One dict of test-fold metrics per CV fold; converted to a DataFrame after the loop.
nn_results = []

# Globally scaled copy of X. It is NOT used for the cross-validation below: statistics
# fitted on all rows would leak test-fold information into training. The two names are
# kept only so that any other cell expecting `scaler` / `X_scaled` still finds them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

fold_number = 1
for train_index, test_index in skf.split(X, y):
    print(f"\n🧠 Neural Network - Fold {fold_number}")

    # Fit the scaler on this fold's training rows only, then reuse it for the test rows.
    # X is sliced with .iloc, the same way the scikit-learn CV cells of this notebook do.
    fold_scaler = StandardScaler()
    X_train_fold = fold_scaler.fit_transform(X.iloc[train_index])
    X_test_fold = fold_scaler.transform(X.iloc[test_index])
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Model definition (a fresh, untrained network for every fold)
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train_fold.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        # Was `Dropout(0,2)`: that passes rate=0 and noise_shape=2, i.e. no dropout at all.
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop once val_loss has not improved for 8 epochs and roll back to the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

    # Training (20% of the training fold is held out by Keras for validation)
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_split=0.2,
        epochs=50,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )

    print(f"Epochs trained: {len(history.history['loss'])}")

    # Predictions: sigmoid outputs thresholded at 0.5
    y_test_prob = model.predict(X_test_fold).flatten()
    y_test_pred = (y_test_prob > 0.5).astype(int)

    # Evaluation on the held-out fold
    acc = accuracy_score(y_test_fold, y_test_pred)
    prec = precision_score(y_test_fold, y_test_pred, zero_division=0)
    rec = recall_score(y_test_fold, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_fold, y_test_pred, zero_division=0)
    auc = roc_auc_score(y_test_fold, y_test_prob)

    print(f"→ Accuracy: {acc:.3f} | F1: {f1:.3f} | AUC: {auc:.3f}")

    # Log results
    nn_results.append({
        "Classifier": "NeuralNetwork",
        "Fold": fold_number,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC ROC": auc
    })

    # Confusion Matrix
    cm = confusion_matrix(y_test_fold, y_test_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"NN - Confusion Matrix - Fold {fold_number}")
    plt.show()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_test_fold, y_test_prob)
    plt.title(f"NN - ROC Curve - Fold {fold_number}")
    plt.show()

    # Training curves of THIS fold. They used to be drawn once after the loop, which
    # showed the last fold only because `history` is overwritten by every fit call.
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(f"NN - Training Curves - Fold {fold_number}")

    # Plot Accuracy
    ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
    ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax_acc.set_title('Accuracy Over Epochs')
    ax_acc.set_xlabel('Epoch')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()

    # Plot Loss
    ax_loss.plot(history.history['loss'], label='Train Loss')
    ax_loss.plot(history.history['val_loss'], label='Validation Loss')
    ax_loss.set_title('Loss Over Epochs')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    plt.tight_layout()
    plt.show()

    fold_number += 1

# Average the per-fold test metrics
import pandas as pd
nn_results_df = pd.DataFrame(nn_results)
print("\n📊 Neural Network Evaluation (Average Across Folds):")
print(nn_results_df.groupby("Classifier").mean(numeric_only=True).round(3))


In [None]:
# Plot the neural network architecture

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.utils import plot_model
import matplotlib.pyplot as plt

# Define the model architecture (no training).
# NOTE: this re-binds the notebook-level name `model` to an UNTRAINED network.
# NOTE(review): input width is hard-coded to 58 — confirm it matches the feature count of X.
model = Sequential([
    Dense(256, activation='relu', input_shape=(58,), name="Dense (256)"),
    BatchNormalization(name="BatchNorm"),
    Dropout(0.3, name="Dropout_0.3"),
    Dense(128, activation='relu', name="Dense (128)"),
    Dropout(0.2, name="Dropout_0.2"),
    Dense(64, activation='relu', name="Dense (64)"),
    Dense(1, activation='sigmoid', name="Output_Sigmoid")
])

# Plot and save the model architecture diagram
plot_model(model, to_file="nn_architecture.png", show_shapes=True, show_layer_names=True)

# Display the image in the notebook. A bare `display.Image(...)` expression is rendered
# only when it is the LAST expression of a cell; here another statement follows, so the
# image has to be displayed explicitly.
import IPython.display as display
display.display(display.Image("nn_architecture.png"))

# Persist the network defined above; since it was never fitted, the file holds
# freshly initialised weights.
model.save("nn_model.h5")

In [None]:
!pip install keras-tuner --upgrade

In [None]:
# Keras tuner parameter optimization for neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# `kerastuner` is the legacy import name of the keras-tuner package; use `keras_tuner`,
# the name the recall-tuning cell of this notebook already imports.
from keras_tuner import RandomSearch
from sklearn.preprocessing import StandardScaler

# Scale features (zero mean / unit variance per column, fitted on all rows of X).
# build_model below reads the feature count from X_scaled.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build model with tunable hyperparameters
def build_model(hp):
    """Return a compiled binary classifier whose shape is sampled from ``hp``.

    ``hp`` is the hyperparameter object handed in by the tuner; this function calls
    its ``Int``, ``Float`` and ``Choice`` methods. The input width comes from the
    notebook-level ``X_scaled``.

    Search space (requested in this order):
      * ``units_input``   64..512 step 64, ``dropout_input`` 0.0..0.5 step 0.1
      * ``num_layers``    1..5 hidden blocks, each with
        ``units_<i>`` 32..256 step 32 and ``dropout_<i>`` 0.0..0.5 step 0.1
      * ``learning_rate`` one of 1e-2, 1e-3, 5e-4, 1e-4 (Adam)

    The network ends in a single sigmoid unit and is compiled with binary
    cross-entropy, tracking ``accuracy``.
    """
    # Input block: Dense -> BatchNorm -> Dropout
    layer_stack = [
        Dense(
            units=hp.Int('units_input', min_value=64, max_value=512, step=64),
            activation='relu',
            input_shape=(X_scaled.shape[1],),
        ),
        BatchNormalization(),
        Dropout(hp.Float('dropout_input', 0.0, 0.5, step=0.1)),
    ]

    # Hidden blocks: the tuner picks how many (1 to 5), each one Dense -> Dropout
    hidden_count = hp.Int('num_layers', 1, 5)
    for idx in range(hidden_count):
        layer_stack.append(
            Dense(
                units=hp.Int(f'units_{idx}', min_value=32, max_value=256, step=32),
                activation='relu',
            )
        )
        layer_stack.append(Dropout(hp.Float(f'dropout_{idx}', 0.0, 0.5, step=0.1)))

    # Output: one sigmoid unit
    layer_stack.append(Dense(1, activation='sigmoid'))

    network = Sequential(layer_stack)
    network.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 5e-4, 1e-4])),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Random search over build_model's space: 15 trials, one training run each,
# ranked by validation accuracy. Trial state is stored under nn_tuning/churn_tuning.
tuner = RandomSearch(
    hypermodel=build_model,
    objective='val_accuracy',
    max_trials=15,
    executions_per_trial=1,
    directory='nn_tuning',
    project_name='churn_tuning',
)

# End a trial once val_loss has not improved for 5 epochs, restoring its best weights
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Launch the search; validation_split=0.2 holds out 20% of the rows for validation
tuner.search(
    X_scaled,
    y,
    validation_split=0.2,
    epochs=30,
    callbacks=[early_stop],
    verbose=1,
)

# Keep the winning model and its hyperparameters, then list every value
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparams = tuner.get_best_hyperparameters(1)[0]
print("\n✅ Best hyperparameters found:")
for param in best_hyperparams.values.keys():
    print(f"{param}: {best_hyperparams.get(param)}")


In [None]:
#NN   KERAS TUNER  PARAMETERS OPTIMIZATION (recall)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import keras_tuner as kt

from sklearn.preprocessing import StandardScaler

# Standardise the feature matrix; build_model below reads the feature count from X_scaled.
# NOTE(review): the statistics are fitted on all rows of X, including the rows that
# validation_split later holds out.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Build model with tunable hyperparameters
def build_model(hp):
    """Assemble and compile a sigmoid-output MLP from tuner-supplied hyperparameters.

    ``hp`` is the object the tuner passes in; ``Int``, ``Float`` and ``Choice`` are
    called on it. The number of input features is taken from the notebook-level
    ``X_scaled``.

    Unlike the accuracy-oriented variant defined earlier in this notebook, the only
    metric tracked during training is ``recall`` — the training history therefore has
    no ``accuracy`` entries. Redefining the function under the same name replaces
    that earlier variant for every later cell.
    """
    net = Sequential()

    # Input block
    input_units = hp.Int('units_input', min_value=64, max_value=512, step=64)
    input_dropout = hp.Float('dropout_input', 0.0, 0.5, step=0.1)
    net.add(Dense(units=input_units, activation='relu', input_shape=(X_scaled.shape[1],)))
    net.add(BatchNormalization())
    net.add(Dropout(input_dropout))

    # Between one and five hidden Dense -> Dropout blocks
    depth = hp.Int('num_layers', 1, 5)
    for level in range(depth):
        width = hp.Int(f'units_{level}', min_value=32, max_value=256, step=32)
        drop_rate = hp.Float(f'dropout_{level}', 0.0, 0.5, step=0.1)
        net.add(Dense(units=width, activation='relu'))
        net.add(Dropout(drop_rate))

    # Single-unit sigmoid head
    net.add(Dense(1, activation='sigmoid'))

    # Adam with a tunable step size; binary cross-entropy loss; recall as the metric
    step_size = hp.Choice('learning_rate', [1e-2, 1e-3, 5e-4, 1e-4])
    net.compile(
        optimizer=Adam(learning_rate=step_size),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.Recall(name='recall')],
    )
    return net

# Define tuner (objective: maximise validation recall).
# The project name must differ from the accuracy search ('churn_tuning'): with the same
# directory/project, keras-tuner reloads the finished trials found on disk, runs no new
# trial, and "best" would then refer to the accuracy search.
tuner = kt.RandomSearch(
    build_model,
    objective=kt.Objective("val_recall", direction="max"),
    max_trials=15,
    executions_per_trial=1,
    directory='nn_tuning',
    project_name='churn_tuning_recall'
)

# Early stopping: give up on a trial after 5 epochs without val_loss improvement
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Run search (20% of the rows are held out for validation)
tuner.search(
    X_scaled, y,
    epochs=30,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Retrieve and summarize best model
best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparams = tuner.get_best_hyperparameters(1)[0]
print("\n✅ Best hyperparameters found:")
for param in best_hyperparams.values:
    print(f"{param}: {best_hyperparams.get(param)}")


In [None]:
# ✅ NEURAL NETWORK EXP (RECALL-TUNED FULL PARAM SET ACROSS 4 FOLDS)

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# One dict of test-fold metrics per CV fold; converted to a DataFrame after the loop.
nn_results = []

# Globally scaled copy of X. It is NOT used for the cross-validation below: statistics
# fitted on all rows would leak test-fold information into training. The two names are
# kept because other cells read them (build_model uses X_scaled.shape[1]).
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Stratified K-Fold. NOTE: this re-binds the notebook-level `skf` to a 4-fold splitter,
# which every later cell calling skf.split will pick up.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
fold_number = 1

for train_index, test_index in skf.split(X, y):
    print(f"\n🧠 Neural Network - Fold {fold_number}")

    # Fit the scaler on this fold's training rows only, then reuse it for the test rows.
    # X is sliced with .iloc, the same way the scikit-learn CV cells of this notebook do.
    fold_scaler = StandardScaler()
    X_train_fold = fold_scaler.fit_transform(X.iloc[train_index])
    X_test_fold = fold_scaler.transform(X.iloc[test_index])
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Hard-coded architecture; the values are presumably copied from the recall-tuning
    # output — TODO confirm against the tuner log.
    model = Sequential()
    # Input block
    model.add(Dense(192, activation='relu', input_shape=(X_train_fold.shape[1],)))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

    # Hidden block 1 (Dropout(0.0) is a no-op kept to mirror the parameter list)
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.0))

    # Hidden blocks 2-5.
    # NOTE(review): these were previously commented as "not used but retained for
    # clarity", but they ARE added to the network and trained. If the tuner really
    # selected a single hidden layer, this model is deeper than the tuner's best model;
    # the "FullLayers" label logged below suggests that is deliberate — confirm.
    model.add(Dense(224, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(224, activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(192, activation='relu'))
    model.add(Dropout(0.1))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Recall(name='recall')]
    )

    # Stop once val_loss has not improved for 8 epochs and roll back to the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

    # Training (20% of the training fold is held out by Keras for validation)
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_split=0.2,
        epochs=50,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )

    # Predictions: sigmoid outputs thresholded at 0.5
    y_test_prob = model.predict(X_test_fold).flatten()
    y_test_pred = (y_test_prob > 0.5).astype(int)

    # Evaluation on the held-out fold
    acc = accuracy_score(y_test_fold, y_test_pred)
    prec = precision_score(y_test_fold, y_test_pred, zero_division=0)
    rec = recall_score(y_test_fold, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_fold, y_test_pred, zero_division=0)
    auc = roc_auc_score(y_test_fold, y_test_prob)

    print(f"→ Accuracy: {acc:.3f} | F1: {f1:.3f} | AUC: {auc:.3f}")

    # Log results
    nn_results.append({
        "Classifier": "NeuralNetwork_RecallTuned_FullLayers",
        "Fold": fold_number,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC ROC": auc
    })

    # Confusion Matrix
    cm = confusion_matrix(y_test_fold, y_test_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"NN - Confusion Matrix - Fold {fold_number}")
    plt.show()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_test_fold, y_test_prob)
    plt.title(f"NN - ROC Curve - Fold {fold_number}")
    plt.show()

    # Accuracy & Loss Plots for this fold
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Accuracy Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Loss Over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

    fold_number += 1

# Results summary: mean of each metric over the folds
nn_results_df = pd.DataFrame(nn_results)
print("\n📊 Neural Network Evaluation (Average Across Folds):")
print(nn_results_df.groupby("Classifier").mean(numeric_only=True).round(3))


In [None]:
#NN OPTIMIZED
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)

# Build the final model from the best trial of whichever `tuner` was created last.
# NOTE(review): when the notebook is run top to bottom that is the recall-oriented tuner,
# not an accuracy-oriented one — confirm which of the two is intended here.
best_hp = tuner.get_best_hyperparameters(1)[0]

final_model = build_model(best_hp)

# The recall-tuning build_model tracks only `recall`, so the training history would lack
# the 'accuracy' / 'val_accuracy' series plotted below (KeyError). Recompile with the
# optimizer build_model configured (keeps the tuned learning rate) and the same loss,
# tracking both metrics.
final_model.compile(
    optimizer=final_model.optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Recall(name='recall')]
)

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=8,
    restore_best_weights=True
)

# NOTE: X_train_fold / y_train_fold / X_test_fold / y_test_fold are not defined in this
# cell; they are whatever the most recently executed cross-validation loop left behind
# (i.e. its LAST fold), so this is a single-split evaluation.
history = final_model.fit(
    X_train_fold, y_train_fold,
    validation_split=0.2,
    epochs=50,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# Predict and evaluate (sigmoid outputs thresholded at 0.5)
y_test_prob = final_model.predict(X_test_fold).flatten()
y_test_pred = (y_test_prob > 0.5).astype(int)

# Compute metrics
acc = accuracy_score(y_test_fold, y_test_pred)
prec = precision_score(y_test_fold, y_test_pred, zero_division=0)
rec = recall_score(y_test_fold, y_test_pred, zero_division=0)
f1 = f1_score(y_test_fold, y_test_pred, zero_division=0)
auc = roc_auc_score(y_test_fold, y_test_prob)

print(f"\n✅ Final Model Evaluation:")
print(f"Accuracy: {acc:.3f} | Precision: {prec:.3f} | Recall: {rec:.3f} | F1: {f1:.3f} | AUC: {auc:.3f}")

# Confusion matrix
cm = confusion_matrix(y_test_fold, y_test_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title("Final Model - Confusion Matrix")
plt.show()

# ROC Curve
RocCurveDisplay.from_predictions(y_test_fold, y_test_prob)
plt.title("Final Model - ROC Curve")
plt.show()

# Plot training history
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Val Accuracy')
plt.title("Accuracy Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Val Loss')
plt.title("Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Experiment 2: Added cost-sensitive learning (class weighting)

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.stats import randint, uniform


# Start from empty logs: `results_df` receives one row per classifier / fold / split,
# `fold_metrics` collects the test-fold scores column by column.
results_df = pd.DataFrame(
    data=None,
    columns=[
        "Classifier Name",
        "Fold",
        "TrainOrTest",
        "Num Train Samples",
        "Num Churned in Train",
        "TP",
        "TN",
        "FP",
        "FN",
        "ROC-AUC",
    ],
)

# Each key gets its own, independent empty list
fold_metrics = {
    column: []
    for column in ("Classifier", "Fold", "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC")
}

# Logging function
def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Append one confusion-matrix / ROC-AUC row to ``df`` and return the extended frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Log accumulated so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred
        True and predicted labels, with 0 as the negative and 1 as the positive class.
    X_data, y_data
        Only ``len(X_data)`` and the number of entries of ``y_data`` equal to 1 are
        logged. The target columns are named "... Train ..." even when a test split
        is passed in.
    y_prob
        Scores for class 1, or ``None`` when the model provides none.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one row. "ROC-AUC" is ``None`` when ``y_prob`` is ``None`` or
        ``y_true`` contains a single class (the score is undefined in that case).
    """
    # labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
    # y_true and y_pred; otherwise ravel() yields one value and the unpacking raises.
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # roc_auc_score raises when y_true holds only one class, so skip it then.
    both_classes_present = len(np.unique(y_true)) > 1
    roc_auc = roc_auc_score(y_true, y_prob) if (y_prob is not None and both_classes_present) else None

    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of Python-level sum() over every element
        "Num Churned in Train": int(np.sum(np.asarray(y_data) == 1)),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Registry: display name -> (unfitted estimator, search space for RandomizedSearchCV).
# LogReg, DecisionTree and RandomForest use class_weight='balanced'; the other four
# estimators are left unweighted. LDA gets a list of two spaces (svd vs. lsqr/eigen).
classifiers_and_params = {
    "kNN": (
        KNeighborsClassifier(),
        dict(
            n_neighbors=randint(1, 50),
            weights=["uniform", "distance"],
            p=[1, 2],
        ),
    ),
    "LDA": (
        LinearDiscriminantAnalysis(),
        [
            dict(solver=["svd"], shrinkage=[None], tol=uniform(1e-5, 1e-2)),
            dict(solver=["lsqr", "eigen"], shrinkage=[None, "auto"], tol=uniform(1e-5, 1e-2)),
        ],
    ),
    "LogReg": (
        LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
        dict(
            C=uniform(0.01, 10),
            penalty=["l1", "l2"],
            solver=["liblinear", "saga"],
        ),
    ),
    "DecisionTree": (
        DecisionTreeClassifier(class_weight='balanced', random_state=42),
        dict(
            max_depth=randint(1, 20),
            min_samples_split=randint(2, 10),
            min_samples_leaf=randint(1, 10),
            criterion=["gini", "entropy"],
        ),
    ),
    "RandomForest": (
        RandomForestClassifier(class_weight='balanced', random_state=42),
        dict(
            n_estimators=randint(50, 300),
            max_depth=randint(1, 20),
            min_samples_split=randint(2, 10),
            min_samples_leaf=randint(1, 10),
            criterion=["gini", "entropy"],
        ),
    ),
    "NaiveBayes": (
        GaussianNB(),
        dict(var_smoothing=uniform(1e-9, 1e-5)),
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        dict(
            n_estimators=randint(50, 300),
            learning_rate=uniform(0.01, 0.5),
        ),
    ),
}
# Outer CV loop: for every fold, tune each registered classifier on the training
# partition, score it on the held-out partition, plot and log the outcome.
fold_number = 1
for train_index, test_index in skf.split(X, y):
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in classifiers_and_params.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # Inner 3-fold randomized search (10 candidates) ranked by F1
        random_search = RandomizedSearchCV(
            clf_base,
            param_dist,
            n_iter=10,
            scoring="f1",
            cv=3,
            random_state=42,
            n_jobs=-1,
        )
        best_model = random_search.fit(X_train_fold, y_train_fold).best_estimator_
        print("Best params:", random_search.best_params_)

        # Hard predictions for both partitions
        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Class-1 probabilities, when the estimator can provide them
        if hasattr(best_model, "predict_proba"):
            y_train_prob = best_model.predict_proba(X_train_fold)[:, 1]
            y_test_prob = best_model.predict_proba(X_test_fold)[:, 1]
        else:
            y_train_prob = y_test_prob = None

        # Evaluation on the held-out partition (AUC falls back to 0.0 without probabilities)
        test_metrics = {"acc": accuracy_score(y_test_fold, y_test_pred)}
        for metric_key, metric_fn in (("prec", precision_score), ("rec", recall_score), ("f1", f1_score)):
            test_metrics[metric_key] = metric_fn(y_test_fold, y_test_pred, zero_division=0)
        test_metrics["auc"] = 0.0 if y_test_prob is None else roc_auc_score(y_test_fold, y_test_prob)

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion matrices: train on the left, test on the right
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        for panel, labels_true, labels_pred, panel_title in (
            (ax[0], y_train_fold, y_train_pred, f'{clf_name} - Fold {fold_number} - Train'),
            (ax[1], y_test_fold, y_test_pred, f'{clf_name} - Fold {fold_number} - Test'),
        ):
            sns.heatmap(confusion_matrix(labels_true, labels_pred), annot=True, fmt='d', cmap='Blues', ax=panel)
            panel.set_title(panel_title)
        plt.tight_layout()
        plt.show()

        # ROC curves (a panel stays empty when no probabilities are available)
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        if y_train_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
            ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        if y_test_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
            ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Column-wise log of the test scores
        for column, value in (
            ("Classifier", clf_name),
            ("Fold", fold_number),
            ("Accuracy", test_metrics["acc"]),
            ("Precision", test_metrics["prec"]),
            ("Recall", test_metrics["rec"]),
            ("F1 Score", test_metrics["f1"]),
            ("AUC ROC", test_metrics["auc"]),
        ):
            fold_metrics[column].append(value)

        # Row-wise log of confusion counts, train partition first
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Per-classifier mean of the test scores over all folds
print("\n📊 Final Summary by Classifier (Average Across Folds):")
summary_df = pd.DataFrame(fold_metrics)
grouped = summary_df.groupby("Classifier")
print(grouped.mean(numeric_only=True).round(3))

# Persist the row-wise log
results_df.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results saved to churn_classifiers_output.csv")



In [None]:
# Experiment 2: Added 2 classifiers (boosters)

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)


from scipy.stats import randint, uniform

# Fresh, empty logs for this cell: `results_df` takes one row per classifier / fold /
# split, `fold_metrics` gathers the test-fold scores column by column.
results_df = pd.DataFrame(
    data=None,
    columns=[
        "Classifier Name",
        "Fold",
        "TrainOrTest",
        "Num Train Samples",
        "Num Churned in Train",
        "TP",
        "TN",
        "FP",
        "FN",
        "ROC-AUC",
    ],
)

# Each key gets its own, independent empty list
fold_metrics = {
    column: []
    for column in ("Classifier", "Fold", "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC")
}

def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Append one confusion-matrix / ROC-AUC row to ``df`` and return the extended frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Log accumulated so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred
        True and predicted labels, with 0 as the negative and 1 as the positive class.
    X_data, y_data
        Only ``len(X_data)`` and the number of entries of ``y_data`` equal to 1 are
        logged. The target columns are named "... Train ..." even when a test split
        is passed in.
    y_prob
        Scores for class 1, or ``None`` when the model provides none.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one row. "ROC-AUC" is ``None`` when ``y_prob`` is ``None`` or
        ``y_true`` contains a single class (the score is undefined in that case).
    """
    # labels=[0, 1] pins the matrix to 2x2 even if one class is missing from both
    # y_true and y_pred; otherwise ravel() yields one value and the unpacking raises.
    TN, FP, FN, TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    # roc_auc_score raises when y_true holds only one class, so skip it then.
    both_classes_present = len(np.unique(y_true)) > 1
    roc_auc = roc_auc_score(y_true, y_prob) if (y_prob is not None and both_classes_present) else None

    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of Python-level sum() over every element
        "Num Churned in Train": int(np.sum(np.asarray(y_data) == 1)),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)
   # Estimate class weight for scale_pos_weight
# Ratio of negative to positive labels over all of y; passed to XGBoost as
# scale_pos_weight so that errors on the positive class weigh more.
neg, pos = (y == 0).sum(), (y == 1).sum()
scale_pos_weight = neg / pos

# Registry: display name -> (unfitted estimator, search space for RandomizedSearchCV).
extra_classifiers = {

    "XGBoost": (
        # `use_label_encoder` is no longer passed: current XGBoost releases do not
        # have that option any more and only warn that the parameter is unused.
        XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,  # Parallelization
            scale_pos_weight=scale_pos_weight  # Handles class imbalance
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "reg_alpha": uniform(0.1, 1.0),    # L1 regularization
            "reg_lambda": uniform(1.0, 3.0)    # L2 regularization
        }
    ),
    # NOTE: no class weighting is configured for GradientBoosting in this cell.
    "GradientBoosting": (
        GradientBoostingClassifier(
            random_state=42
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "min_samples_split": randint(2, 10),  # Regularization
            "min_samples_leaf": randint(1, 10),   # Regularization
            # "auto" was dropped from the list: recent scikit-learn rejects it for this
            # estimator, which makes every sampled candidate that uses it fail.
            "max_features": ["sqrt", "log2"]  # Feature usage regularization
        }
    )
}
import os

fold_number = 1
# Outer CV loop over the boosting models: tune on the training partition, score on the
# held-out partition, plot and log.
for train_index, test_index in skf.split(X, y):
    X_train_fold = X.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    y_train_fold = y.iloc[train_index]
    y_test_fold = y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in extra_classifiers.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # Inner 3-fold randomized search (10 candidates) ranked by F1
        random_search = RandomizedSearchCV(
            estimator=clf_base,
            param_distributions=param_dist,
            n_iter=10,
            scoring="f1",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Probability of class 1 for both partitions
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1]
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1]

        # Metrics on the held-out partition
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": roc_auc_score(y_test_fold, y_test_prob)
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion matrices
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f'{clf_name} - Fold {fold_number} - Train')

        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f'{clf_name} - Fold {fold_number} - Test')
        plt.tight_layout()
        plt.show()

        # ROC Curves
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
        ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
        ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Column-wise log of the test scores
        fold_metrics["Classifier"].append(clf_name)
        fold_metrics["Fold"].append(fold_number)
        fold_metrics["Accuracy"].append(test_metrics["acc"])
        fold_metrics["Precision"].append(test_metrics["prec"])
        fold_metrics["Recall"].append(test_metrics["rec"])
        fold_metrics["F1 Score"].append(test_metrics["f1"])
        fold_metrics["AUC ROC"].append(test_metrics["auc"])

        # Row-wise log of confusion counts, train partition first
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Save combined results. `results_df` was re-initialised in this cell and holds booster
# rows only, so writing it directly would wipe the rows the base-classifier cell saved
# to the same file. Merge with the file on disk instead.
# NOTE(review): whatever CSV of this name exists gets merged — run the base-classifier
# cell of the same experiment first.
output_path = "churn_classifiers_output.csv"
combined_results = results_df
if os.path.exists(output_path):
    previous_results = pd.read_csv(output_path)
    if "Classifier Name" in previous_results.columns:
        # Discard booster rows left by an earlier run of this cell, so that re-running
        # it does not duplicate them.
        is_booster_row = previous_results["Classifier Name"].isin(list(extra_classifiers))
        combined_results = pd.concat([previous_results[~is_booster_row], results_df], ignore_index=True)
combined_results.to_csv(output_path, index=False)
print("\n✅ All results (including XGBoost and GBM) saved to churn_classifiers_output.csv")


In [None]:
# Experiment 2 Neural Network
# NEURAL NETWORK EXP2 – Cost-Sensitive Learning (Class Weighting)
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# One dict of test-fold metrics per CV fold; converted to a DataFrame after the loop.
nn_results = []

# Globally scaled copy of X. It is NOT used for the cross-validation below: statistics
# fitted on all rows would leak test-fold information into training. The two names are
# kept only so that any other cell expecting `scaler` / `X_scaled` still finds them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

fold_number = 1
for train_index, test_index in skf.split(X, y):
    print(f"\n🧠 Neural Network (Class Weighted) - Fold {fold_number}")

    # Fit the scaler on this fold's training rows only, then reuse it for the test rows.
    # X is sliced with .iloc, the same way the scikit-learn CV cells of this notebook do.
    fold_scaler = StandardScaler()
    X_train_fold = fold_scaler.fit_transform(X.iloc[train_index])
    X_test_fold = fold_scaler.transform(X.iloc[test_index])
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Compute 'balanced' class weights from the labels of the training fold
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.array([0, 1]),
        y=y_train_fold
    )
    class_weights_dict = {0: class_weights[0], 1: class_weights[1]}
    print(f"Class Weights: {class_weights_dict}")

    # Model definition (a fresh, untrained network for every fold)
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train_fold.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop once val_loss has not improved for 8 epochs and roll back to the best weights.
    early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

    # Training: the loss of each sample is weighted by the weight of its class
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_split=0.2,
        epochs=50,
        batch_size=64,
        class_weight=class_weights_dict,
        callbacks=[early_stop],
        verbose=0
    )

    print(f"Epochs trained: {len(history.history['loss'])}")

    # Predictions: sigmoid outputs thresholded at 0.5
    y_test_prob = model.predict(X_test_fold).flatten()
    y_test_pred = (y_test_prob > 0.5).astype(int)

    # Evaluation on the held-out fold
    acc = accuracy_score(y_test_fold, y_test_pred)
    prec = precision_score(y_test_fold, y_test_pred, zero_division=0)
    rec = recall_score(y_test_fold, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_fold, y_test_pred, zero_division=0)
    auc = roc_auc_score(y_test_fold, y_test_prob)

    print(f"→ Accuracy: {acc:.3f} | F1: {f1:.3f} | AUC: {auc:.3f}")

    # Log results
    nn_results.append({
        "Classifier": "NeuralNetwork_Weighted",
        "Fold": fold_number,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC ROC": auc
    })

    # Confusion Matrix
    cm = confusion_matrix(y_test_fold, y_test_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"NN (Weighted) - Confusion Matrix - Fold {fold_number}")
    plt.show()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_test_fold, y_test_prob)
    plt.title(f"NN (Weighted) - ROC Curve - Fold {fold_number}")
    plt.show()

    # Training curves of THIS fold. They used to be drawn once after the loop, which
    # showed the last fold only because `history` is overwritten by every fit call.
    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(f"NN (Weighted) - Training Curves - Fold {fold_number}")

    ax_acc.plot(history.history['accuracy'], label='Train Accuracy')
    ax_acc.plot(history.history['val_accuracy'], label='Validation Accuracy')
    ax_acc.set_title('Accuracy Over Epochs')
    ax_acc.set_xlabel('Epoch')
    ax_acc.set_ylabel('Accuracy')
    ax_acc.legend()

    ax_loss.plot(history.history['loss'], label='Train Loss')
    ax_loss.plot(history.history['val_loss'], label='Validation Loss')
    ax_loss.set_title('Loss Over Epochs')
    ax_loss.set_xlabel('Epoch')
    ax_loss.set_ylabel('Loss')
    ax_loss.legend()

    plt.tight_layout()
    plt.show()

    fold_number += 1

# Average the per-fold test metrics
nn_results_df = pd.DataFrame(nn_results)
print("\n📊 Neural Network (Weighted) Evaluation (Average Across Folds):")
print(nn_results_df.groupby("Classifier").mean(numeric_only=True).round(3))


In [None]:
#Experiment 3
#Scoring=recall as hyperparameter optimization metric
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.stats import randint, uniform


# Initialize results storage
# results_df: empty confusion-matrix log, later grown by log_metrics_to_df with
# one row per (classifier, fold, Train/Test) combination.
results_df = pd.DataFrame(
    columns=(
        ["Classifier Name", "Fold", "TrainOrTest"]
        + ["Num Train Samples", "Num Churned in Train"]
        + ["TP", "TN", "FP", "FN"]
        + ["ROC-AUC"]
    )
)

# fold_metrics: column-oriented store of test-fold scores; every key gets its
# own fresh list that the experiment loop appends to.
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

# Logging function
def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return ``df`` extended by one row of confusion-matrix counts and ROC-AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Results collected so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred : array-like
        True and predicted binary labels, encoded as 0 / 1.
    X_data, y_data : array-like
        Features and labels of the split being logged. Despite the column
        names "Num Train Samples" / "Num Churned in Train", the values describe
        whichever split is passed in (the callers also pass the test split).
    y_prob : array-like or None
        Positive-class scores; when None the "ROC-AUC" cell is left as None.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new row appended and the index renumbered.
    """
    # Pin the label order: without it a split in which only one class occurs
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of iterating the labels with builtin sum().
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Classifiers (some with class_weight balanced)
# Maps a display name to (unfitted estimator, search space for RandomizedSearchCV).
# scipy conventions used below: randint(low, high) draws integers in
# [low, high - 1]; uniform(loc, scale) draws floats in [loc, loc + scale].
classifiers_and_params = {
    "kNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": randint(1, 200),
            "weights": ["uniform", "distance"],
            "p": [1, 2]
        }
    ),
    # LDA uses a list of two spaces so that the "svd" solver is only ever
    # paired with shrinkage=None, while "lsqr"/"eigen" may also try "auto".
    "LDA": (
        LinearDiscriminantAnalysis(),
        [
            {"solver": ["svd"], "shrinkage": [None], "tol": uniform(1e-5, 1e-2)},
            {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto"], "tol": uniform(1e-5, 1e-2)}
        ]
    ),
    "LogReg": (
        LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
        {
            "C": uniform(0.01, 10),
            "penalty": ["l1", "l2"],
            "solver": ["liblinear", "saga"]
        }
    ),
    "DecisionTree": (
        DecisionTreeClassifier(random_state=42, class_weight='balanced'),
        {
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42, class_weight='balanced'),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    # kNN, LDA, NaiveBayes and AdaBoost are constructed without class_weight,
    # so they run unweighted in this experiment.
    "NaiveBayes": (
        GaussianNB(),
        {
            "var_smoothing": uniform(1e-9, 1e-5)
        }
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.5)
        }
    )
}
# Begin loop
# Outer cross-validation: for every fold produced by `skf`, tune each classifier
# on the training part (recall-scored randomized search with 3 inner folds),
# then evaluate the refitted best model on the training and test parts.
fold_number = 1
for train_index, test_index in skf.split(X, y):
    X_train_fold = X.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    y_train_fold = y.iloc[train_index]
    y_test_fold = y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in classifiers_and_params.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # 30 sampled configurations, each scored by recall on 3 inner folds.
        random_search = RandomizedSearchCV(
            estimator=clf_base,
            param_distributions=param_dist,
            n_iter=30,
            scoring="recall",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities, or None for estimators without predict_proba.
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1] if hasattr(best_model, "predict_proba") else None
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1] if hasattr(best_model, "predict_proba") else None

        # Evaluation
        # NOTE(review): an estimator without predict_proba would be recorded with
        # AUC 0.0, which lowers the averaged AUC in the summary; NaN would be
        # skipped by mean() instead.
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": roc_auc_score(y_test_fold, y_test_prob) if y_test_prob is not None else 0.0
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion Matrices
        # Train on the left axis, test on the right, for a quick overfitting check.
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f'{clf_name} - Fold {fold_number} - Train')

        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f'{clf_name} - Fold {fold_number} - Test')
        plt.tight_layout()
        plt.show()

        # ROC Curve
        # An axis stays empty when the estimator exposes no probabilities.
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        if y_train_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
            ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        if y_test_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
            ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Save metrics
        # Only test-fold scores feed the cross-validated summary below.
        fold_metrics["Classifier"].append(clf_name)
        fold_metrics["Fold"].append(fold_number)
        fold_metrics["Accuracy"].append(test_metrics["acc"])
        fold_metrics["Precision"].append(test_metrics["prec"])
        fold_metrics["Recall"].append(test_metrics["rec"])
        fold_metrics["F1 Score"].append(test_metrics["f1"])
        fold_metrics["AUC ROC"].append(test_metrics["auc"])

        # Detailed logs
        # One row for the training split and one for the test split.
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Summary
# Mean of every numeric column per classifier; "Fold" is numeric as well, so
# its average appears in the printed table.
print("\n📊 Final Summary by Classifier (Average Across Folds):")
summary_df = pd.DataFrame(fold_metrics)
grouped = summary_df.groupby("Classifier")
print(grouped.mean(numeric_only=True).round(3))

# Save results
# NOTE(review): later cells write to this same file name, so this output is
# replaced when they run.
results_df.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results saved to churn_classifiers_output.csv")



In [None]:
# Experiment 3 boosting classifiers

from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import uniform, randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)


from scipy.stats import randint, uniform

# Initialize logs
# results_df: empty confusion-matrix log, later grown by log_metrics_to_df with
# one row per (classifier, fold, Train/Test) combination.
results_df = pd.DataFrame(
    columns=(
        ["Classifier Name", "Fold", "TrainOrTest"]
        + ["Num Train Samples", "Num Churned in Train"]
        + ["TP", "TN", "FP", "FN"]
        + ["ROC-AUC"]
    )
)

# fold_metrics: column-oriented store of test-fold scores; every key gets its
# own fresh list that the experiment loop appends to.
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return ``df`` extended by one row of confusion-matrix counts and ROC-AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Results collected so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred : array-like
        True and predicted binary labels, encoded as 0 / 1.
    X_data, y_data : array-like
        Features and labels of the split being logged. Despite the column
        names "Num Train Samples" / "Num Churned in Train", the values describe
        whichever split is passed in (the callers also pass the test split).
    y_prob : array-like or None
        Positive-class scores; when None the "ROC-AUC" cell is left as None.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new row appended and the index renumbered.
    """
    # Pin the label order: without it a split in which only one class occurs
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of iterating the labels with builtin sum().
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)
# Estimate the negative-to-positive class ratio used as XGBoost's scale_pos_weight
# Ratio of non-churners to churners over the full label vector; XGBoost uses it
# to up-weight the positive (churn) class.
neg, pos = (y == 0).sum(), (y == 1).sum()
scale_pos_weight = neg / pos

# Maps a display name to (unfitted estimator, search space for RandomizedSearchCV).
# scipy conventions: randint(low, high) draws integers in [low, high - 1];
# uniform(loc, scale) draws floats in [loc, loc + scale].
extra_classifiers = {

    "XGBoost": (
        # `use_label_encoder` is no longer passed: it is deprecated in XGBoost
        # and only produces an "unused parameter" warning on current releases.
        XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1,  # Parallelization
            scale_pos_weight=scale_pos_weight  # Handles class imbalance
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "reg_alpha": uniform(0.1, 1.0),    # L1 regularization
            "reg_lambda": uniform(1.0, 3.0)    # L2 regularization
        }
    ),
    "GradientBoosting": (
        GradientBoostingClassifier(
            random_state=42
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "min_samples_split": randint(2, 10),  # Regularization
            "min_samples_leaf": randint(1, 10),   # Regularization
            # "auto" is no longer an accepted value in current scikit-learn, so
            # every candidate that sampled it failed to fit. None (use all
            # features) keeps three options in the search space.
            "max_features": [None, "sqrt", "log2"]  # Feature usage regularization
        }
    )
}
fold_number = 1
# New experiments loop
# Same protocol as the other Experiment 3 cell, applied to the boosting models:
# recall-scored randomized search on the training part of each outer fold,
# followed by evaluation of the refitted best model.
for train_index, test_index in skf.split(X, y):
    X_train_fold = X.iloc[train_index]
    X_test_fold = X.iloc[test_index]
    y_train_fold = y.iloc[train_index]
    y_test_fold = y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in extra_classifiers.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # 30 sampled configurations, each scored by recall on 3 inner folds.
        random_search = RandomizedSearchCV(
            estimator=clf_base,
            param_distributions=param_dist,
            n_iter=30,
            scoring="recall",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities; predict_proba is called unconditionally
        # for the models in extra_classifiers.
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1]
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1]

        # Metrics
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": roc_auc_score(y_test_fold, y_test_prob)
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion matrices
        # Train on the left axis, test on the right.
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f'{clf_name} - Fold {fold_number} - Train')

        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f'{clf_name} - Fold {fold_number} - Test')
        plt.tight_layout()
        plt.show()

        # ROC Curves
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
        ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
        ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Save metrics (in memory only; nothing is written to disk here)
        fold_metrics["Classifier"].append(clf_name)
        fold_metrics["Fold"].append(fold_number)
        fold_metrics["Accuracy"].append(test_metrics["acc"])
        fold_metrics["Precision"].append(test_metrics["prec"])
        fold_metrics["Recall"].append(test_metrics["rec"])
        fold_metrics["F1 Score"].append(test_metrics["f1"])
        fold_metrics["AUC ROC"].append(test_metrics["auc"])

        # Detailed logs: one row for the training split, one for the test split.
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Save combined results
# results_df was re-initialised at the top of this cell, so writing it directly
# would replace the rows saved by the previous Experiment 3 cell with the
# boosting rows only. Merge with what is already on disk instead.
# NOTE(review): this assumes the existing file holds Experiment 3 output; other
# experiment cells write to the same file name.
try:
    previous_results = pd.read_csv("churn_classifiers_output.csv")
except (FileNotFoundError, pd.errors.EmptyDataError):
    previous_results = None

if previous_results is not None and "Classifier Name" in previous_results.columns:
    # Drop stale rows of the classifiers logged in this cell so that re-running
    # the cell does not duplicate them.
    is_stale = previous_results["Classifier Name"].isin(results_df["Classifier Name"])
    combined_results = pd.concat([previous_results[~is_stale], results_df], ignore_index=True)
else:
    combined_results = results_df

combined_results.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results (including XGBoost and GBM) saved to churn_classifiers_output.csv")


In [None]:
# Experiment 4 - SMOTE
# Create synthetic data for minority class in train


from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
smote = SMOTE(random_state=42)
# Initialize StratifiedKFold
# This rebinds the kernel-wide `skf`: the Experiment 4 cells further down call
# skf.split(...) and therefore run on this shuffled 4-fold splitter.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Report-only loop: nothing is trained here. It shows the class balance of each
# training fold after SMOTE next to the untouched test fold.
# The loop variables (X_train, y_train, ...) are notebook globals and keep the
# values of the last fold once the loop ends.
for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # 🟡 Apply SMOTE ONLY to the training set
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # Report distribution
    # value_counts() is indexed by class label, so [0] / [1] below are label
    # lookups, not positions.
    train_counts = y_train.value_counts()
    train_percentages = y_train.value_counts(normalize=True) * 100

    test_counts = y_test.value_counts()
    test_percentages = y_test.value_counts(normalize=True) * 100

    print(f"Fold {fold + 1}:")
    print("Training set after SMOTE:")
    print(f"  Not Churned (0): {train_counts[0]} ({train_percentages[0]:.2f}%)")
    print(f"  Churned (1): {train_counts[1]} ({train_percentages[1]:.2f}%)")
    print("Testing set (original distribution):")
    print(f"  Not Churned (0): {test_counts[0]} ({test_percentages[0]:.2f}%)")
    print(f"  Churned (1): {test_counts[1]} ({test_percentages[1]:.2f}%)")
    print("-" * 40)


In [None]:
!pip install imbalanced-learn


In [None]:
# Experiment 4 - SMOTE

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.stats import randint, uniform

# Initialize results storage
# results_df: empty confusion-matrix log, later grown by log_metrics_to_df with
# one row per (classifier, fold, Train/Test) combination.
results_df = pd.DataFrame(
    columns=(
        ["Classifier Name", "Fold", "TrainOrTest"]
        + ["Num Train Samples", "Num Churned in Train"]
        + ["TP", "TN", "FP", "FN"]
        + ["ROC-AUC"]
    )
)

# fold_metrics: column-oriented store of test-fold scores; every key gets its
# own fresh list that the experiment loop appends to.
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

# Logging function
def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return ``df`` extended by one row of confusion-matrix counts and ROC-AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Results collected so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred : array-like
        True and predicted binary labels, encoded as 0 / 1.
    X_data, y_data : array-like
        Features and labels of the split being logged. Despite the column
        names "Num Train Samples" / "Num Churned in Train", the values describe
        whichever split is passed in (the callers also pass the test split).
    y_prob : array-like or None
        Positive-class scores; when None the "ROC-AUC" cell is left as None.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new row appended and the index renumbered.
    """
    # Pin the label order: without it a split in which only one class occurs
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of iterating the labels with builtin sum().
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Classifiers (none uses class_weight here: the loop below balances each
# training fold with SMOTE instead)
# Maps a display name to (unfitted estimator, search space for RandomizedSearchCV).
# scipy conventions used below: randint(low, high) draws integers in
# [low, high - 1]; uniform(loc, scale) draws floats in [loc, loc + scale].
classifiers_and_params = {
    "kNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": randint(1, 50),
            "weights": ["uniform", "distance"],
            "p": [1, 2]
        }
    ),
    # LDA uses a list of two spaces so that the "svd" solver is only ever
    # paired with shrinkage=None, while "lsqr"/"eigen" may also try "auto".
    "LDA": (
        LinearDiscriminantAnalysis(),
        [
            {"solver": ["svd"], "shrinkage": [None], "tol": uniform(1e-5, 1e-2)},
            {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto"], "tol": uniform(1e-5, 1e-2)}
        ]
    ),
    "LogReg": (
        LogisticRegression(random_state=42, max_iter=1000),
        {
            "C": uniform(0.01, 10),
            "penalty": ["l1", "l2"],
            "solver": ["liblinear", "saga"]
        }
    ),
    "DecisionTree": (
        DecisionTreeClassifier(random_state=42),
        {
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    "NaiveBayes": (
        GaussianNB(),
        {
            "var_smoothing": uniform(1e-9, 1e-5)
        }
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.5)
        }
    )
}
# Begin loop
# SMOTE is imported here because this cell uses it without importing it itself.
# The imblearn Pipeline runs sampler steps during fit only, which lets SMOTE sit
# inside the hyper-parameter search without touching any validation data.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

fold_number = 1
for train_index, test_index in skf.split(X, y):
    X_train_raw, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_raw, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    # Apply SMOTE only on training data
    # This resampled copy equals the data the final model of every search is
    # refitted on (same input, same SMOTE seed); it is used for the train-side
    # reports below, never for the search itself.
    smote = SMOTE(random_state=42)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_raw, y_train_raw)


    for clf_name, (clf_base, param_dist) in classifiers_and_params.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # Searching on already-resampled data would put synthetic samples,
        # interpolated from inner-training points, into the inner validation
        # splits and bias model selection. Wrapping SMOTE and the classifier in
        # a pipeline resamples only the fitting part of each inner split.
        search_spaces = [param_dist] if isinstance(param_dist, dict) else param_dist
        pipeline_param_dist = [
            {f"clf__{name}": dist for name, dist in space.items()}
            for space in search_spaces
        ]
        random_search = RandomizedSearchCV(
            estimator=ImbPipeline([
                ("smote", SMOTE(random_state=42)),
                ("clf", clf_base)
            ]),
            param_distributions=pipeline_param_dist,
            n_iter=10,
            scoring="f1",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_raw, y_train_raw)
        # Unwrap the fitted classifier so the rest of the cell keeps working
        # with a plain estimator, and print parameter names without the prefix.
        best_model = random_search.best_estimator_.named_steps["clf"]
        best_params = {name.split("__", 1)[1]: value for name, value in random_search.best_params_.items()}
        print("Best params:", best_params)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities, or None for estimators without predict_proba.
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1] if hasattr(best_model, "predict_proba") else None
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1] if hasattr(best_model, "predict_proba") else None

        # Evaluation (test fold keeps its original class distribution)
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": roc_auc_score(y_test_fold, y_test_prob) if y_test_prob is not None else 0.0
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion Matrices
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f'{clf_name} - Fold {fold_number} - Train')

        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f'{clf_name} - Fold {fold_number} - Test')
        plt.tight_layout()
        plt.show()

        # ROC Curve
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        if y_train_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
            ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        if y_test_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
            ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Save metrics
        fold_metrics["Classifier"].append(clf_name)
        fold_metrics["Fold"].append(fold_number)
        fold_metrics["Accuracy"].append(test_metrics["acc"])
        fold_metrics["Precision"].append(test_metrics["prec"])
        fold_metrics["Recall"].append(test_metrics["rec"])
        fold_metrics["F1 Score"].append(test_metrics["f1"])
        fold_metrics["AUC ROC"].append(test_metrics["auc"])

        # Detailed logs (the "Train" row describes the SMOTE-resampled fold)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Summary
print("\n📊 Final Summary by Classifier (Average Across Folds):")
summary_df = pd.DataFrame(fold_metrics)
grouped = summary_df.groupby("Classifier")
print(grouped.mean(numeric_only=True).round(3))

# Save results
# NOTE(review): other experiment cells write to this same file name, so each
# run replaces the previous experiment's output.
results_df.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results saved to churn_classifiers_output.csv")



In [None]:
# Experiment 4: Added 2 classifiers (boosters)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.stats import randint, uniform

# Initialize results storage
# results_df: empty confusion-matrix log, later grown by log_metrics_to_df with
# one row per (classifier, fold, Train/Test) combination.
results_df = pd.DataFrame(
    columns=(
        ["Classifier Name", "Fold", "TrainOrTest"]
        + ["Num Train Samples", "Num Churned in Train"]
        + ["TP", "TN", "FP", "FN"]
        + ["ROC-AUC"]
    )
)

# fold_metrics: column-oriented store of test-fold scores; every key gets its
# own fresh list that the experiment loop appends to.
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

# Logging function
def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return ``df`` extended by one row of confusion-matrix counts and ROC-AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Results collected so far. It is not modified; a new frame is returned.
    classifier_name, fold_number, dataset_type
        Stored verbatim in the "Classifier Name", "Fold" and "TrainOrTest" columns.
    y_true, y_pred : array-like
        True and predicted binary labels, encoded as 0 / 1.
    X_data, y_data : array-like
        Features and labels of the split being logged. Despite the column
        names "Num Train Samples" / "Num Churned in Train", the values describe
        whichever split is passed in (the callers also pass the test split).
    y_prob : array-like or None
        Positive-class scores; when None the "ROC-AUC" cell is left as None.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new row appended and the index renumbered.
    """
    # Pin the label order: without it a split in which only one class occurs
    # yields a 1x1 matrix and the four-way unpacking below raises ValueError.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()
    roc_auc = roc_auc_score(y_true, y_prob) if y_prob is not None else None
    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of iterating the labels with builtin sum().
        "Num Churned in Train": int((y_data == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    return pd.concat([df, pd.DataFrame([row])], ignore_index=True)

# Estimate the negative-to-positive class ratio (scale_pos_weight)
# This cell uses both boosting classes without importing them, so import them
# here rather than relying on an earlier cell having been run.
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# The ratio is still computed so the kernel variable keeps existing, but it is
# deliberately NOT passed to XGBoost in this experiment: the training folds are
# balanced with SMOTE in the loop below, and re-weighting the positive class on
# top of that would compensate for the imbalance twice. This matches the other
# Experiment 4 classifiers, which also dropped class_weight.
neg, pos = (y == 0).sum(), (y == 1).sum()
scale_pos_weight = neg / pos

# Maps a display name to (unfitted estimator, search space for RandomizedSearchCV).
# scipy conventions: randint(low, high) draws integers in [low, high - 1];
# uniform(loc, scale) draws floats in [loc, loc + scale].
extra_classifiers = {

    "XGBoost": (
        # `use_label_encoder` is no longer passed: it is deprecated in XGBoost
        # and only produces an "unused parameter" warning on current releases.
        XGBClassifier(
            eval_metric='logloss',
            random_state=42,
            n_jobs=-1  # Parallelization
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "reg_alpha": uniform(0.1, 1.0),    # L1 regularization
            "reg_lambda": uniform(1.0, 3.0)    # L2 regularization
        }
    ),
    "GradientBoosting": (
        GradientBoostingClassifier(
            random_state=42
        ),
        {
            "n_estimators": randint(100, 300),
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.6, 0.4),
            "min_samples_split": randint(2, 10),  # Regularization
            "min_samples_leaf": randint(1, 10),   # Regularization
            # "auto" is no longer an accepted value in current scikit-learn, so
            # every candidate that sampled it failed to fit. None (use all
            # features) keeps three options in the search space.
            "max_features": [None, "sqrt", "log2"]  # Feature usage regularization
        }
    )
}
# Begin loop
# SMOTE is imported here because this cell uses it without importing it itself.
# The imblearn Pipeline runs sampler steps during fit only, which lets SMOTE sit
# inside the hyper-parameter search without touching any validation data.
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

fold_number = 1
for train_index, test_index in skf.split(X, y):
    X_train_raw, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_raw, y_test_fold = y.iloc[train_index], y.iloc[test_index]
    # Apply SMOTE only on training data
    # This resampled copy equals the data the final model of every search is
    # refitted on (same input, same SMOTE seed); it is used for the train-side
    # reports below, never for the search itself.
    smote = SMOTE(random_state=42)
    X_train_fold, y_train_fold = smote.fit_resample(X_train_raw, y_train_raw)


    for clf_name, (clf_base, param_dist) in extra_classifiers.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")

        # Searching on already-resampled data would put synthetic samples,
        # interpolated from inner-training points, into the inner validation
        # splits and bias model selection. Wrapping SMOTE and the classifier in
        # a pipeline resamples only the fitting part of each inner split.
        search_spaces = [param_dist] if isinstance(param_dist, dict) else param_dist
        pipeline_param_dist = [
            {f"clf__{name}": dist for name, dist in space.items()}
            for space in search_spaces
        ]
        random_search = RandomizedSearchCV(
            estimator=ImbPipeline([
                ("smote", SMOTE(random_state=42)),
                ("clf", clf_base)
            ]),
            param_distributions=pipeline_param_dist,
            n_iter=10,
            scoring="f1",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_raw, y_train_raw)
        # Unwrap the fitted classifier so the rest of the cell keeps working
        # with a plain estimator, and print parameter names without the prefix.
        best_model = random_search.best_estimator_.named_steps["clf"]
        best_params = {name.split("__", 1)[1]: value for name, value in random_search.best_params_.items()}
        print("Best params:", best_params)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)

        # Positive-class probabilities, or None for estimators without predict_proba.
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1] if hasattr(best_model, "predict_proba") else None
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1] if hasattr(best_model, "predict_proba") else None

        # Evaluation (test fold keeps its original class distribution)
        test_metrics = {
            "acc": accuracy_score(y_test_fold, y_test_pred),
            "prec": precision_score(y_test_fold, y_test_pred, zero_division=0),
            "rec": recall_score(y_test_fold, y_test_pred, zero_division=0),
            "f1": f1_score(y_test_fold, y_test_pred, zero_division=0),
            "auc": roc_auc_score(y_test_fold, y_test_prob) if y_test_prob is not None else 0.0
        }

        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion Matrices
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f'{clf_name} - Fold {fold_number} - Train')

        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f'{clf_name} - Fold {fold_number} - Test')
        plt.tight_layout()
        plt.show()

        # ROC Curve
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        if y_train_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
            ax[0].set_title(f"{clf_name} - ROC (Train) - Fold {fold_number}")
        if y_test_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
            ax[1].set_title(f"{clf_name} - ROC (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()

        # Save metrics
        fold_metrics["Classifier"].append(clf_name)
        fold_metrics["Fold"].append(fold_number)
        fold_metrics["Accuracy"].append(test_metrics["acc"])
        fold_metrics["Precision"].append(test_metrics["prec"])
        fold_metrics["Recall"].append(test_metrics["rec"])
        fold_metrics["F1 Score"].append(test_metrics["f1"])
        fold_metrics["AUC ROC"].append(test_metrics["auc"])

        # Detailed logs (the "Train" row describes the SMOTE-resampled fold)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    fold_number += 1

# Summary
print("\n📊 Final Summary by Classifier (Average Across Folds):")
summary_df = pd.DataFrame(fold_metrics)
grouped = summary_df.groupby("Classifier")
print(grouped.mean(numeric_only=True).round(3))

# Save results
# NOTE(review): other experiment cells write to this same file name, so each
# run replaces the previous experiment's output.
results_df.to_csv("churn_classifiers_output.csv", index=False)
print("\n✅ All results saved to churn_classifiers_output.csv")



In [None]:
# EXP4 NN – SMOTE Oversampling for Neural Network
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# For logging results
# Local import: needed for the validation hold-out taken before SMOTE (see below).
from sklearn.model_selection import train_test_split

nn_results = []

# SMOTE setup
smote = SMOTE(random_state=42)

fold_number = 1
for train_index, test_index in skf.split(X, y):
    print(f"\n🧠 Neural Network (SMOTE) - Fold {fold_number}")
    X_train_raw, X_test_raw = X.iloc[train_index], X.iloc[test_index]
    y_train_raw, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Hold out 20% of the real training samples for early stopping BEFORE
    # oversampling. Keras' validation_split takes the *last* rows of the arrays
    # it is given, and SMOTE appends its synthetic minority samples at the end,
    # so validation_split=0.2 on resampled data validates mostly (or only) on
    # synthetic churners.
    X_fit_raw, X_val_raw, y_fit_raw, y_val_fold = train_test_split(
        X_train_raw, y_train_raw,
        test_size=0.2,
        stratify=y_train_raw,
        random_state=42
    )

    # Scale features
    # The scaler is fitted on the fitting part only; fitting it on the whole
    # dataset up front would leak test-fold statistics into training.
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_fit_raw)
    X_val_fold = scaler.transform(X_val_raw)
    X_test_fold = scaler.transform(X_test_raw)

    # Apply SMOTE to training data only
    X_train_fold, y_train_fold = smote.fit_resample(X_train_fold, y_fit_raw)

    print(f"Resampled training distribution: {np.bincount(y_train_fold)}")

    # Model definition: three hidden layers with batch-norm/dropout
    # regularisation and a sigmoid output giving the churn probability.
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train_fold.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    # Stop once validation loss has not improved for 8 epochs and roll back to
    # the best weights seen.
    early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)

    # Training
    history = model.fit(
        X_train_fold, y_train_fold,
        validation_data=(X_val_fold, y_val_fold.to_numpy()),
        epochs=50,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )

    print(f"Epochs trained: {len(history.history['loss'])}")

    # Predictions (0.5 probability threshold)
    y_test_prob = model.predict(X_test_fold).flatten()
    y_test_pred = (y_test_prob > 0.5).astype(int)

    # Evaluation
    acc = accuracy_score(y_test_fold, y_test_pred)
    prec = precision_score(y_test_fold, y_test_pred, zero_division=0)
    rec = recall_score(y_test_fold, y_test_pred, zero_division=0)
    f1 = f1_score(y_test_fold, y_test_pred, zero_division=0)
    auc = roc_auc_score(y_test_fold, y_test_prob)

    print(f"→ Accuracy: {acc:.3f} | F1: {f1:.3f} | AUC: {auc:.3f}")

    # Log results
    nn_results.append({
        "Classifier": "NeuralNetwork_SMOTE",
        "Fold": fold_number,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC ROC": auc
    })

    # Confusion Matrix
    cm = confusion_matrix(y_test_fold, y_test_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"NN (SMOTE) - Confusion Matrix - Fold {fold_number}")
    plt.show()

    # ROC Curve
    RocCurveDisplay.from_predictions(y_test_fold, y_test_prob)
    plt.title(f"NN (SMOTE) - ROC Curve - Fold {fold_number}")
    plt.show()

    fold_number += 1

# Plot Accuracy and Loss
# `history` is reassigned in every fold, so these curves show the last fold only.
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

# Convert to DataFrame if needed
nn_results_df = pd.DataFrame(nn_results)
print("\n📊 Neural Network (SMOTE) Evaluation (Average Across Folds):")
print(nn_results_df.groupby("Classifier").mean(numeric_only=True).round(3))


In [None]:
# Experiment 5- Baseline (default) training on top 20 features

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from scipy.stats import randint, uniform

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# ----------------------------
# 1️⃣ Use top 20 features
# ----------------------------
# One-hot encode X the same way it was encoded when the features were ranked.
X_encoded = pd.get_dummies(X)
# NOTE(review): `top_20_features` must already exist in the kernel. A cell that
# (re)builds it appears further down in this notebook - confirm execution order.
X_top20 = X_encoded.loc[:, top_20_features]
print(f"Shape of X with top 20 features: {X_top20.shape}")

# ----------------------------
# 2️⃣ Initialize storage
# ----------------------------
# Detailed per-fold log (train and test rows), filled by log_metrics_to_df.
results_df = pd.DataFrame(
    columns=[
        "Classifier Name",
        "Fold",
        "TrainOrTest",
        "Num Train Samples",
        "Num Churned in Train",
        "TP",
        "TN",
        "FP",
        "FN",
        "ROC-AUC",
    ]
)

# Column-oriented store of the test-set metrics, one list entry per (model, fold).
fold_metrics = {
    metric_name: []
    for metric_name in (
        "Classifier", "Fold",
        "Accuracy", "Precision", "Recall", "F1 Score", "AUC ROC",
    )
}

# ----------------------------
# 3️⃣ Logging helper
# ----------------------------
def log_metrics_to_df(df, classifier_name, fold_number, dataset_type, y_true, y_pred, X_data, y_data, y_prob):
    """Return `df` extended by one row of confusion-matrix counts and ROC-AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Results accumulated so far. It is not modified in place.
    classifier_name : str
        Stored under "Classifier Name".
    fold_number : int
        Stored under "Fold".
    dataset_type : str
        Stored under "TrainOrTest" (callers pass "Train" or "Test").
    y_true, y_pred : array-like
        True and predicted binary labels, encoded as 0 / 1.
    X_data : sized
        Only its length is used; stored under "Num Train Samples".
    y_data : array-like
        Labels whose positives (== 1) are counted under "Num Churned in Train".
    y_prob : array-like or None
        Positive-class scores used for ROC-AUC.

    Returns
    -------
    pandas.DataFrame
        A new frame made of `df` plus the new row. "ROC-AUC" is None when
        `y_prob` is None or when `y_true` contains a single class (the score
        is undefined in that case).
    """
    # labels=[0, 1] forces a 2x2 matrix even if only one class shows up in this
    # split; without it ravel() yields a single value and the unpack fails.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # ROC-AUC needs scores and both classes present in y_true.
    can_score = y_prob is not None and len(np.unique(np.asarray(y_true))) > 1
    roc_auc = roc_auc_score(y_true, y_prob) if can_score else None

    row = {
        "Classifier Name": classifier_name,
        "Fold": fold_number,
        "TrainOrTest": dataset_type,
        "Num Train Samples": len(X_data),
        # Vectorised count instead of the builtin sum() iterating element-wise.
        "Num Churned in Train": int((np.asarray(y_data) == 1).sum()),
        "TP": TP, "TN": TN, "FP": FP, "FN": FN,
        "ROC-AUC": roc_auc
    }
    new_row = pd.DataFrame([row])

    if len(df) == 0:
        # Concatenating onto an empty frame is deprecated in recent pandas and
        # forces object dtypes; start from the new row, keeping df's column order.
        ordered_columns = list(df.columns) + [c for c in new_row.columns if c not in df.columns]
        return new_row.reindex(columns=ordered_columns)

    return pd.concat([df, new_row], ignore_index=True)

# ----------------------------
# 4️⃣ ML classifiers + XGBoost + GBM
# ----------------------------
# Maps a display name to (base estimator, RandomizedSearchCV search space).
# Reminder on the scipy distributions used below:
#   randint(low, high)  -> integers in [low, high)
#   uniform(loc, scale) -> floats in [loc, loc + scale]  (NOT [loc, scale])
classifiers_and_params = {
    "kNN": (
        KNeighborsClassifier(),
        {
            "n_neighbors": randint(1, 50),
            "weights": ["uniform", "distance"],
            "p": [1, 2]
        }
    ),
    "LDA": (
        LinearDiscriminantAnalysis(),
        # Two sub-spaces because shrinkage is not supported by the 'svd' solver.
        [
            {"solver": ["svd"], "shrinkage": [None], "tol": uniform(1e-5, 1e-2)},
            {"solver": ["lsqr", "eigen"], "shrinkage": [None, "auto"], "tol": uniform(1e-5, 1e-2)}
        ]
    ),
    "LogReg": (
        LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
        {
            "C": uniform(0.01, 10),  # i.e. C in [0.01, 10.01]
            "penalty": ["l1", "l2"],
            "solver": ["liblinear", "saga"]  # both solvers accept l1 and l2
        }
    ),
    "DecisionTree": (
        DecisionTreeClassifier(random_state=42, class_weight='balanced'),
        {
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    "RandomForest": (
        RandomForestClassifier(random_state=42, class_weight='balanced'),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(1, 20),
            "min_samples_split": randint(2, 10),
            "min_samples_leaf": randint(1, 10),
            "criterion": ["gini", "entropy"]
        }
    ),
    "NaiveBayes": (
        GaussianNB(),
        {
            "var_smoothing": uniform(1e-9, 1e-5)
        }
    ),
    "AdaBoost": (
        AdaBoostClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.5)
        }
    ),
    "XGBoost": (
        # `use_label_encoder` was deprecated and later removed from XGBoost;
        # passing it only triggers "parameter not used" warnings on every fit,
        # so it is no longer set here.
        XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss'),
        {
            "n_estimators": randint(50, 300),
            "max_depth": randint(2, 10),
            "learning_rate": uniform(0.01, 0.3),
            "subsample": uniform(0.7, 0.3),         # [0.7, 1.0]
            "colsample_bytree": uniform(0.7, 0.3),  # [0.7, 1.0]
            "reg_alpha": uniform(0, 1),
            "reg_lambda": uniform(0, 1)
        }
    ),
    "GBM": (
        GradientBoostingClassifier(random_state=42),
        {
            "n_estimators": randint(50, 300),
            "learning_rate": uniform(0.01, 0.3),
            "max_depth": randint(2, 10),
            "subsample": uniform(0.7, 0.3)  # [0.7, 1.0]
        }
    )
}

# ----------------------------
# 5️⃣ CV loop for ML models
# ----------------------------
def _test_fold_scores(y_true, y_pred, y_prob):
    """Compute the held-out metrics of one model on one fold.

    AUC is NaN (not 0.0) when no probability scores are available, so that a
    missing value cannot drag down the per-classifier mean.
    """
    return {
        "acc": accuracy_score(y_true, y_pred),
        "prec": precision_score(y_true, y_pred, zero_division=0),
        "rec": recall_score(y_true, y_pred, zero_division=0),
        "f1": f1_score(y_true, y_pred, zero_division=0),
        "auc": roc_auc_score(y_true, y_prob) if y_prob is not None else float("nan"),
    }


def _record_fold_scores(name, fold, scores):
    """Append one model's test metrics for one fold to the shared `fold_metrics`."""
    fold_metrics["Classifier"].append(name)
    fold_metrics["Fold"].append(fold)
    fold_metrics["Accuracy"].append(scores["acc"])
    fold_metrics["Precision"].append(scores["prec"])
    fold_metrics["Recall"].append(scores["rec"])
    fold_metrics["F1 Score"].append(scores["f1"])
    fold_metrics["AUC ROC"].append(scores["auc"])


fold_number = 1
for train_index, test_index in skf.split(X_top20, y):
    X_train_fold = X_top20.iloc[train_index]
    X_test_fold = X_top20.iloc[test_index]
    y_train_fold = y.iloc[train_index]
    y_test_fold = y.iloc[test_index]

    for clf_name, (clf_base, param_dist) in classifiers_and_params.items():
        print(f"\n🔍 {clf_name} on Fold {fold_number}")
        # Inner 3-fold search runs on the training part of the outer fold only.
        random_search = RandomizedSearchCV(
            estimator=clf_base,
            param_distributions=param_dist,
            n_iter=10,
            scoring="f1",
            cv=3,
            random_state=42,
            n_jobs=-1
        )
        random_search.fit(X_train_fold, y_train_fold)
        best_model = random_search.best_estimator_
        print("Best params:", random_search.best_params_)

        y_train_pred = best_model.predict(X_train_fold)
        y_test_pred = best_model.predict(X_test_fold)
        has_proba = hasattr(best_model, "predict_proba")
        y_train_prob = best_model.predict_proba(X_train_fold)[:, 1] if has_proba else None
        y_test_prob = best_model.predict_proba(X_test_fold)[:, 1] if has_proba else None

        test_metrics = _test_fold_scores(y_test_fold, y_test_pred, y_test_prob)
        print(f"→ Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

        # Confusion matrices: train (left) vs test (right).
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.heatmap(confusion_matrix(y_train_fold, y_train_pred), annot=True, fmt='d', cmap='Blues', ax=ax[0])
        ax[0].set_title(f"{clf_name} - Confusion Matrix (Train) - Fold {fold_number}")
        sns.heatmap(confusion_matrix(y_test_fold, y_test_pred), annot=True, fmt='d', cmap='Blues', ax=ax[1])
        ax[1].set_title(f"{clf_name} - Confusion Matrix (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()
        plt.close(fig)  # many figures are created in this loop; release them

        # ROC curves: train (left) vs test (right).
        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        if y_train_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_train_fold, y_train_fold, ax=ax[0])
        ax[0].set_title(f"{clf_name} - ROC Curve (Train) - Fold {fold_number}")
        if y_test_prob is not None:
            RocCurveDisplay.from_estimator(best_model, X_test_fold, y_test_fold, ax=ax[1])
        ax[1].set_title(f"{clf_name} - ROC Curve (Test) - Fold {fold_number}")
        plt.tight_layout()
        plt.show()
        plt.close(fig)

        _record_fold_scores(clf_name, fold_number, test_metrics)

        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Train", y_train_fold, y_train_pred, X_train_fold, y_train_fold, y_train_prob)
        results_df = log_metrics_to_df(results_df, clf_name, fold_number, "Test", y_test_fold, y_test_pred, X_test_fold, y_test_fold, y_test_prob)

    # ✅ Add NN exactly like your original
    print(f"\n🧠 Neural Network on Fold {fold_number}")

    # Drop the previous fold's Keras graph/state so memory does not grow per fold.
    tf.keras.backend.clear_session()

    # The scaler is fitted on the training part only and reused on the test part.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_fold)
    X_test_scaled = scaler.transform(X_test_fold)

    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    early_stop = EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True)
    history = model.fit(
        X_train_scaled, y_train_fold,
        validation_split=0.2,
        epochs=50,
        batch_size=64,
        callbacks=[early_stop],
        verbose=0
    )
    print(f"Epochs trained: {len(history.history['loss'])}")

    y_train_prob = model.predict(X_train_scaled).flatten()
    y_test_prob = model.predict(X_test_scaled).flatten()
    y_train_pred = (y_train_prob > 0.5).astype(int)
    y_test_pred = (y_test_prob > 0.5).astype(int)

    test_metrics = _test_fold_scores(y_test_fold, y_test_pred, y_test_prob)
    print(f"→ NN Accuracy: {test_metrics['acc']:.3f} | F1: {test_metrics['f1']:.3f} | AUC: {test_metrics['auc']:.3f}")

    # Previously the network was logged to results_df only, so it was missing
    # from the final per-classifier summary; record it like every other model.
    _record_fold_scores("DNN", fold_number, test_metrics)

    results_df = log_metrics_to_df(results_df, "DNN", fold_number, "Train", y_train_fold, y_train_pred, X_train_scaled, y_train_fold, y_train_prob)
    results_df = log_metrics_to_df(results_df, "DNN", fold_number, "Test", y_test_fold, y_test_pred, X_test_scaled, y_test_fold, y_test_prob)

    fold_number += 1

print("\n📊 Final Summary:")
summary_df = pd.DataFrame(fold_metrics)
# "Fold" is an identifier, not a metric, so it is excluded from the averages.
print(summary_df.drop(columns="Fold").groupby("Classifier").mean(numeric_only=True).round(3))

results_df.to_csv("churn_exp5_top20.csv", index=False)
print("✅ All results saved to churn_exp5_top20.csv")


In [None]:
# One-hot encode X in exactly the form the experiments consume it.
X_encoded = pd.get_dummies(X)

# Rank the encoded columns by RandomForest impurity importance and keep the 20 best.
# NOTE(review): this cell defines `top_20_features`, which the Experiment 5 cell
# above reads - on a fresh kernel it presumably has to run first; confirm the
# intended cell order.
# NOTE(review): the forest is fitted on all of X / y, i.e. including rows that
# later serve as CV test folds - confirm this selection leakage is acceptable.
rf_for_selection = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_encoded, y)
importances = pd.Series(data=rf_for_selection.feature_importances_, index=X_encoded.columns)
important_features = importances.sort_values(ascending=False)
top_20_features = list(important_features.index[:20])

# Keep only the selected columns.
X_top20 = X_encoded.loc[:, top_20_features]
print("Top 20 features:", top_20_features)
