In [1]:
# =============================================================================
# FINAL, CORRECTED PIPELINE: From Raw Data to Final Model Evaluation
# =============================================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score
import os
from datetime import datetime
os.environ["OMP_NUM_THREADS"] = "6"  # or "12"

print("--- Starting Final Pipeline: Reloading and Reprocessing All Data ---")

# --- Part 1: Load Data ---
try:
    training_data = pd.read_csv('ml_case_training_data.csv')
    output = pd.read_csv('ml_case_training_output.csv')
    hist_data = pd.read_csv('ml_case_training_hist_data.csv')
    print("✅ 1. Data loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: Could not find data files. {e}")
    exit()

# --- Part 2: Merging, Aggregating, and Feature Engineering ---
# Create a complete raw dataframe
df_raw = pd.merge(pd.merge(training_data, output, on='id'), hist_data, on='id', how='left')

# CORRECTED: Aggregate historical price data by customer ID
print("Aggregating historical price data...")
df_raw['price_date'] = pd.to_datetime(df_raw['price_date'], errors='coerce')
agg_price_data = df_raw.groupby('id').agg(
    mean_price_p1_var=('price_p1_var', 'mean'),
    std_price_p1_var=('price_p1_var', 'std'),
    mean_price_p2_var=('price_p2_var', 'mean'),
    std_price_p2_var=('price_p2_var', 'std'),
).reset_index()

# Create the main, unique customer dataframe `df`
df = df_raw.drop(columns=hist_data.columns.drop('id')).drop_duplicates(subset='id')

# Merge the aggregated price data back
df = pd.merge(df, agg_price_data, on='id', how='left')

# Feature Engineering
df['date_activ'] = pd.to_datetime(df['date_activ'], errors='coerce')
df['tenure'] = (pd.to_datetime('2016-01-01') - df['date_activ']).dt.days / 365.25
df['margin_x_tenure'] = df['net_margin'] * df['tenure']
print("✅ 2. Feature engineering and aggregation complete.")


# --- Part 3: Data Cleaning and Final Prep ---
# Drop unneeded columns
cols_to_drop = [col for col in df.columns if 'date' in col or 'campaign_disc_ele' in col]
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Fix negative consumption errors
for col in ['cons_12m', 'cons_gas_12m', 'cons_last_month', 'imp_cons']:
    if col in df.columns:
        df.loc[df[col] < 0, col] = np.nan

# Convert binary 'has_gas'
if 'has_gas' in df.columns and df['has_gas'].dtype == 'object':
    df['has_gas'] = df['has_gas'].map({'t': 1, 'f': 0})

# ————————————————
# **Drop the ID column** so it never gets one‑hot‑encoded
if 'id' in df.columns:
    df.drop(columns=['id'], inplace=True)
# ————————————————

# One-hot encode categoricals

categorical_features = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)
print("✅ 3. Data cleaning and encoding complete.")

# --- Part 3.1: Quick Data‑Size Diagnostics ---
print("\n=== Data‑Size Diagnostics ===")
print("Full dataframe shape:", df.shape)                            # rows × columns
print("Memory footprint (MB):", df.memory_usage(deep=True).sum()/1024**2)
print("Number of features:", df.shape[1] - 1, "(minus target)")
print("Number of samples:", df.shape[0])
# Optional: list zero‑variance features
from sklearn.feature_selection import VarianceThreshold
vt = VarianceThreshold(threshold=0.0).fit(df.drop(columns='churn', errors='ignore'))
zero_var = [c for c,v in zip(df.columns, vt.variances_) if v == 0]
print("Zero‑variance features:", zero_var)

# --- Part 4: Final Split, Impute, and Balance ---
X = df.drop(columns='churn', errors='ignore')
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

# Impute missing/infinite values
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)
median_values = X_train.median()
X_train = X_train.fillna(median_values)
X_test = X_test.fillna(median_values)

# Balance with SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("✅ 4. Data prep for modeling complete.")


# --- Part 5: Model Training & Evaluation ---
print("\n--- Running Final Model Bake-off (excluding SVC) ---")
print(f"[{datetime.now()}] 🚀 Starting Final Model Bake‑off (excluding SVC)…")

# scale once
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled  = scaler.transform(X_test)

# define SVC separately
svc_model = SVC(random_state=42, probability=True)

# all other models (no SVC in here)
models = {
    'Dummy Classifier': DummyClassifier(strategy='stratified', random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42, algorithm='SAMME'),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42)
}

# these models expect scaled data
scaled_models = ['Logistic Regression', 'Naive Bayes', 'KNN']

results = []

print(f"[{datetime.now()}] 🕒 Beginning training of batch models…")

# 1) train & evaluate everyone except SVC
for name, model in models.items():
    print(f"Training {name}...")
    if name in scaled_models:
        model.fit(X_train_scaled, y_train_resampled)
        preds_proba = model.predict_proba(X_test_scaled)[:, 1]
        preds      = model.predict(X_test_scaled)
    else:
        model.fit(X_train_resampled, y_train_resampled)
        preds_proba = model.predict_proba(X_test)[:, 1]
        preds      = model.predict(X_test)

    results.append({
        'Model':     name,
        'AUC':       roc_auc_score(y_test, preds_proba),
        'Accuracy':  accuracy_score(y_test, preds),
        'F1-Score':  f1_score(y_test, preds),
        'Precision': precision_score(y_test, preds),
        'Recall':    recall_score(y_test, preds)
    })


--- Starting Final Pipeline: Reloading and Reprocessing All Data ---
✅ 1. Data loaded successfully.
Aggregating historical price data...
✅ 2. Feature engineering and aggregation complete.
✅ 3. Data cleaning and encoding complete.

=== Data‑Size Diagnostics ===
Full dataframe shape: (16096, 457)
Memory footprint (MB): 10.131351470947266
Number of features: 456 (minus target)
Number of samples: 16096
Zero‑variance features: []
✅ 4. Data prep for modeling complete.

--- Running Final Model Bake-off (excluding SVC) ---
[2025-07-31 20:37:01.024300] 🚀 Starting Final Model Bake‑off (excluding SVC)…
[2025-07-31 20:37:01.548023] 🕒 Beginning training of batch models…
Training Dummy Classifier...
Training Logistic Regression...
Training Naive Bayes...
Training KNN...
Training Decision Tree...
Training Random Forest...
Training AdaBoost...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 10876, number of negative: 10876
[LightGBM] [Info] Auto-choosing row-wise multi-t

In [3]:
# --- Part 5 (modified SVC) ---
# Run SVC separately due to time requirements
print(f"[{datetime.now()}] 🕒 Finished batch models. Now training SVM…")

from sklearn.svm import SVC

# pass these params into SVC to speed up training
svc = SVC(
    kernel='rbf',
    cache_size=2048,
    probability=True,
    tol=1e-3,
    random_state=42
)
print("Training sklearn SVC on 497 features…")
svc.fit(X_train_scaled, y_train_resampled)
y_pred = svc.predict(X_test_scaled)
y_proba = svc.predict_proba(X_test_scaled)[:, 1]

results.append({
    'Model':    'SVC',
    'AUC':      roc_auc_score(y_test, y_proba) if 'y_proba' in locals() else None,
    'F1-Score': f1_score(y_test, y_pred),
    'Recall':   recall_score(y_test, y_pred),
    'Accuracy': accuracy_score(y_test, y_pred),
    'Precision':precision_score(y_test, y_pred)
})


# 3) compile and display
results_df = (
    pd.DataFrame(results)
      .sort_values(by='AUC', ascending=False)
      .set_index('Model')
)

print("\n--- Final Model Performance Comparison ---")
print(f"[{datetime.now()}] 🎉 All models complete. Here are the results:")
print(results_df)
print("\n--- SCRIPT COMPLETE ---")


[2025-07-31 20:37:21.959404] 🕒 Finished batch models. Now training SVM…
Training sklearn SVC on 497 features…

--- Final Model Performance Comparison ---
[2025-07-31 20:47:36.736636] 🎉 All models complete. Here are the results:
                          AUC  Accuracy  F1-Score  Precision    Recall
Model                                                                 
Random Forest        0.679437  0.891650  0.180451   0.360902  0.120301
XGBoost              0.673428  0.891402  0.215440   0.379747  0.150376
LightGBM             0.664376  0.890656  0.179104   0.350365  0.120301
AdaBoost             0.628777  0.822813  0.215622   0.192157  0.245614
SVC                  0.601714  0.837227  0.146023   0.152174  0.140351
KNN                  0.589034  0.791252  0.200000   0.161290  0.263158
Logistic Regression  0.575898  0.823062  0.183486   0.169133  0.200501
Decision Tree        0.534871  0.802932  0.167891   0.144404  0.200501
Dummy Classifier     0.524482  0.508946  0.180083   0.107907  