In [10]:
!pip install faker


Collecting faker
  Downloading faker-37.3.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.3.0-py3-none-any.whl (1.9 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m1.1/1.9 MB[0m [31m33.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.3.0


In [11]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

fake = Faker()

num_users = 5000

# User Profiles
user_ids = [f"user_{i+1:04d}" for i in range(num_users)]

device_types = ['Smart TV', 'Mobile', 'Tablet', 'Laptop', 'Desktop']
languages = ['English', 'Hindi', 'Spanish', 'Tamil', 'Telugu', 'Bengali', 'Marathi']

user_profiles = pd.DataFrame({
    'user_id': user_ids,
    'age': np.random.randint(18, 70, size=num_users),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=num_users, p=[0.48, 0.48, 0.04]),
    'location': [fake.city() for _ in range(num_users)],
    'preferred_language': np.random.choice(languages, size=num_users),
    'device_type': np.random.choice(device_types, size=num_users)
})

# Subscriptions
payment_methods = ['Credit Card', 'Debit Card', 'UPI', 'Net Banking', 'Wallet']
auto_renew_choices = [True, False]

start_dates = [fake.date_between(start_date='-3y', end_date='-1y') for _ in range(num_users)]
months_using = np.random.randint(1, 36, size=num_users)
end_dates = [start + timedelta(days=30*int(months)) for start, months in zip(start_dates, months_using)]

plan_changes = np.random.poisson(0.2, size=num_users)

base_monthly_fee = 12.99 * 82  # INR approx

total_revenue = [
    months * base_monthly_fee * (0.9 if changes > 0 else 1.0)
    for months, changes in zip(months_using, plan_changes)
]

subscriptions = pd.DataFrame({
    'user_id': user_ids,
    'payment_method': np.random.choice(payment_methods, size=num_users),
    'auto_renew': np.random.choice(auto_renew_choices, size=num_users, p=[0.85, 0.15]),
    'plan_changes': plan_changes,
    'subscription_start_date': start_dates,
    'subscription_end_date': end_dates,
    'months_using': months_using,
    'total_revenue_inr': np.round(total_revenue, 2)
})

# User Activity
genres = ['Drama', 'Comedy', 'Action', 'Thriller', 'Romance', 'Documentary', 'Horror', 'Sci-Fi']

most_watched_genres = np.random.choice(genres, size=num_users, p=[0.2,0.15,0.15,0.15,0.15,0.05,0.1,0.05])
completion_rate = np.random.uniform(0.4, 1.0, size=num_users)
peak_hours = ['Evening', 'Night', 'Afternoon', 'Morning']

user_activity = pd.DataFrame({
    'user_id': user_ids,
    'completion_rate': np.round(completion_rate, 2),
    'peak_hour_streaming': np.random.choice(peak_hours, size=num_users, p=[0.5, 0.3, 0.15, 0.05]),
    'avg_genre_diversity': np.random.randint(1, 5, size=num_users),
    'most_watched_genre': most_watched_genres
})

# Save CSV files
user_profiles.to_csv('user_profiles.csv', index=False)
subscriptions.to_csv('subscriptions.csv', index=False)
user_activity.to_csv('user_activity.csv', index=False)

print("CSV files generated")


CSV files generated


In [12]:
import pandas as pd

user_profiles = pd.read_csv("user_profiles.csv")
subscriptions = pd.read_csv("subscriptions.csv")
user_activity = pd.read_csv("user_activity.csv")

# Merge based on user_id
df = user_profiles.merge(subscriptions, on="user_id").merge(user_activity, on="user_id")

# Save for further use
df.to_csv("merged_saas_data.csv", index=False)
df.head()


Unnamed: 0,user_id,age,gender,location,preferred_language,device_type,payment_method,auto_renew,plan_changes,subscription_start_date,subscription_end_date,months_using,total_revenue_inr,completion_rate,peak_hour_streaming,avg_genre_diversity,most_watched_genre
0,user_0001,38,Male,West Jennifershire,Spanish,Desktop,Wallet,True,0,2023-12-31,2024-08-27,8,8521.44,0.72,Afternoon,2,Thriller
1,user_0002,41,Male,East Natalie,Spanish,Laptop,Wallet,True,0,2022-06-25,2024-06-14,24,25564.32,0.55,Evening,4,Thriller
2,user_0003,56,Male,Griffinborough,Tamil,Desktop,Wallet,True,0,2023-10-20,2026-04-07,30,31955.4,0.46,Evening,3,Thriller
3,user_0004,66,Female,Derrickmouth,Hindi,Desktop,Credit Card,True,0,2023-09-13,2025-06-04,21,22368.78,0.54,Night,3,Romance
4,user_0005,48,Male,South Alexis,Spanish,Desktop,Credit Card,True,1,2022-08-17,2023-11-10,15,14379.93,0.64,Night,2,Sci-Fi


Schema Unification

In [13]:
import pandas as pd
import difflib

# Your internal schema
STANDARD_COLUMNS = {
    # User Profiles
    "user_id": ["user_id", "userid", "id", "userID", "user Id", "user-id", "user id"],
    "age": ["age", "user_age", "age_years"],
    "gender": ["gender", "sex"],
    "preferred_language": ["preferred_language", "language", "lang", "preferred_lang"],
    "device_type": ["device_type", "device", "platform", "device_platform"],

    # Subscriptions
    "subscription_id": ["subscription_id", "sub_id", "subscriptionId"],
    "start_date": ["start_date", "subscription_start", "sub_start", "startDate", "subscriptionStartDate"],
    "end_date": ["end_date", "subscription_end", "sub_end", "endDate", "subscriptionEndDate"],
    "payment_method": ["payment_method", "payment", "pay_method", "paymentType"],
    "auto_renew": ["auto_renew", "autoRenew", "renewal_status", "is_auto_renew"],
    "plan_changes": ["plan_changes", "planChangeCount", "number_of_plan_changes"],
    "total_revenue": ["revenue", "total_revenue", "amount_paid", "revenue_inr", "payment_amount"],

    # User Activity
    "user_activity_id": ["user_activity_id", "activity_id", "activityId"],
    "completion_rate": ["completion_rate", "watch_completion", "content_completion_rate"],
    "peak_hour_streaming": ["peak_hour_streaming", "peak_stream_hour", "peakHourStream"],
    "avg_genre_diversity": ["avg_genre_diversity", "genre_diversity", "average_genre_diversity"],
    "max_genre": ["max_genre", "favorite_genre", "most_watched_genre", "top_genre"],
    "months_using": ["months_using", "subscription_months", "tenure_months", "months_subscribed"],

    # Engagement and other metrics
    "num_logins": ["num_logins", "login_count", "number_of_logins"],
    "total_watch_time": ["total_watch_time", "watch_time", "total_streaming_minutes"],
    "avg_watch_session": ["avg_watch_session", "average_session_duration"],

    # Flags / Booleans
    "is_active": ["is_active", "active_status", "currently_active"],
}


def fuzzy_map_columns(df, standard_cols):
    mapped_cols = {}
    user_cols = df.columns.tolist()

    for std_col, variants in standard_cols.items():
        best_match = None
        highest_ratio = 0
        for col in user_cols:
            for variant in variants:
                ratio = difflib.SequenceMatcher(None, col.lower(), variant.lower()).ratio()
                if ratio > highest_ratio and ratio > 0.6:  # threshold for match
                    best_match = col
                    highest_ratio = ratio
        if best_match:
            mapped_cols[best_match] = std_col
    df = df.rename(columns=mapped_cols)
    return df


Rigorous Data Cleaning

In [18]:
import numpy as np
import pandas as pd

def clean_and_validate(df):
  numeric_cols = [ 'tenure_months', 'plan_changes', 'completion_rate', 'avg_genre_diversity']

  for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # convert invalid to NaN

    # 1. Parse dates safely without deprecated arg
    for col in ["start_date", "end_date"]:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    # 2. Remove rows with invalid start_date (critical)
    df = df[df["start_date"].notna()].copy()

    # 3. Fill missing end_date with today's date
    if "end_date" in df.columns:
        df["end_date"] = df["end_date"].fillna(pd.Timestamp.today())

    # 4. Handle revenue: fill missing or negative values with 0
    if "revenue" in df.columns:
        df.loc[df["revenue"].isna() | (df["revenue"] < 0), "revenue"] = 0

    # 5. Fill missing categorical values with 'Unknown'
    categorical_cols = [
        "preferred_language", "payment_method", "device_type","peak_hour_streaming"
        "max_genre", "gender"
    ]
    for col in categorical_cols:
        if col in df.columns:
            # If categorical dtype, add 'Unknown' category first
            if isinstance(df[col].dtype, pd.CategoricalDtype):
                if "Unknown" not in df[col].cat.categories:
                    df[col] = df[col].cat.add_categories(["Unknown"])
            df[col] = df[col].fillna("Unknown")

    # 6. Fill missing numeric columns with median or 0 fallback
    numeric_cols = [
        "plan_changes", "completion_rate",
        "avg_genre_diversity", "months_using", "num_logins",
        "total_watch_time", "avg_watch_session"
    ]
    for col in numeric_cols:
        if col in df.columns:
            median_val = df[col].median()
            df[col] = df[col].fillna(median_val if not np.isnan(median_val) else 0)

    # 7. Remove duplicate records based on user_id and start_date (subscription uniqueness)
    if "user_id" in df.columns and "start_date" in df.columns:
        df = df.drop_duplicates(subset=["user_id", "start_date"])

    # 8. Compute capped tenure_months (clip at 36 months) with safer calculation
    if "start_date" in df.columns and "end_date" in df.columns:
        tenure = (df["end_date"] - df["start_date"]).dt.days / 30.44  # Approx days per month
        df["tenure_months"] = tenure.clip(upper=36).round()

    # 9. Ensure auto_renew is boolean, fill missing with False
    if "auto_renew" in df.columns:
        # Handle mixed types robustly
        df["auto_renew"] = df["auto_renew"].fillna(False)
        # Convert common string representations to bool
        df["auto_renew"] = df["auto_renew"].apply(
            lambda x: True if str(x).strip().lower() in ["true", "1", "yes"] else False
        )

    # 10. Final reset index
    df.reset_index(drop=True, inplace=True)

    return df


Feature_engineering

In [44]:
def feature_engineer(df):
    import pandas as pd
    import numpy as np

    # 1. Convert Dates
    df['start_date'] = pd.to_datetime(df.get('start_date'), errors='coerce')
    df['end_date'] = pd.to_datetime(df.get('end_date'), errors='coerce')

    # 2. Convert Numeric Columns Safely
    numeric_cols = ['total_revenue', 'tenure_months', 'plan_changes', 'completion_rate', 'avg_genre_diversity']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)

    # 3. Churn Label (if not already present)
    today = pd.Timestamp.today()
    if 'total_revenue' in df.columns:
        df['churned'] = ((df['end_date'] < today) & df['end_date'].notna()).astype(int)
        df['is_active'] = (df['end_date'] >= today).astype(int)

    # 4. Monthly Average Revenue
    if 'total_revenue' in df.columns and 'tenure_months' in df.columns:
        df['monthly_avg_revenue'] = df['total_revenue'] / df['tenure_months'].replace(0, 1)

    # 5. Peak Hour Mapping
    if 'peak_hour_streaming' in df.columns:
        peak_hour_map = {'Morning': 1, 'Afternoon': 2, 'Evening': 3, 'Night': 4}
        df['peak_hour_streaming_num'] = df['peak_hour_streaming'].map(peak_hour_map).fillna(0).astype(int)

    # 6. Engagement Score
    if all(col in df.columns for col in ['completion_rate', 'peak_hour_streaming_num', 'avg_genre_diversity']):
        df['engagement_score'] = (
            0.4 * df['completion_rate'] +
            0.3 * df['peak_hour_streaming_num'] +
            0.3 * df['avg_genre_diversity']
        ).round(2)

    # 7. Revenue Segments (based on monthly revenue)
    if 'monthly_avg_revenue' in df.columns:
        df['revenue_segment'] = pd.cut(
            df['monthly_avg_revenue'],
            bins=[-np.inf, 100, 300, 700, 1500, np.inf],
            labels=['Low', 'Below Avg', 'Average', 'High', 'Premium']
        )

    # 8. Plan Stability
    if 'plan_changes' in df.columns:
        df['is_stable_plan'] = (df['plan_changes'] <= 1).astype(int)

    # 9. Max Genre to Category
    if 'max_genre' in df.columns:
        df['max_genre'] = df['max_genre'].astype('category')

    # 10. Clean Boolean Flags
    for col in ['auto_renew', 'is_active']:
        if col in df.columns:
            df[col] = df[col].astype(int)

    # 11. Extract Year & Month from Start Date
    if 'start_date' in df.columns:
        df['subscription_year'] = df['start_date'].dt.year
        df['subscription_month'] = df['start_date'].dt.month

    return df


Robust & Modular Categorical Encoding Module (for General Datasets)

In [14]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [42]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import category_encoders as ce

def detect_categorical_columns(df, exclude=[]):
    """Automatically detect categorical columns based on dtype and cardinality."""
    return [
        col for col in df.select_dtypes(include=["object", "category"]).columns
        if col not in exclude and df[col].nunique() < 100
    ]

def encode_categorical_train(df, target_col=None, method='target', custom_cat_cols=None):
    """
    Encodes categorical columns using the specified method.

    method: 'target' | 'onehot' | 'ordinal'
    """

    cat_cols = custom_cat_cols if custom_cat_cols else detect_categorical_columns(df)

    if not cat_cols:
        return df, None  # No encoding needed

    if method == 'target':
        if target_col is None:
            raise ValueError("Target column required for target encoding.")
        encoder = ce.TargetEncoder(cols=cat_cols)
        df[cat_cols] = encoder.fit_transform(df[cat_cols], df[target_col])

    elif method == 'onehot':
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(df[cat_cols])
        df = df.drop(columns=cat_cols)
        df = df.join(pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_cols)))

    elif method == 'ordinal':
        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        df[cat_cols] = encoder.fit_transform(df[cat_cols])

    else:
        raise ValueError(f"Unknown encoding method: {method}")

    return df, (encoder, cat_cols, method)

def encode_categorical_infer(df, encoder_obj):
    """Applies fitted encoder on new/inference data."""
    if encoder_obj is None:
        return df

    encoder, cat_cols, method = encoder_obj

    if method == 'target':
        df[cat_cols] = encoder.transform(df[cat_cols])

    elif method == 'onehot':
        encoded = encoder.transform(df[cat_cols])
        df = df.drop(columns=cat_cols)
        df = df.join(pd.DataFrame(encoded, columns=encoder.get_feature_names_out(cat_cols)))

    elif method == 'ordinal':
        df[cat_cols] = encoder.transform(df[cat_cols])

    return df


Logging & Versioning Code

In [41]:
import os
import json
import pandas as pd
from datetime import datetime

def log_and_version(df, process_name="preprocessing", output_dir="processed_data"):
    os.makedirs(output_dir, exist_ok=True)

    # Create versioned filename
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    version_filename = f"{process_name}_v_{timestamp}.csv"
    version_path = os.path.join(output_dir, version_filename)

    # Save cleaned/processed CSV
    df.to_csv(version_path, index=False)

    # Collect metadata
    metadata = {
        "process": process_name,
        "timestamp": timestamp,
        "shape": df.shape,
        "columns": list(df.columns),
        "null_counts": df.isnull().sum().to_dict(),
        "column_types": df.dtypes.astype(str).to_dict()
    }

    # Save metadata as JSON
    metadata_path = os.path.join(output_dir, f"{process_name}_meta_{timestamp}.json")
    with open(metadata_path, "w") as f:
        json.dump(metadata, f, indent=4)

    print(f"✅ Data saved to: {version_path}")
    print(f"📝 Metadata saved to: {metadata_path}")
    return version_path, metadata_path


End of your preprocessing pipeline

In [45]:
df = fuzzy_map_columns(df, STANDARD_COLUMNS)
df = clean_and_validate(df)
df = feature_engineer(df)

cat_cols = ["preferred_language","total_revenue","gender", "location", "payment_method", "peak_hour_streaming","device_type", "max_genre"]
cat_cols = [c for c in cat_cols if c in df.columns]

df, encoder = encode_categorical_train(
    df,
    target_col='churned',   # ✅ Specify your target column here
    method='target',
    custom_cat_cols=cat_cols
)

log_and_version(df, process_name="preprocessed_data")



✅ Data saved to: processed_data/preprocessed_data_v_20250516_170745.csv
📝 Metadata saved to: processed_data/preprocessed_data_meta_20250516_170745.json


('processed_data/preprocessed_data_v_20250516_170745.csv',
 'processed_data/preprocessed_data_meta_20250516_170745.json')

In [46]:
print(df.columns)


Index(['user_activity_id', 'age', 'gender', 'location', 'preferred_language',
       'device_type', 'total_revenue', 'auto_renew', 'plan_changes',
       'start_date', 'end_date', 'months_using', 'total_watch_time',
       'completion_rate', 'peak_hour_streaming', 'avg_genre_diversity',
       'max_genre', 'tenure_months', 'churned', 'is_active',
       'peak_hour_streaming_num', 'engagement_score', 'is_stable_plan',
       'subscription_id', 'subscription_month', 'subscription_year',
       'monthly_avg_revenue', 'revenue_segment'],
      dtype='object')


EDA

In [194]:
from google.colab import files
df.to_csv("final_dataset.csv", index=False)
files.download("final_dataset.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [55]:
df

Unnamed: 0,user_activity_id,age,gender,location,preferred_language,device_type,total_revenue,auto_renew,plan_changes,start_date,...,churned,is_active,peak_hour_streaming_num,engagement_score,is_stable_plan,subscription_id,subscription_month,subscription_year,monthly_avg_revenue,revenue_segment
0,user_0001,38,0.663880,1.0,0.663957,0.672396,0.6708,1,0,2023-12-31,...,1,0,0,0.89,1,2023,12,2023,0.0,Low
1,user_0002,41,0.663880,1.0,0.663957,0.653140,0.6708,1,0,2022-06-25,...,1,0,0,1.42,1,2022,6,2022,0.0,Low
2,user_0003,56,0.663880,0.0,0.675712,0.672396,0.6708,1,0,2023-10-20,...,0,1,0,1.08,1,2023,10,2023,0.0,Low
3,user_0004,66,0.680851,0.0,0.656891,0.672396,0.6708,1,0,2023-09-13,...,0,1,0,1.12,1,2023,9,2023,0.0,Low
4,user_0005,48,0.663880,1.0,0.663957,0.672396,0.6708,1,1,2022-08-17,...,1,0,0,0.86,1,2022,8,2022,0.0,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,user_4996,24,0.680851,0.0,0.675712,0.678125,0.6708,1,0,2023-05-22,...,0,1,0,1.57,1,2023,5,2023,0.0,Low
4996,user_4997,27,0.680851,1.0,0.656891,0.653140,0.6708,1,0,2023-01-23,...,1,0,0,0.86,1,2023,1,2023,0.0,Low
4997,user_4998,28,0.680851,1.0,0.687933,0.686235,0.6708,1,0,2022-10-30,...,1,0,0,0.85,1,2022,10,2022,0.0,Low
4998,user_4999,38,0.663880,1.0,0.675712,0.678125,0.6708,1,0,2022-06-04,...,1,0,0,0.93,1,2022,6,2022,0.0,Low


In [69]:
import pandas as pd

# Your full DataFrame
# df = ... (already loaded and feature-engineered)

# Define target variable
y = df['churned']

# Drop columns that shouldn't be features:
# For example: IDs, dates, target, subscription_id, etc.
# Drop columns that shouldn't be used as predictors
drop_cols = ['is_active','user_activity_id', 'start_date', 'end_date',
    'churn_month', 'start_month','tenure_bucket','revenue_segment','churn','subscription_id'
]

X = df.drop(columns=drop_cols + ['churned'], errors='ignore')



# If you have categorical features left, encode them (example: one-hot or label encoding)
# For simplicity, let's assume all are numeric after your feature engineering.

# Check the shape
print("Features shape:", X.shape)
print("Target shape:", y.shape)


Features shape: (5000, 21)
Target shape: (5000,)


In [72]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve,
    classification_report, f1_score, confusion_matrix
)
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# ====== 1. Data Preparation ======
# Let's analyze your target distribution
print("Target distribution:")
print(df['churned'].value_counts())
print(f"Churn rate: {df['churned'].mean():.2%}")

# Use all available features from your dataset
features = [
    'age', 'gender', 'location', 'preferred_language', 'device_type',
    'total_revenue', 'auto_renew', 'plan_changes', 'months_using',
    'total_watch_time', 'completion_rate', 'peak_hour_streaming',
    'avg_genre_diversity', 'max_genre', 'tenure_months', 'engagement_score',
    'is_stable_plan', 'subscription_month', 'subscription_year',
    'monthly_avg_revenue'
]

# Check if features exist in dataframe
features = [f for f in features if f in df.columns]
print(f"\nUsing {len(features)} features")

X = df[features].copy()
y = df['churned'].copy()

# ====== 2. Feature Engineering ======
# Handle categorical features
cat_cols = X.select_dtypes(include=['object']).columns
for col in cat_cols:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Standardize numerical features
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns
scaler = StandardScaler()
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

# ====== 3. Split Data ======
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Churn rate in train: {y_train.mean():.2%}")
print(f"Churn rate in test: {y_test.mean():.2%}")

# ====== 4. Balance Classes with SMOTE ======
print("\nApplying SMOTE to balance classes...")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print(f"Training data after SMOTE: {X_train_res.shape}")
print(f"Class distribution after SMOTE: {pd.Series(y_train_res).value_counts()}")

# ====== 5. Train Model ======
print("\nTraining XGBoost classifier...")

# Basic XGBoost model with simple parameters
model = xgb.XGBClassifier(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    random_state=42
)

# Simple fitting without early stopping or eval sets
model.fit(X_train_res, y_train_res)

# ====== 6. Evaluate Model ======
# Get predictions
y_prob = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_prob)
print(f"\nROC AUC: {roc_auc:.4f}")

# Find optimal threshold
precision, recall, thresholds = precision_recall_curve(y_test, y_prob)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = np.argmax(f1_scores)

if len(thresholds) > best_idx:  # Ensure we have valid index
    best_thresh = thresholds[best_idx]
    print(f"Best threshold by F1: {best_thresh:.4f}")

    # Apply best threshold
    y_pred_best = (y_prob >= best_thresh).astype(int)

    # Show confusion matrix
    cm = confusion_matrix(y_test, y_pred_best)
    tn, fp, fn, tp = cm.ravel()

    print("\nConfusion Matrix:")
    print(f"True Negative: {tn}, False Positive: {fp}")
    print(f"False Negative: {fn}, True Positive: {tp}")

    # Calculate and display metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision_score = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall_score = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * precision_score * recall_score / (precision_score + recall_score) if (precision_score + recall_score) > 0 else 0

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision_score:.4f}")
    print(f"Recall: {recall_score:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"F1 Score: {f1:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred_best))
else:
    print("Could not calculate optimal threshold")
    print("\nClassification Report with default threshold:")
    print(classification_report(y_test, y_pred))

# ====== 7. Feature Importance ======
print("\nFeature Importance:")
importance = model.feature_importances_
indices = np.argsort(importance)[::-1]

feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance.head(10))

# ====== 8. Threshold Analysis ======
print("\nThreshold Analysis:")
thresholds_to_try = [0.3, 0.4, 0.5, 0.6, 0.7]
results = []

for t in thresholds_to_try:
    y_pred = (y_prob >= t).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    results.append({
        'Threshold': t,
        'Precision': precision,
        'Recall': recall,
        'Specificity': specificity,
        'F1': f1,
        'TP': tp,
        'FP': fp,
        'TN': tn,
        'FN': fn
    })

threshold_df = pd.DataFrame(results)
print(threshold_df)

# ====== 9. Business Recommendations ======
print("\nBusiness Recommendations:")
print("1. Model Performance Summary:")
print(f"   - ROC AUC: {roc_auc:.4f}")
print(f"   - Best F1 Score: {threshold_df['F1'].max():.4f} at threshold {threshold_df.loc[threshold_df['F1'].idxmax(), 'Threshold']}")

print("\n2. Recommended Thresholds:")
print(f"   - For balanced performance (F1): {threshold_df.loc[threshold_df['F1'].idxmax(), 'Threshold']}")
print(f"   - For high precision (minimize false positives): {threshold_df.loc[threshold_df['Precision'].idxmax(), 'Threshold']}")
print(f"   - For high recall (catch most churners): {threshold_df.loc[threshold_df['Recall'].idxmax(), 'Threshold']}")

print("\n3. Top Churn Indicators:")
for i, (feat, imp) in enumerate(zip(feature_importance['Feature'].head(5), feature_importance['Importance'].head(5))):
    print(f"   {i+1}. {feat}: {imp:.4f}")

print("\n4. Actionable Insights:")
print("   - Focus retention efforts on customers with high churn probability")
print("   - Address issues related to the top churn indicators")
print("   - Create tiered intervention strategies based on churn probability")

Target distribution:
churned
1    3354
0    1646
Name: count, dtype: int64
Churn rate: 67.08%

Using 20 features
Train set: (4000, 20)
Test set: (1000, 20)
Churn rate in train: 67.07%
Churn rate in test: 67.10%

Applying SMOTE to balance classes...
Training data after SMOTE: (5366, 20)
Class distribution after SMOTE: churned
1    2683
0    2683
Name: count, dtype: int64

Training XGBoost classifier...

ROC AUC: 0.9999
Best threshold by F1: 0.4285

Confusion Matrix:
True Negative: 329, False Positive: 0
False Negative: 2, True Positive: 669

Accuracy: 0.9980
Precision: 1.0000
Recall: 0.9970
Specificity: 1.0000
F1 Score: 0.9985

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00       329
           1       1.00      1.00      1.00       671

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Feature Import