In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, precision_recall_curve
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

# -----------------------------
# 1. LOAD & CLEAN DATA
# -----------------------------
df = pd.read_csv('datset.csv')


df.drop('customerID', axis=1, inplace=True)


df.replace(' ', np.nan, inplace=True)

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

df.fillna(df.mean(numeric_only=True), inplace=True)

print("Initial Data Shape:", df.shape)

# -----------------------------
# 2. TARGET & FEATURES
# -----------------------------
y = df['Churn'].map({'Yes': 1, 'No': 0})
X = df.drop('Churn', axis=1)

# -----------------------------
# 3. FEATURE ANALYSIS & SELECTION
# -----------------------------

irrelevant_cols = ['customerID']  # already dropped
X = X.drop(columns=irrelevant_cols, errors='ignore')

# Separate numeric and categorical columns
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# Correlation with target (only numeric features)
corr_with_target = X[numeric_cols].copy()
corr_with_target['Churn'] = y
corr_matrix = corr_with_target.corr()
target_corr = corr_matrix['Churn'].drop('Churn')

# Keep numeric features that have abs(corr) > threshold
corr_threshold = 0.05  # small threshold to remove almost useless features
good_numeric_features = target_corr[abs(target_corr) > corr_threshold].index.tolist()

print("Selected numeric features based on target correlation:", good_numeric_features)

# Keep only selected numeric features
X_numeric = X[good_numeric_features]

# -----------------------------
# 4. ONE-HOT ENCODE CATEGORICAL FEATURES
# -----------------------------
X_categorical = pd.get_dummies(X[categorical_cols], drop_first=True)

# Combine numeric + categorical
X_final = pd.concat([X_numeric, X_categorical], axis=1)

print("Final Feature Shape:", X_final.shape)

# -----------------------------
# 5. TRAIN-TEST SPLIT
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# 6. SMOTE OVERSAMPLING
# -----------------------------
print("Before SMOTE:", np.bincount(y_train))

sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

print("After SMOTE:", np.bincount(y_train_res))

# -----------------------------
# 7. SCALING
# -----------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# -----------------------------
# 8. XGBOOST MODEL
# -----------------------------
model = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.85,
    colsample_bytree=0.85,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train_scaled, y_train_res)

# -----------------------------
# 9. THRESHOLD OPTIMIZATION
# -----------------------------
y_proba = model.predict_proba(X_test_scaled)[:, 1]

best_f1 = 0
best_threshold = 0.5

prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

for t in thresholds:
    y_pred_t = (y_proba >= t).astype(int)
    f1 = f1_score(y_test, y_pred_t)
    if f1 > best_f1:
        best_f1 = f1
        best_threshold = t

print("\nBest Threshold:", best_threshold)
print("Best F1 Score:", best_f1)

# -----------------------------
# 10. FINAL EVALUATION
# -----------------------------
y_pred_final = (y_proba >= best_threshold).astype(int)

print("\n--- XGBoost Performance (Threshold Optimized) ---")
print("F1 Score:", f1_score(y_test, y_pred_final))
print("ROC AUC:", roc_auc_score(y_test, y_proba))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_final))


Initial Data Shape: (7043, 20)
Selected numeric features based on target correlation: ['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']
Final Feature Shape: (7043, 30)
Before SMOTE: [4139 1495]
After SMOTE: [4139 4139]

Best Threshold: 0.33631936
Best F1 Score: 0.6236786469344608

--- XGBoost Performance (Threshold Optimized) ---
F1 Score: 0.6236786469344608
ROC AUC: 0.8272985610581518
Confusion Matrix:
[[758 277]
 [ 79 295]]
