In [158]:
# 1. Import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, precision_recall_curve
from xgboost import XGBClassifier

In [None]:
# 2. Download and Load dataset

!wget -O RetailX_CustomerData.csv https://raw.githubusercontent.com/contrecesar/Automatizacion-2025-1/main/JesusChacon-MachineLearningModelsScikit-Learn/RetailX_CustomerData.csv

df = pd.read_csv("RetailX_CustomerData.csv")

In [None]:
# 3. Data overview

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Absolute and relative class counts of the target column.
print(df['Churn'].value_counts())
print(df['Churn'].value_counts(normalize=True))

In [None]:
# 4. Count missing values per column

# isna() flags missing cells; summing the boolean frame column-wise gives the
# number of missing entries in each column.
print(df.isna().sum())

In [None]:
# 5. Visualization: how TotalSpent differs between churn groups

# Box plot: one box of TotalSpent per Churn value.
sns.boxplot(data=df, y="TotalSpent", x="Churn").set_title(
    "Customer Spending by Churn Status"
)
plt.show()

# Histogram (30 bins) with a KDE overlay, colored by Churn value.
sns.histplot(x='TotalSpent', hue='Churn', bins=30, kde=True, data=df).set_title(
    "Distribution of TotalSpent by Churn"
)
plt.show()

# Summary statistics of TotalSpent within each Churn group.
print(
    df.groupby('Churn')['TotalSpent'].describe()
)

In [None]:
# 6. Correlation heatmap over all features

plt.figure(figsize=(10, 6))

# Drop the identifier column, then one-hot encode whatever get_dummies treats
# as categorical (first level dropped) so that corr() can be computed.
df_corr = pd.get_dummies(df.drop("CustomerID", axis=1), drop_first=True)

# Annotated pairwise-correlation matrix on the figure created above.
sns.heatmap(
    df_corr.corr(),
    cmap="coolwarm",
    annot=True,
).set_title("Feature Correlation")
plt.show()

In [164]:
# 7. Preprocessing: encode categoricals and separate features from target

# One-hot encode the two named categorical columns, dropping the first level
# of each.
df_encoded = pd.get_dummies(
    data=df,
    columns=["Region", "ProductCategory"],
    drop_first=True,
)

# Target is the Churn column; features are everything except it and the
# CustomerID identifier.
y = df_encoded["Churn"]
X = df_encoded.drop(["CustomerID", "Churn"], axis=1)

In [165]:
# 8. Stratified 70/30 train/test split, then standardize the features

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,        # keep the Churn class proportions in both splits
    test_size=0.3,
    random_state=42,
)

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)


In [166]:
# 9. Oversample the training split with SMOTE

# Only the training data is resampled; the test split is left untouched.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# 10. Fit an XGBoost classifier with hand-picked hyperparameters

# Class counts of the resampled training labels, used for scale_pos_weight.
# NOTE(review): SMOTE is constructed with its default sampling strategy, so
# these counts are expected to be equal and the ratio to be 1.0 -- confirm
# whether the extra weighting is intended.
neg_count = (y_train_res == 0).sum()
pos_count = (y_train_res == 1).sum()

xgb = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=neg_count / pos_count,
    learning_rate=0.05,
    max_depth=6,
    n_estimators=500,
)
xgb.fit(X_train_res, y_train_res)

print("Modelo Entrenado")

In [169]:
# 11. Predicted probabilities on the scaled test split

# predict_proba returns one column per class; column index 1 is kept as the
# score used for thresholding and ROC AUC in the following cells.
class_probabilities = xgb.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

In [None]:
# 12. Find best threshold from precision-recall curve

# NOTE(review): the threshold is tuned on the same test split that is scored
# in the evaluation cells, so those metrics are optimistic; tuning on a
# separate validation split would avoid this.
precision, recall, thresholds = precision_recall_curve(y_test, y_proba)

# F1 at every point of the curve. Where precision + recall == 0 the plain
# formula is 0/0 = NaN, and np.argmax would then return the NaN's index;
# define F1 as 0 at those points instead.
pr_sum = precision + recall
f1_scores = np.divide(
    2 * precision * recall,
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# precision/recall have one more element than thresholds (the final point has
# no associated threshold), so search only the entries that map to a threshold.
best_idx = np.argmax(f1_scores[:len(thresholds)])
best_threshold = thresholds[best_idx]
print(f"Best threshold based on F1 score: {best_threshold:.3f}")

# Hard labels: positive when the predicted probability reaches the threshold.
y_pred_adj = (y_proba >= best_threshold).astype(int)

In [None]:
# 13. Evaluation on the test split

# Report and accuracy use the thresholded labels; ROC AUC uses the raw
# probabilities.
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_adj),
)
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_adj),
)
print(
    "ROC AUC:",
    roc_auc_score(y_true=y_test, y_score=y_proba),
)

In [None]:
# 14. Confusion matrix heatmap

# confusion_matrix(y_true, y_pred): rows are actual classes, columns are
# predicted classes; fmt="d" renders the cell counts as integers.
sns.heatmap(
    confusion_matrix(y_test, y_pred_adj),
    cmap="Blues",
    fmt="d",
    annot=True,
).set(
    xlabel="Predicted",
    ylabel="Actual",
    title="Confusion Matrix",
)
plt.show()

In [None]:
# 15. Feature importance plot (ten largest)

# Pair each importance value with the corresponding column name of X.
feature_names = X.columns
importances = xgb.feature_importances_

# Table of features ordered from most to least important.
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values('Importance', ascending=False)

# Horizontal bars for the first ten rows of the sorted table.
sns.barplot(
    data=feature_importance_df.iloc[:10],
    y='Feature',
    x='Importance',
).set_title("Top 10 Feature Importances")
plt.tight_layout()
plt.show()