<a href="https://colab.research.google.com/github/delfibarros-lab/churn-prediction/blob/main/Churn_Prediction_Portfolio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-Learn para Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Global plotting style for every seaborn/matplotlib figure in the notebook.
sns.set_theme(style="whitegrid")
import warnings
# Silence only deprecation-style noise from the libraries. A blanket
# filterwarnings('ignore') would also hide actionable warnings (for example
# scikit-learn's ConvergenceWarning when the logistic regression fails to
# converge), so every other category is left visible.
for noisy_category in (FutureWarning, DeprecationWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

In [None]:
# Location of the Telco customer-churn CSV (raw file served from GitHub).
url = 'https://raw.githubusercontent.com/blastchar/telco-customer-churn/master/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Download the file and parse it into a DataFrame.
df = pd.read_csv(filepath_or_buffer=url)

# Show the first five rows as a quick sanity check of the load.
display(df.head(n=5))

In [None]:
# Clean the loaded frame and encode it for modelling.
# Each step is written so that re-running this cell on an already-processed
# `df` is a no-op, instead of raising KeyError on the dropped column or
# silently turning the whole target into NaN.

# 1. Drop the customer identifier, which is not useful for prediction.
#    errors='ignore' keeps the step safe when the column is already gone.
df = df.drop(columns=['customerID'], errors='ignore')

# 2. 'TotalCharges' contains blank strings (new customers, per the original
#    note), so coerce it to numeric and replace the resulting NaNs with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# 3. Encode the target 'Churn' as 1 (Yes) / 0 (No). The mapping is applied
#    only while the column is still textual: mapping an already-encoded
#    column would produce NaN for every row.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
if df['Churn'].isna().any():
    # Fail here with a clear message rather than later inside model fitting.
    raise ValueError("'Churn' contains values other than 'Yes'/'No'")

# 4. One-hot encode the remaining categorical columns; drop_first removes one
#    level per feature to avoid perfectly collinear dummy columns.
df = pd.get_dummies(df, drop_first=True)

print(f"Tamaño del dataset listo para el modelo: {df.shape}")

In [None]:
# Separate the predictors (X) from the target to predict (y).
X = df.drop('Churn', axis=1)
y = df['Churn']

# Hold out 20% of the rows for testing. The split is stratified on the target
# so that the churn / no-churn ratio is the same in the train and test sets;
# a purely random split can skew that ratio and distort the per-class metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to the test split, so no test-set statistics leak into
# training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Logistic regression: fit on the standardized training features, then
# predict labels for the held-out test features.
log_model = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_log = log_model.predict(X_test_scaled)

# Random forest: 100 trees with a fixed seed, trained and evaluated on the
# same standardized features as the logistic regression.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

print("✅ Modelos entrenados con éxito.")

In [None]:
# Evaluate both classifiers on the held-out test set with the same metrics.
# Originally the logistic regression got a report and ROC-AUC while the random
# forest only got a confusion matrix, so the two could not be compared.
evaluated_models = [
    ("LOGISTIC REGRESSION", "Logistic Regression", log_model, y_pred_log),
    ("RANDOM FOREST", "Random Forest", rf_model, y_pred_rf),
]

for header, _, model, y_pred in evaluated_models:
    # Column 1 of predict_proba is the probability of the positive class (1).
    churn_proba = model.predict_proba(X_test_scaled)[:, 1]
    print(f"--- {header} PERFORMANCE ---")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, churn_proba):.4f}\n")

# Confusion matrices side by side, one panel per model.
class_names = ['No Churn', 'Churn']
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (_, title, _, y_pred) in zip(axes, evaluated_models):
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f'Confusion Matrix - {title}')
    ax.set_ylabel('Valor Real')
    ax.set_xlabel('Predicción del Modelo')
fig.tight_layout()
plt.show()