# 📊 Análisis de Churn en Clientes de Telecomunicaciones
Este notebook analiza y predice la pérdida de clientes utilizando aprendizaje automático.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the final dataset from the project's GitHub repository (requires network access).
DATA_URL = 'https://raw.githubusercontent.com/christanov/telco-churn-project/main/data/telco_dataset_final.csv'
df = pd.read_csv(DATA_URL)
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

In [None]:
# Prepare data: pick the predictor columns and the target.
features = [
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
    'Contract',
    'InternetService',
    'OnlineSecurity',
    'TechSupport',
    'OnlineBackup',
    'DeviceProtection',
]
# Work on a copy so later in-place encoding does not modify df.
X = df.loc[:, features].copy()
y = df['Churn']

In [None]:
# Encode categorical variables as integer codes, one LabelEncoder per column.
# The fitted encoders are kept in `encoders` (keyed by column name) so the
# integer codes can later be mapped back to the original labels with
# inverse_transform, or the same mapping reapplied to new data.
categorical_cols = ['Contract', 'InternetService', 'OnlineSecurity', 'TechSupport', 
                    'OnlineBackup', 'DeviceProtection']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    encoders[col] = le

In [None]:
# Split into training (80%) and test (20%) sets with a fixed seed for
# reproducibility. Stratifying on the target keeps the churn / non-churn
# proportion the same in both subsets, so the test metrics are not skewed by
# an unlucky draw on an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Standardize the numeric features. The scaler is fitted on the training set
# only and then applied to the test set, so no test-set statistics leak into
# training.
scaler = StandardScaler()
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# The frames returned by train_test_split are slices of X; take explicit copies
# before assigning columns so pandas does not raise SettingWithCopyWarning and
# the writes unambiguously land on independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# Train a 100-tree random forest (fixed seed) and predict on the test set.
# fit() returns the fitted estimator, so construction and training are chained.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Classification report: per-class precision, recall and F1 on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de Confusión')
plt.xlabel('Predicho')
plt.ylabel('Real')
plt.show()

In [None]:
# Feature importance: pair each feature name with the importance reported by
# the fitted forest and draw them as a horizontal bar chart.
importancia = pd.DataFrame({
    'feature': features,
    'importance': model.feature_importances_,
})
ranked = importancia.sort_values('importance', ascending=True)
sns.barplot(data=ranked, x='importance', y='feature')
plt.title('Importancia de Características')
plt.show()

In [None]:
# Visualization: churn distribution per customer category.
# normalize='index' makes each category's row sum to 1; scaling by 100 turns
# the shares into percentages for the stacked bars.
row_shares = pd.crosstab(df['CustomerCategory'], df['Churn'], normalize='index')
churn_dist = row_shares * 100
churn_dist.plot.bar(stacked=True)
plt.title('Distribución de Churn por Categoría')
plt.ylabel('% de Clientes')
plt.show()

In [None]:
# Visualization: mean ARPU (bars, left axis) and mean number of services
# (line, right axis) per customer category.
fig, ax1 = plt.subplots(figsize=(10,5))
ax2 = ax1.twinx()
# Compute both per-category means in a single groupby so they share one index.
category_means = df.groupby('CustomerCategory')[['ARPU', 'TotalServices']].mean()
avg_arpu = category_means['ARPU']
avg_services = category_means['TotalServices']
ax1.bar(avg_arpu.index, avg_arpu.values, color='blue', alpha=0.6, label='ARPU')
ax2.plot(avg_services.index, avg_services.values, color='red', marker='o', label='Servicios')
ax1.set_ylabel('ARPU', color='blue')
ax2.set_ylabel('Servicios', color='red')
# The artists live on two different axes, so gather their handles into one
# legend; it is attached to ax2 (the twin, drawn on top) so the line cannot
# cover it. Without this the label= arguments above are never displayed.
bar_handles, bar_labels = ax1.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax2.legend(bar_handles + line_handles, bar_labels + line_labels, loc='upper left')
plt.title('ARPU vs Servicios por Categoría')
plt.tight_layout()
plt.show()