In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline




df = pd.read_csv("datos.csv")
print("Tamaño del dataset:", df.shape)
df.head()


Tamaño del dataset: (440833, 12)


Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,2.0,30.0,Female,39.0,14.0,5.0,18.0,Standard,Annual,932.0,17.0,1.0
1,3.0,65.0,Female,49.0,1.0,10.0,8.0,Basic,Monthly,557.0,6.0,1.0
2,4.0,55.0,Female,14.0,4.0,6.0,18.0,Basic,Quarterly,185.0,3.0,1.0
3,5.0,58.0,Male,38.0,21.0,7.0,7.0,Standard,Monthly,396.0,29.0,1.0
4,6.0,23.0,Male,32.0,20.0,5.0,8.0,Basic,Monthly,617.0,20.0,1.0


In [6]:
# 1. Eliminar columnas tipo ID (no aportan al modelo)
id_cols = [c for c in df.columns if "id" in c.lower()]
df = df.drop(columns=id_cols, errors="ignore")

# 2. Eliminar duplicados
df = df.drop_duplicates()

# 3. Revisar valores nulos
print(df.isnull().sum())

# Opciones: imputar nulos con valores estadísticos
df = df.fillna(df.median(numeric_only=True))   # numéricos → mediana
df = df.fillna(df.mode().iloc[0])              # categóricos → moda

# 4. Confirmar tipos de datos
print(df.dtypes)


Age                  1
Gender               1
Tenure               1
Usage Frequency      1
Support Calls        1
Payment Delay        1
Subscription Type    1
Contract Length      1
Total Spend          1
Last Interaction     1
Churn                1
dtype: int64
Age                  float64
Gender                object
Tenure               float64
Usage Frequency      float64
Support Calls        float64
Payment Delay        float64
Subscription Type     object
Contract Length       object
Total Spend          float64
Last Interaction     float64
Churn                float64
dtype: object


In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Identificar columnas
num_cols = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 
            'Payment Delay', 'Total Spend', 'Last Interaction']
cat_cols = ['Gender', 'Subscription Type', 'Contract Length']

# Preprocesamiento:
# - numéricas → escalado
# - categóricas → OneHot
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
    ]
)

# Aplicar transformación a X (todas las variables excepto Churn)
X = df.drop(columns=['Churn'])
y = df['Churn']

X_processed = preprocessor.fit_transform(X)

print("Shape después de codificación + escalado:", X_processed.shape)



Shape después de codificación + escalado: (440833, 12)
