In [1]:
import kagglehub
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Descargar el dataset
path = kagglehub.dataset_download("radheshyamkollipara/bank-customer-churn")

# Cargar datos
data = pd.read_csv(path + "/Customer-Churn-Records.csv")

Downloading from https://www.kaggle.com/api/v1/datasets/download/radheshyamkollipara/bank-customer-churn?dataset_version_number=1...


100%|██████████| 307k/307k [00:00<00:00, 41.7MB/s]

Extracting files...





In [2]:
# 1. Inspeccionar datos
print("Primeras filas del dataset:\n", data.head())
print("\nInformación del dataset:\n", data.info())

Primeras filas del dataset:
    RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  Complain  Satisfaction Score Card Type  \
0        101348.88       1         1                   2   

In [3]:
# 2. Eliminar columnas irrelevantes o no numéricas
columns_to_drop = ["RowNumber", "CustomerId", "Surname"]
data.drop(columns=columns_to_drop, axis=1, inplace=True, errors="ignore")

In [4]:
# 3. Convertir variables categóricas en numéricas con One-Hot Encoding
categorical_features = ["Geography", "Gender", "Card Type"]
data = pd.get_dummies(data, columns=categorical_features, drop_first=True)

In [5]:
# 4. Escalar variables numéricas
scaler = StandardScaler()
numerical_features = ["CreditScore", "Age", "Tenure", "Balance", "NumOfProducts", "EstimatedSalary", "Point Earned"]
data[numerical_features] = scaler.fit_transform(data[numerical_features])


In [6]:
# 5. Verificar datos preprocesados
print("\nDatos preprocesados:\n", data.head())


Datos preprocesados:
    CreditScore       Age    Tenure   Balance  NumOfProducts  HasCrCard  \
0    -0.326221  0.293517 -1.041760 -1.225848      -0.911583          1   
1    -0.440036  0.198164 -1.387538  0.117350      -0.911583          0   
2    -1.536794  0.293517  1.032908  1.333053       2.527057          1   
3     0.501521  0.007457 -1.387538 -1.225848       0.807737          0   
4     2.063884  0.388871 -1.041760  0.785728      -0.911583          1   

   IsActiveMember  EstimatedSalary  Exited  Complain  Satisfaction Score  \
0               1         0.021886       1         1                   2   
1               1         0.216534       0         1                   3   
2               0         0.240687       1         1                   3   
3               0        -0.108918       0         0                   5   
4               1        -0.365276       0         0                   5   

   Point Earned  Geography_Germany  Geography_Spain  Gender_Male  \
0     -

In [7]:
# 6. Guardar datos preprocesados
data.to_csv("processed_data.csv", index=False)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import pickle

# 1. Cargar datos preprocesados
data = pd.read_csv("processed_data.csv")

# 2. Separar variables independientes (X) y la variable objetivo (y)
X = data.drop("Exited", axis=1)
y = data["Exited"]

# 3. Dividir en conjunto de entrenamiento (80%) y prueba (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)