In [1]:
# Importación de librerías
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

In [5]:
# Carga el conjunto de datos
try:
    bb_churn_df = pd.read_csv('files/datasets/input/Churn.csv')
    print("Carga exitosa del conjunto de datos.")
except FileNotFoundError:
    print("El archivo no se encuentra en la ruta especificada.")

Carga exitosa del conjunto de datos.


In [6]:
# Revisa que los tipos de datos sean correctos
print(bb_churn_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           9091 non-null   float64
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB
None


In [7]:
# Explora los primeros registros
display(bb_churn_df.head())

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [8]:
# Verifica datos faltantes
print(bb_churn_df.isnull().sum())

RowNumber            0
CustomerId           0
Surname              0
CreditScore          0
Geography            0
Gender               0
Age                  0
Tenure             909
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
dtype: int64


In [9]:
# Reemplaza los valores faltantes en la columna Tenure con ceros
bb_churn_df['Tenure'].fillna(0, inplace=True)

In [10]:
# Verifica que ya no hay valores faltantes
print(bb_churn_df.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


In [11]:
# Convertir variables categóricas en variables dummy
bb_churn_df = pd.get_dummies(bb_churn_df, columns=['Geography', 'Gender'], drop_first=True)

In [12]:
# Elimina la columna 'Surname'
bb_churn_df = bb_churn_df.drop('Surname', axis=1)

In [13]:
# Separa las características y la variable objetivo
features = bb_churn_df.drop('Exited', axis=1)
target = bb_churn_df['Exited']

In [14]:
# Divide los datos en conjuntos de entrenamiento y prueba
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.2, random_state=12345)

In [15]:
# Examina el equilibrio de clases
print("Clase 0 (No Churn):", target_train.value_counts()[0])
print("Clase 1 (Churn):", target_train.value_counts()[1])

Clase 0 (No Churn): 6390
Clase 1 (Churn): 1610


In [16]:
# Sobremuestreo de la clase minoritaria (Churn = 1)
features_resampled, target_resampled = resample(features_train[target_train == 1],
                                               target_train[target_train == 1],
                                               replace=True,
                                               n_samples=target_train.value_counts()[0],
                                               random_state=12345)

In [17]:
# Normalización de variables numéricas
scaler = StandardScaler()
features_resampled_scaled = scaler.fit_transform(features_resampled)

In [18]:
# Concatena los datos de entrenamiento balanceados
features_train_balanced = pd.concat([features_train, features_resampled])
target_train_balanced = pd.concat([target_train, target_resampled])

In [19]:
# Entrena el modelo utilizando RandomForest con parámetros optimizados
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=12345)
rf_model.fit(features_train_balanced, target_train_balanced)

In [20]:
# Realiza predicciones en el conjunto de prueba
target_pred = rf_model.predict(features_test)

In [21]:
# Calcula métricas de evaluación
f1 = f1_score(target_test, target_pred)
roc_auc = roc_auc_score(target_test, rf_model.predict_proba(features_test)[:, 1])

In [22]:
# Imprime resultados
print("F1 Score:", f1)
print("AUC-ROC Score:", roc_auc)

F1 Score: 0.6352015732546706
AUC-ROC Score: 0.8640108029079713


In [24]:
# Utiliza GridSearchCV para optimizar parámetros
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=12345)
grid_search = GridSearchCV(rf_model, param_grid, scoring='f1', cv=5)
grid_search.fit(features_train_balanced, target_train_balanced)

In [25]:
# Mejores parámetros
best_params = grid_search.best_params_

In [26]:
# Modelo con parámetros optimizados
rf_model_optimized = grid_search.best_estimator_
rf_model_optimized.fit(features_train_balanced, target_train_balanced)

In [27]:
# Predicciones en el conjunto de prueba con el modelo optimizado
target_pred_optimized = rf_model_optimized.predict(features_test)

In [28]:
# Métricas de evaluación con el modelo optimizado
f1_optimized = f1_score(target_test, target_pred_optimized)
roc_auc_optimized = roc_auc_score(target_test, rf_model_optimized.predict_proba(features_test)[:, 1])

In [29]:
print("\nResultados con parámetros optimizados:")
print("F1 Score (optimizado):", f1_optimized)
print("AUC-ROC Score (optimizado):", roc_auc_optimized)
print("\nClassification Report (optimizado):")
print(classification_report(target_test, target_pred_optimized))
print("\nConfusion Matrix (optimizado):")
print(confusion_matrix(target_test, target_pred_optimized))


Resultados con parámetros optimizados:
F1 Score (optimizado): 0.62453531598513
AUC-ROC Score (optimizado): 0.8611343946664364

Classification Report (optimizado):
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      1573
           1       0.66      0.59      0.62       427

    accuracy                           0.85      2000
   macro avg       0.78      0.75      0.76      2000
weighted avg       0.84      0.85      0.85      2000


Confusion Matrix (optimizado):
[[1445  128]
 [ 175  252]]
