In [1]:
import pandas as pd

# Load the full client table from the project's DATA folder (path is relative
# to the notebook's working directory).
client_csv = "DATA/client_full.csv"
df_ml = pd.read_csv(client_csv)

# Preview the first rows to sanity-check the columns that were read.
df_ml.head()

Unnamed: 0,client_id,first_name,last_name,gender,age,email,city,address,postal_code,country,citizenship,document_type,document_number,registration_date,last_seen,last_purchase_date,days_since_last_purchase,churn,frequency,monetary
0,21e5c13d-1c9a-4d00-9164-b72302d5edef,Tristán,Rojas,M,42,tristán.rojas13@example.com,Rivera,Bulevar Artigas 3757,28289,Uruguay,Uruguay,CI,2719583-8,2025-05-02,2025-06-19,2024-12-16 21:01:32,15.0,0,44,2306.18
1,36e48bdd-db11-4abe-9526-cfc90e68924d,Óscar,Barranco,M,58,óscar.barranco13@example.com,Salto,Bulevar Artigas 3911,76237,Uruguay,Uruguay,CI,1445199-8,2023-02-24,2023-12-09,2024-12-26 10:58:03,5.0,0,122,7236.93
2,145c22df-3579-412e-bc12-b4fce70abaf3,Macario,Querol,M,28,macario.querol67@example.com,Rivera,Av. Rivera 206,30926,Uruguay,Uruguay,CI,8090293-5,2024-10-03,2025-05-03,2024-12-29 22:46:06,2.0,0,37,2076.58
3,90c4a925-e51f-4dac-9193-2d9aec97a472,Ramiro,Sanchez,M,19,ramiro.sanchez53@example.com,Salto,Av. 18 de Julio 6324,22676,Uruguay,Uruguay,CI,7022674-5,2025-01-13,2025-08-11,2024-11-19 02:28:56,42.0,0,13,614.39
4,853f711a-4c36-40b4-b5d0-6207152cd793,Juan Francisco,Flor,M,36,juan francisco.flor68@example.com,Mercedes,Av. 18 de Julio 6301,20328,Uruguay,Uruguay,CI,5918715-9,2025-03-09,2025-03-21,,999.0,1,0,0.0


In [2]:
# Select the columns that feed the model. The final column order is:
# client attributes, purchase-activity columns, then the `churn` label.
attribute_cols = ["age", "gender", "city", "country", "citizenship", "document_type"]
activity_cols = ["days_since_last_purchase", "frequency", "monetary"]
model_cols = [*attribute_cols, *activity_cols, "churn"]

# Work on an independent copy so later steps never touch df_ml.
df = df_ml.loc[:, model_cols].copy()
df.head()


Unnamed: 0,age,gender,city,country,citizenship,document_type,days_since_last_purchase,frequency,monetary,churn
0,42,M,Rivera,Uruguay,Uruguay,CI,15.0,44,2306.18,0
1,58,M,Salto,Uruguay,Uruguay,CI,5.0,122,7236.93,0
2,28,M,Rivera,Uruguay,Uruguay,CI,2.0,37,2076.58,0
3,19,M,Salto,Uruguay,Uruguay,CI,42.0,13,614.39,0
4,36,M,Mercedes,Uruguay,Uruguay,CI,999.0,0,0.0,1


In [3]:
# One-hot encode the categorical variables. drop_first=True omits the first
# level of each variable, so every variable contributes (levels - 1) columns.
categorical_cols = ["gender", "city", "country", "citizenship", "document_type"]
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

df_encoded.head()


Unnamed: 0,age,days_since_last_purchase,frequency,monetary,churn,gender_M,city_Maldonado,city_Melo,city_Mercedes,city_Montevideo,...,country_España,country_Francia,country_USA,country_Uruguay,citizenship_Brasil,citizenship_España,citizenship_Francia,citizenship_USA,citizenship_Uruguay,document_type_Doc. Extranjero
0,42,15.0,44,2306.18,0,True,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
1,58,5.0,122,7236.93,0,True,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
2,28,2.0,37,2076.58,0,True,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
3,19,42.0,13,614.39,0,True,False,False,False,False,...,False,False,False,True,False,False,False,False,True,False
4,36,999.0,0,0.0,1,True,False,False,True,False,...,False,False,False,True,False,False,False,False,True,False


In [4]:
# Split the encoded frame into the target vector (y) and the feature matrix (X).
target_col = "churn"
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

(X.shape, y.shape)


((2357, 24), (2357,))

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the churn
# proportion the same in both splits; the fixed seed makes the split repeatable.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

(X_train.shape, X_test.shape)


((1885, 24), (472, 24))

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Build and train the logistic regression in one step (fit returns the estimator).
logit = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Test-split predictions: hard labels, plus column 1 of predict_proba as the score.
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)[:, 1]

# Evaluation.
# NOTE(review): this feature set still includes days_since_last_purchase; a
# near-perfect score here would point to target leakage — confirm how `churn`
# is derived before trusting these numbers.
print("ROC-AUC:", roc_auc_score(y_test, y_pred_proba))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


ROC-AUC: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       361
           1       1.00      1.00      1.00       111

    accuracy                           1.00       472
   macro avg       1.00      1.00      1.00       472
weighted avg       1.00      1.00      1.00       472



In [7]:
from sklearn.model_selection import train_test_split

# Rebuild the feature matrix, this time also excluding days_since_last_purchase
# (presumably because it encodes the churn label almost directly — TODO confirm).
excluded_cols = ["churn", "days_since_last_purchase"]
y = df_encoded["churn"]
X = df_encoded.drop(columns=excluded_cols)

# Same 80/20 stratified split and seed as before, applied to the reduced features.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [8]:
# Refit the logistic regression on the current training split.
logit = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard labels and the column-1 probability score for the test split.
y_pred = logit.predict(X_test)
y_proba = logit.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", test_auc)
print(classification_report(y_test, y_pred))


ROC-AUC: 0.8854283646527413
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       361
           1       0.84      0.69      0.76       111

    accuracy                           0.90       472
   macro avg       0.87      0.83      0.85       472
weighted avg       0.89      0.90      0.89       472



In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, classification_report

# Single decision tree; depth and minimum split size are capped to limit
# how finely the tree can partition the training data.
dt_params = {"max_depth": 6, "min_samples_split": 30, "random_state": 42}
dt = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

# Hard labels and the column-1 probability score for the test split.
y_pred_dt = dt.predict(X_test)
y_proba_dt = dt.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba_dt))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_dt))


ROC-AUC: 0.8291282972723417

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.94      0.92       361
           1       0.77      0.68      0.72       111

    accuracy                           0.88       472
   macro avg       0.83      0.81      0.82       472
weighted avg       0.87      0.88      0.87       472



In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 300 trees. class_weight="balanced" reweights the classes
# by their frequency in the training labels.
rf_params = {
    "n_estimators": 300,
    "max_depth": 12,
    "min_samples_split": 20,
    "class_weight": "balanced",
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Hard labels and the column-1 probability score for the test split.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba_rf))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_rf))


ROC-AUC: 0.9024980659329689

Classification Report:

              precision    recall  f1-score   support

           0       0.94      0.89      0.92       361
           1       0.71      0.83      0.76       111

    accuracy                           0.88       472
   macro avg       0.83      0.86      0.84       472
weighted avg       0.89      0.88      0.88       472



In [11]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting: 300 shallow (depth-3) trees with a 0.05 learning rate.
gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "random_state": 42,
}
gb = GradientBoostingClassifier(**gb_params).fit(X_train, y_train)

# Hard labels and the column-1 probability score for the test split.
y_pred_gb = gb.predict(X_test)
y_proba_gb = gb.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba_gb))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_gb))


ROC-AUC: 0.8755833395722593

Classification Report:

              precision    recall  f1-score   support

           0       0.90      0.95      0.92       361
           1       0.80      0.66      0.72       111

    accuracy                           0.88       472
   macro avg       0.85      0.80      0.82       472
weighted avg       0.88      0.88      0.88       472



In [12]:
from sklearn.metrics import roc_auc_score, f1_score

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Compute ROC-AUC and F1 for a fitted classifier on both data splits.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features.
        y_test: Held-out labels.

    Returns:
        dict with the keys ``train_auc``, ``test_auc``, ``train_f1`` and
        ``test_f1``, inserted in exactly that order.
    """

    def _predict(features):
        # Hard labels, plus column 1 of predict_proba as the ranking score
        # (presumably the positive class — depends on the model's class order).
        return model.predict(features), model.predict_proba(features)[:, 1]

    train_labels, train_scores = _predict(X_train)
    test_labels, test_scores = _predict(X_test)

    # Insertion order matters: callers may read .values() positionally.
    metrics = {}
    metrics["train_auc"] = roc_auc_score(y_train, train_scores)
    metrics["test_auc"] = roc_auc_score(y_test, test_scores)
    metrics["train_f1"] = f1_score(y_train, train_labels)
    metrics["test_f1"] = f1_score(y_test, test_labels)
    return metrics


In [13]:
# Fitted estimators to compare, keyed by the label shown in the summary table.
fitted_models = {
    "Logistic Regression": logit,
    "Decision Tree": dt,
    "Random Forest": rf,
    "Gradient Boosting": gb,
}

# One row per model: [label, train_auc, test_auc, train_f1, test_f1].
# The metric order follows the insertion order of evaluate_model's dict.
results = [
    [label, *evaluate_model(estimator, X_train, y_train, X_test, y_test).values()]
    for label, estimator in fitted_models.items()
]


In [14]:
# Tabulate the per-model metric rows; column labels follow the positional
# layout of each row in `results` (model label first, then the four metrics).
comparison_columns = ["Model", "Train AUC", "Test AUC", "Train F1", "Test F1"]
comparison_df = pd.DataFrame(data=results, columns=comparison_columns)

comparison_df


Unnamed: 0,Model,Train AUC,Test AUC,Train F1,Test F1
0,Logistic Regression,0.894927,0.885428,0.698002,0.758621
1,Decision Tree,0.911645,0.829128,0.754802,0.717703
2,Random Forest,0.954972,0.902498,0.756121,0.763485
3,Gradient Boosting,0.960149,0.875583,0.801453,0.722772


In [15]:
import pickle

# Serialize the fitted random forest to rf_model.pkl in the working directory.
# Only the estimator object is written; nothing else is stored in the file.
with open("rf_model.pkl", mode="wb") as model_file:
    pickle.dump(rf, file=model_file)


In [1]:
# Environment check: display the installed MLflow version.
# NOTE(review): this cell's execution count restarts at 1 and no other visible
# cell uses mlflow — presumably run in a fresh kernel session; confirm whether
# it belongs in this notebook.
import mlflow
mlflow.__version__


'3.5.1'