In [1]:
import warnings
warnings.filterwarnings("ignore")

# Importamos dataset dummy

In [2]:
import pandas as pd

df = pd.read_csv("../data/dummy_data.csv")

X = df.drop(columns="target")
y = df["target"]

model_columns = X.columns

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

df.head(5)

Unnamed: 0,age,income,visits_last_30d,time_on_site_min,previous_purchases,target
0,56,2998,16,8.27,4,1
1,46,4597,5,6.5,5,1
2,32,2905,9,4.73,0,0
3,25,2276,13,9.44,1,1
4,38,4654,11,6.41,1,1


# Entrenamiento del "modelo 1" (Regresión Logística)

In [3]:

# Modelo 1
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()
model1.fit(X_train, y_train)

y_pred = model1.predict(X_test)
y_prob = model1.predict_proba(X_test)[:, 1]

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

print(classification_report(y_test, y_pred))

coefficients = pd.DataFrame({
    "feature": X.columns,
    "coef": model1.coef_[0]
}).sort_values("coef", ascending=False)

print(coefficients)

Accuracy: 0.9
ROC AUC: 0.8400000000000001
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.88      1.00      0.94        15

    accuracy                           0.90        20
   macro avg       0.94      0.80      0.84        20
weighted avg       0.91      0.90      0.89        20

              feature      coef
4  previous_purchases  0.996003
2     visits_last_30d  0.207661
1              income  0.000715
0                 age -0.009481
3    time_on_site_min -0.017066


# Entrenamiento del "modelo 2" (Random Forest Classifier)

In [4]:
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier()
model2.fit(X_train, y_train)

y_pred = model2.predict(X_test)
y_prob = model2.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))

print(classification_report(y_test, y_pred))

coefficients = pd.DataFrame({
    "feature": X.columns,
    "coef": model2.feature_importances_
}).sort_values("coef", ascending=False)

print(coefficients) 

Accuracy: 0.8
ROC AUC: 0.8933333333333333
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.87      0.87      0.87        15

    accuracy                           0.80        20
   macro avg       0.73      0.73      0.73        20
weighted avg       0.80      0.80      0.80        20

              feature      coef
1              income  0.252557
4  previous_purchases  0.218161
3    time_on_site_min  0.192358
0                 age  0.177914
2     visits_last_30d  0.159010


# Predecimos sobre nuevos clientes dummy con ambos modelos

In [5]:
# predict on new 5 customers

new_customers = pd.DataFrame({
    "age": [25, 35, 45, 55, 65],
    "income": [1000, 4000, 5000, 20000, 7000],
    "visits_last_30d": [10, 9, 8, 7, 6],
    "time_on_site_min": [5, 4, 3, 5, 1],
    "previous_purchases": [1, 2, 3, 4, 5]
})

new_customers

Unnamed: 0,age,income,visits_last_30d,time_on_site_min,previous_purchases
0,25,1000,10,5,1
1,35,4000,9,4,2
2,45,5000,8,3,3
3,55,20000,7,5,4
4,65,7000,6,1,5


In [6]:
new_customers["pred_model_1"] = model1.predict_proba(new_customers[model_columns])[:,1]
new_customers["pred_model_2"] = model2.predict_proba(new_customers[model_columns])[:,1]

new_customers

Unnamed: 0,age,income,visits_last_30d,time_on_site_min,previous_purchases,pred_model_1,pred_model_2
0,25,1000,10,5,1,0.367873,0.38
1,35,4000,9,4,2,0.909936,0.87
2,45,5000,8,3,3,0.976751,0.85
3,55,20000,7,5,4,1.0,0.96
4,65,7000,6,1,5,0.998625,0.87


# Modelos en la vida real (modelos en producción con mlflow)

# Inicializamos mlflow

In [8]:
import mlflow

from mlflow.models import infer_signature

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, roc_auc_score

mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment("random_forest_grid_search5")

2026/02/21 23:09:44 INFO mlflow.tracking.fluent: Experiment with name 'random_forest_grid_search5' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/eduardo/Documents/ds_projects/repositorios_eduardomelog/mlflow_ninja/notebooks/../mlruns/581875453308518760', creation_time=1771736984749, experiment_id='581875453308518760', last_update_time=1771736984749, lifecycle_stage='active', name='random_forest_grid_search5', tags={}, workspace='default'>

In [9]:
# -----------------------------
# 2. Grid sencillo
# -----------------------------
param_grid = {
    "n_estimators": [50, 100],
    "max_depth": [3, 5, None],
    "min_samples_split": [2, 5]
}

In [10]:

with mlflow.start_run(run_name="random_forest_grid"):

    for params in ParameterGrid(param_grid):

        with mlflow.start_run(nested=True):

            model = RandomForestClassifier(
                **params,
                random_state=42,
                n_jobs=-1
            )

            model.fit(X_train, y_train)

            preds = model.predict(X_test)

            acc = accuracy_score(y_test, preds)
            auc = roc_auc_score(y_test, preds)

            mlflow.log_params(params)

            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("auc", auc)

            signature = infer_signature(X_train, model.predict(X_train))

            mlflow.sklearn.log_model(
                sk_model=model,
                name="model",
                signature=signature,
                input_example=X_train.iloc[:5]
            )



In [11]:
# Buscar el parent run
parent = mlflow.search_runs(
    filter_string="tags.mlflow.runName = 'random_forest_grid'"
).iloc[0]

parent_run_id = parent.run_id

# Obtener subruns
runs = mlflow.search_runs(
    filter_string=f"tags.mlflow.parentRunId = '{parent_run_id}'"
)

# Quedarnos solo con métricas
results = runs[[
    "run_id",
    "metrics.accuracy",
    "metrics.auc"
]].rename(columns={
    "metrics.accuracy": "accuracy",
    "metrics.auc": "auc"
})

#print(results)

# Ordenar por accuracy
results = results.sort_values("accuracy", ascending=False)

print(results)

# Elegir champion
champion_run_id = results.iloc[0]["run_id"]

# Cargar modelo campeón
champion_model = mlflow.sklearn.load_model(
    f"runs:/{champion_run_id}/model"
)

                              run_id  accuracy       auc
8   342c5f8d6faa49fea5a078fabcbfdf9a      0.90  0.800000
9   a95058f3655342e8bf248e3d180456b8      0.90  0.800000
10  f2b1759e76a0449caa3db9d27c727220      0.90  0.800000
11  648c092149ac4e3db1c2a8adbad262b0      0.90  0.800000
1   d1186818ea5348999ae29821e564d7d1      0.85  0.766667
2   075454a20445401ab41ede094ca6f19d      0.85  0.766667
3   17488688134141d3b0e47077eac5af62      0.85  0.766667
4   0db9ef29eed84d6fa3a7b235670024ca      0.85  0.766667
5   e39aa340550b4ea0b8f62b71f33a1933      0.85  0.766667
6   fe9d281344364270a73a763068f38f92      0.85  0.766667
7   705e0350ba6b4b46bc376249e8b16f7f      0.85  0.766667
0   ba5f99fd1e2f4ae1bbbcc7b59e08c6c5      0.80  0.733333


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

In [12]:

# Predecir
new_customers["pred_champion"] = champion_model.predict_proba(new_customers[model_columns])[:,1]
new_customers

Unnamed: 0,age,income,visits_last_30d,time_on_site_min,previous_purchases,pred_model_1,pred_model_2,pred_champion
0,25,1000,10,5,1,0.367873,0.38,0.470129
1,35,4000,9,4,2,0.909936,0.87,0.864982
2,45,5000,8,3,3,0.976751,0.85,0.813714
3,55,20000,7,5,4,1.0,0.96,0.93641
4,65,7000,6,1,5,0.998625,0.87,0.810546
