In [6]:
import json
import random
from datetime import datetime, timedelta
from ucimlrepo import fetch_ucirepo, list_available_datasets
import numpy as np
import pandas as pd
from etl import UserGenerator
from feature_engineer import FeatureEngineer

# ETL

In [7]:
user_generator = UserGenerator(n_samples=25000)


In [8]:
ds = user_generator.create_dataset()
print(type(ds), isinstance(ds, tuple))

<class 'pandas.core.frame.DataFrame'> False


In [9]:
ds

Unnamed: 0,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...
541904,PACK OF 20 SPACEBOY NAPKINS,12,12/9/2011 12:50,0.85,12680.0,France
541905,CHILDREN'S APRON DOLLY GIRL,6,12/9/2011 12:50,2.10,12680.0,France
541906,CHILDRENS CUTLERY DOLLY GIRL,4,12/9/2011 12:50,4.15,12680.0,France
541907,CHILDRENS CUTLERY CIRCUS PARADE,4,12/9/2011 12:50,4.15,12680.0,France


In [10]:
ds = user_generator.run_etl()

In [11]:
ds.info

<bound method DataFrame.info of                                 Description  Quantity         InvoiceDate  \
0        WHITE HANGING HEART T-LIGHT HOLDER         6 2010-12-01 08:26:00   
1                       WHITE METAL LANTERN         6 2010-12-01 08:26:00   
2            CREAM CUPID HEARTS COAT HANGER         8 2010-12-01 08:26:00   
3       KNITTED UNION FLAG HOT WATER BOTTLE         6 2010-12-01 08:26:00   
4            RED WOOLLY HOTTIE WHITE HEART.         6 2010-12-01 08:26:00   
...                                     ...       ...                 ...   
541904          PACK OF 20 SPACEBOY NAPKINS        12 2011-12-09 12:50:00   
541905         CHILDREN'S APRON DOLLY GIRL          6 2011-12-09 12:50:00   
541906        CHILDRENS CUTLERY DOLLY GIRL          4 2011-12-09 12:50:00   
541907      CHILDRENS CUTLERY CIRCUS PARADE         4 2011-12-09 12:50:00   
541908        BAKING SET 9 PIECE RETROSPOT          3 2011-12-09 12:50:00   

        UnitPrice  CustomerID         Count

In [12]:
ds.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
Quantity,397884.0,12.988238,1.0,2.0,6.0,12.0,80995.0,179.331775
InvoiceDate,397884.0,2011-07-10 23:41:23.511023360,2010-12-01 08:26:00,2011-04-07 11:12:00,2011-07-31 14:39:00,2011-10-20 14:33:00,2011-12-09 12:50:00,
UnitPrice,397884.0,3.116488,0.001,1.25,1.95,3.75,8142.75,22.097877
CustomerID,397884.0,15294.423453,12346.0,13969.0,15159.0,16795.0,18287.0,1713.14156


In [13]:
ds.columns

Index(['Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID',
       'Country'],
      dtype='object')

In [14]:
# Información base
print("Fecha mínima:", ds["InvoiceDate"].min())
print("Fecha máxima:", ds["InvoiceDate"].max())
print("Clientes únicos:", ds["CustomerID"].nunique())
print("Productos únicos:", ds["Description"].nunique())
print("Países:", ds["Country"].nunique())
print(f"Rango de fechas: {ds['InvoiceDate'].min().date()} → {ds['InvoiceDate'].max().date()}")


Fecha mínima: 2010-12-01 08:26:00
Fecha máxima: 2011-12-09 12:50:00
Clientes únicos: 4338
Productos únicos: 3877
Países: 37
Rango de fechas: 2010-12-01 → 2011-12-09


# Feature Engineering

In [15]:
feature_engineer = FeatureEngineer(ds)

In [16]:
df_engineered = feature_engineer.run()


  .apply(self.historial_compra)   # <-- ahora acepta g


In [17]:
df_engineered

Unnamed: 0,Description,InvoiceDate,Country,Quantity,Revenue,UnitPrice,CustomerID,n_past_invoices,prev_date,recency_days,spend_prior,qty_prior,avg_ticket_prior,avg_qty_per_invoice_prior,next_date,days_to_next,y_repurchase_30d
0,MEDIUM CERAMIC TOP STORAGE JAR,2011-01-18 10:01:00,United Kingdom,74215,77183.60,1.04,12346,0,NaT,9999,0.00,0,0.000000,0.000000,NaT,,0
1,3D DOG PICTURE PLAYING CARDS,2010-12-07 14:57:00,Iceland,24,70.80,2.95,12347,0,NaT,9999,0.00,0,0.000000,0.000000,2010-12-07 14:57:00,0.0,1
2,AIRLINE BAG VINTAGE JET SET BROWN,2010-12-07 14:57:00,Iceland,4,17.00,4.25,12347,1,2010-12-07 14:57:00,0,70.80,24,70.800000,24.000000,2010-12-07 14:57:00,0.0,1
3,ALARM CLOCK BAKELIKE CHOCOLATE,2010-12-07 14:57:00,Iceland,4,15.00,3.75,12347,2,2010-12-07 14:57:00,0,87.80,28,43.900000,14.000000,2010-12-07 14:57:00,0.0,1
4,ALARM CLOCK BAKELIKE GREEN,2010-12-07 14:57:00,Iceland,4,15.00,3.75,12347,3,2010-12-07 14:57:00,0,102.80,32,34.266667,10.666667,2010-12-07 14:57:00,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
387655,SWISS CHALET TREE DECORATION,2011-10-12 10:23:00,United Kingdom,24,6.96,0.29,18287,63,2011-10-12 10:23:00,0,1739.84,1442,27.616508,22.888889,2011-10-12 10:23:00,0.0,1
387656,TREE T-LIGHT HOLDER WILLIE WINKIE,2011-10-12 10:23:00,United Kingdom,12,19.80,1.65,18287,64,2011-10-12 10:23:00,0,1746.80,1466,27.293750,22.906250,2011-10-28 09:29:00,15.0,1
387657,PAINTED METAL STAR WITH HOLLY BELLS,2011-10-28 09:29:00,United Kingdom,48,18.72,0.39,18287,65,2011-10-12 10:23:00,15,1766.60,1478,27.178462,22.738462,2011-10-28 09:29:00,0.0,1
387658,SET OF 3 WOODEN SLEIGH DECORATIONS,2011-10-28 09:29:00,United Kingdom,36,45.00,1.25,18287,66,2011-10-28 09:29:00,0,1785.32,1526,27.050303,23.121212,2011-10-28 09:29:00,0.0,1


# Modelando con MLFlow

In [18]:
import mlflow


experiment_name = "recompra-LogReg"
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name)

# Enable autologging for sklearn models
mlflow.sklearn.autolog(
    log_input_examples=True,
    log_model_signatures=True,
    log_models=True,
    disable=False,
    exclusive=False,
    disable_for_unsupported_versions=False,
    silent=False,
    max_tuning_runs=5
)

In [19]:
num_feats = [
    'recency_days','n_past_invoices','spend_prior','qty_prior',
    'avg_ticket_prior','avg_qty_per_invoice_prior','UnitPrice','Quantity','Revenue'
]
cat_feats = ['Country']

In [25]:
from sklearn.linear_model import LogisticRegression

from train_mlflow import TrainMlflow

model = LogisticRegression(max_iter=500)

# 3) Instancia y entrena
trainer = TrainMlflow(
    df=df_engineered,
    numeric_features=num_feats,
    categorical_features=cat_feats,
    target_column='y_repurchase_30d',
    model=model,
    mlflow_setup={"tracking_uri": "file:./mlruns", "experiment_name": "OnlineRetail"}
)

pipeline, run_id = trainer.train()
trainer.pipeline = pipeline                  # <- necesario para save_model()
trainer.save_model("models/model.pkl")       # ✅ Modelo guardado en models/model.pkl


Rango total: 2010-12-01 08:26:00 → 2011-12-09 12:50:00 | cutoff: 2011-11-09 12:50:00
train_end: 2011-09-01 00:00:00
train: 218546 | test: 102095
pos_rate train=0.972 | test=0.979




MLflow Run ID: b1535ba6ecdb4e2f9f952c181a75be13
Tracking URI: http://127.0.0.1:5000
Train Accuracy: 0.9720
Test Accuracy: 0.9786
🏃 View run calm-penguin-992 at: http://127.0.0.1:5000/#/experiments/2/runs/b1535ba6ecdb4e2f9f952c181a75be13
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2


AttributeError: 'TrainMlflow' object has no attribute 'save_model'