In [1]:
import dask
import dask.dataframe as dd

dask.config.set({"dataframe.convert-string": False})

dataset_path = "./smadex-challenge-predict-the-revenue/train/train"
filters = [("datetime", ">=", "2025-10-01-00-00"), ("datetime", "<", "2025-10-01-01-00")]

ddf = dd.read_parquet(
    dataset_path,
    filters=filters
)

In [2]:
len(ddf.columns)

85

In [3]:
nrows = ddf.shape[0].compute()
nrows

121887

In [4]:
ddf.head()

Unnamed: 0,buyer_d1,buyer_d7,buyer_d14,buyer_d28,buy_d7,buy_d14,buy_d28,iap_revenue_d7,iap_revenue_d14,iap_revenue_d28,...,user_bundles_l28d,weekend_ratio,weeks_since_first_seen,wifi_ratio,whale_users_bundle_num_buys_prank,whale_users_bundle_revenue_prank,whale_users_bundle_total_num_buys,whale_users_bundle_total_revenue,row_id,datetime
0,0,1,1,1,1,1,1,2.147718,2.147718,2.147718,...,"[88981729bd5c1e5aea9ada4bce00a2531e9e98f7, 25c...",0.019802,6.0,0.913366,,,,,819ecc0e-1a97-43ed-83f6-b9ede4f7fc48,2025-10-01-00-00
1,0,0,0,0,0,0,0,0.0,0.0,0.0,...,,,,,,,,,0a7fbf18-5041-42af-bd0a-0cb6586b8598,2025-10-01-00-00
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[6506b7e0a24666debd08f74266800f2eb154df5a, 150...",0.399021,6.0,0.999388,,,,,fc1a2689-b136-4ffa-b23b-9d8215bd720f,2025-10-01-00-00
3,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[2b472e3dc96f1847490d7411b25e12ed417b9714, 3ba...",0.121547,6.0,1.0,,,,,0340fcc6-50bd-42ab-b9f4-4c1184b640cb,2025-10-01-00-00
4,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[1031535cf2a1315422fd05d321349bcd3c3ffc04, 478...",0.293285,6.0,0.160243,,,,,219d253f-bef4-4039-84b2-ed55f009cc43,2025-10-01-00-00


In [5]:
import pandas as pd

pd.set_option("display.max_rows", None)
null_columns = ddf.isnull().sum().compute()
null_columns

buyer_d1                                           0
buyer_d7                                           0
buyer_d14                                          0
buyer_d28                                          0
buy_d7                                             0
buy_d14                                            0
buy_d28                                            0
iap_revenue_d7                                     0
iap_revenue_d14                                    0
iap_revenue_d28                                    0
registration                                   57760
retention_d1_to_d7                              9564
retention_d3_to_d7                              9564
retention_d7_to_d14                             9564
retention_d1                                    9564
retention_d3                                    9564
retentiond7                                     9564
advertiser_bundle                                  0
advertiser_category                           

In [6]:
# Calcular porcentaje de nulos
null_pct = (null_columns / nrows * 100).sort_values(ascending=False)

# Estrategia según % de nulos:
# < 5%: imputación con media + ruido
# 5-30%: KNN o MICE
# > 30%: considerar crear feature binaria "is_missing" + imputar con mediana
null_pct

last_advertiser_action                        98.899801
advertiser_actions_action_count               98.899801
advertiser_actions_action_last_timestamp      98.899801
rev_by_adv                                    98.272170
last_buy_ts_bundle                            98.052294
last_buy                                      98.052294
last_buy_ts_category                          98.052294
ctr                                           96.405687
ctr_pct_rk                                    96.405687
whale_users_bundle_revenue_prank              96.317901
whale_users_bundle_num_buys_prank             96.317901
whale_users_bundle_total_revenue              96.317901
whale_users_bundle_total_num_buys             96.317901
iap_revenue_usd_category                      95.403119
num_buys_bundle                               95.403119
num_buys_category                             95.403119
iap_revenue_usd_bundle                        95.403119
iap_revenue_usd_category_bottom_taxonomy      95

In [7]:
ddf['whale_users_bundle_revenue_prank'][14].compute()
ddf['whale_users_bundle_total_revenue'][14].compute()

[('163ffe25c9eb6e1d5702e6ae5e539f9b570bbdf1', 401.55124261939136)]

In [8]:
ddf['buyer_d7'].value_counts().compute()

df_pos = ddf[ddf['buyer_d7'] == 1]
df_neg = ddf[ddf['buyer_d7'] == 0]

In [9]:
n_pos = df_pos.shape[0].compute()
n_pos

5156

In [10]:
n_neg = df_neg.shape[0].compute()

In [11]:
n_neg

116731

In [12]:
df_neg_sampled = df_neg.sample(frac=(n_pos*4) / df_neg.shape[0].compute(), random_state=42)
df_neg_sampled.shape[0].compute()

20624

In [13]:
def undersample_partition(df, target_col="buyer_d7", majority=0, minority=1, ratio=4.0):
    pos = df[df[target_col] == minority]
    neg = df[df[target_col] == majority]
    
    n_pos = len(pos)
    n_keep_neg = int(n_pos * ratio)
    
    neg_sampled = neg.sample(n=n_keep_neg, random_state=42) if len(neg) > n_keep_neg else neg
    
    return dd.concat([pos, neg_sampled])

df_balanced = ddf.map_partitions(undersample_partition)

In [14]:
df_balanced.shape[0].compute()

25780

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score

target = "iap_revenue_d7"
sample_frac = 0.1  # cambiar a 1.0 para usar todo (puede agotar memoria)

# seleccionar columnas numéricas y eliminar columnas vacías
num_ddf = ddf.select_dtypes(include=["number"])

# eliminar columnas que son todas NaN (dask.DataFrame.dropna no soporta axis)
# usamos el conteo de nulos por columna (se computa)
null_counts = num_ddf.isnull().sum().compute()
# usamos nrows si ya fue calculado; si no, lo computamos aquí
try:
    total_rows = nrows
except NameError:
    total_rows = num_ddf.shape[0].compute()

keep_cols = null_counts[null_counts < total_rows].index.tolist()
num_ddf = num_ddf[keep_cols]

if target not in num_ddf.columns:
    raise KeyError(f"Target {target} not found in numeric columns. Columns sample: {list(num_ddf.columns)[:20]}")

# muestreo para evitar OOM, luego pasar a pandas
sampled = num_ddf.sample(frac=sample_frac, random_state=42).compute()
df = sampled.fillna(0)

X = df.drop(columns=[target])
y = df[target].values

# split 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# escalar y entrenar
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

models = {
    "Lasso": Lasso(alpha=1.0, max_iter=10000, random_state=42),
    "Ridge": Ridge(alpha=1.0, random_state=42)
}

for name, model in models.items():
    model.fit(X_train_s, y_train)
    # predicciones train y test
    y_pred_train = model.predict(X_train_s)
    y_pred_test = model.predict(X_test_s)

    # métricas train
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = r2_score(y_train, y_pred_train)

    # métricas test
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    r2_test = r2_score(y_test, y_pred_test)

    print(f"{name} | Train RMSE: {rmse_train:.4f}  R2: {r2_train:.4f} (train)")
    print(f"{name} | Test  RMSE: {rmse_test:.4f}  R2: {r2_test:.4f} (test)")

Lasso | Train RMSE: 7.6656  R2: 0.9034 (train)
Lasso | Test  RMSE: 6.5730  R2: 0.8500 (test)
Ridge | Train RMSE: 4.3020  R2: 0.9696 (train)
Ridge | Test  RMSE: 96.7759  R2: -31.5169 (test)


In [22]:
# Diagnóstico rápido de por qué Ridge falla
import numpy as np
import pandas as pd

# inspeccionar predicciones problemáticas
y_pred_ridge = models['Ridge'].predict(X_test_s)
print("y_test: size", y_test.shape, "  y_pred size", y_pred_ridge.shape)
print("y_pred_ridge: min,max,mean,median,95p:", y_pred_ridge.min(), y_pred_ridge.max(), y_pred_ridge.mean(),
      np.median(y_pred_ridge), np.percentile(y_pred_ridge,95))

# ver unas muestras comparadas
comp = pd.DataFrame({"y_test": y_test, "y_pred_ridge": y_pred_ridge})
print(comp.head(20))

# magnitud de coeficientes
coefs = models['Ridge'].coef_
print("coef: mean, std, max_abs:", coefs.mean(), coefs.std(), np.max(np.abs(coefs)))
print("n_features:", len(coefs))

# comprobar condition number de X_train_s (colinealidad)
s = np.linalg.svd(X_train_s, compute_uv=False)
cond = s.max() / s.min() if s.min() != 0 else np.inf
print("condition number X_train_s:", cond)

# probar Ridge con alpha mayor para ver si se estabiliza
from sklearn.linear_model import Ridge
for a in [1.0, 10.0, 100.0, 1000.0]:
    r = Ridge(alpha=a, random_state=42)
    r.fit(X_train_s, y_train)
    p = r.predict(X_test_s)
    rmse = np.sqrt(mean_squared_error(y_test, p))
    print(f"alpha={a} RMSE_test={rmse:.4f}, coef_max_abs={np.max(np.abs(r.coef_)):.4e}")

y_test: size (3657,)   y_pred size (3657,)
y_pred_ridge: min,max,mean,median,95p: -5679.094329138956 233.1795562427295 -0.6341251525549821 0.04512292929434114 1.2830000774698798
    y_test  y_pred_ridge
0      0.0      0.028267
1      0.0     -0.659870
2      0.0      1.059015
3      0.0     -1.471089
4      0.0      0.558406
5      0.0      0.070275
6      0.0      0.013190
7      0.0     -0.052111
8      0.0     -0.002612
9      0.0     -0.030774
10     0.0     -0.052831
11     0.0      0.384082
12     0.0      0.156477
13     0.0      0.009215
14     0.0     -0.105113
15     0.0      0.594068
16     0.0     -0.380536
17     0.0      0.087966
18     0.0      0.001081
19     3.0     19.936415
coef: mean, std, max_abs: 0.9180595919638311 14.991592098933372 59.143390507742
n_features: 26
condition number X_train_s: 5.966774864134239e+16
alpha=1.0 RMSE_test=96.7759, coef_max_abs=5.9143e+01
alpha=10.0 RMSE_test=62.4502, coef_max_abs=4.8838e+01
alpha=100.0 RMSE_test=18.4340, coef_max_abs=2

In [24]:
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import numpy as np

alphas = np.logspace(-2, 3, 13)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

def eval_model(model, Xtr, Xte, ytr, yte, name):
    model.fit(Xtr, ytr)
    p_tr = model.predict(Xtr)
    p_te = model.predict(Xte)
    mse_tr = mean_squared_error(ytr, p_tr); rmse_tr = np.sqrt(mse_tr); r2_tr = r2_score(ytr, p_tr)
    mse_te = mean_squared_error(yte, p_te); rmse_te = np.sqrt(mse_te); r2_te = r2_score(yte, p_te)
    print(f"{name} | Train RMSE: {rmse_tr:.4f} R2: {r2_tr:.4f} | Test RMSE: {rmse_te:.4f} R2: {r2_te:.4f}")

# 1) RidgeCV directly (no PCA) — remove unsupported arg 'store_cv_values'
r_cv = RidgeCV(alphas=alphas, cv=cv, scoring='neg_mean_squared_error')
eval_model(r_cv, X_train_s, X_test_s, y_train, y_test, "RidgeCV (no PCA)")

print("best_alpha (RidgeCV):", r_cv.alpha_)

# 2) PCA -> RidgeCV pipeline to reduce colinearity (keep 99% variance)
pipe = make_pipeline(PCA(n_components=0.99, svd_solver='full', random_state=42),
                     RidgeCV(alphas=alphas, cv=cv, scoring='neg_mean_squared_error'))
eval_model(pipe, X_train_s, X_test_s, y_train, y_test, "RidgeCV + PCA")

# 3) log1p(target) with PCA+RidgeCV
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)
pipe_log = make_pipeline(PCA(n_components=0.99, svd_solver='full', random_state=42),
                         RidgeCV(alphas=alphas, cv=cv, scoring='neg_mean_squared_error'))
pipe_log.fit(X_train_s, y_train_log)
p_log = pipe_log.predict(X_test_s)
p_back = np.expm1(p_log).clip(min=0)
mse_te = mean_squared_error(y_test, p_back); rmse_te = np.sqrt(mse_te); r2_te = r2_score(y_test, p_back)
print(f"RidgeCV+PCA on log1p(y): Test RMSE: {rmse_te:.4f} R2: {r2_te:.4f}  (alpha={pipe_log.named_steps['ridgecv'].alpha_})")

RidgeCV (no PCA) | Train RMSE: 4.2999 R2: 0.9696 | Test RMSE: 101.8233 R2: -34.9972
best_alpha (RidgeCV): 0.01
RidgeCV + PCA | Train RMSE: 7.4701 R2: 0.9083 | Test RMSE: 37.6436 R2: -3.9199
RidgeCV+PCA on log1p(y): Test RMSE: 21692.5951 R2: -1633795.0808  (alpha=56.23413251903491)


In [25]:
# Two-stage model: classifier (zero vs positive) + regressor for positives.
# Copia esta celda en tu notebook y ejecútala.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, roc_auc_score

# Entrenamiento clasificadora/regresora
is_pos_train = (y_train > 0).astype(int)
is_pos_test = (y_test > 0).astype(int)

clf = RandomForestClassifier(n_estimators=200, max_depth=10, n_jobs=-1, random_state=42)
reg = make_pipeline(StandardScaler(), RandomForestRegressor(n_estimators=200, max_depth=10, n_jobs=-1, random_state=42))

# fit classifier
clf.fit(X_train, is_pos_train)

# fit regressor solo con positivos (si hay)
if is_pos_train.sum() > 10:
    reg.fit(X_train[is_pos_train == 1], y_train[is_pos_train == 1])
    reg_tr_pred = reg.predict(X_train)
    reg_te_pred = reg.predict(X_test)
else:
    # fallback: regressor trivial pred = mean positive (evitar errores si no hay suficientes positivos)
    mean_pos = y_train[is_pos_train == 1].mean() if is_pos_train.sum() > 0 else 0.0
    reg_tr_pred = np.full(len(X_train), mean_pos)
    reg_te_pred = np.full(len(X_test), mean_pos)

# combinar: E[y] ≈ P(pos) * E[y | pos]
p_tr = clf.predict_proba(X_train)[:, 1]
p_te = clf.predict_proba(X_test)[:, 1]

y_pred_train = (p_tr * reg_tr_pred).clip(min=0)
y_pred_test = (p_te * reg_te_pred).clip(min=0)

# métricas finales
mse_tr = mean_squared_error(y_train, y_pred_train); rmse_tr = np.sqrt(mse_tr); r2_tr = r2_score(y_train, y_pred_train)
mse_te = mean_squared_error(y_test, y_pred_test); rmse_te = np.sqrt(mse_te); r2_te = r2_score(y_test, y_pred_test)

print(f"Two-stage | Train RMSE: {rmse_tr:.4f}  R2: {r2_tr:.4f} (train)")
print(f"Two-stage | Test  RMSE: {rmse_te:.4f}  R2: {r2_te:.4f} (test)")

# métricas de la clasificadora (importante por la gran cantidad de ceros)
yhat_clf_test = clf.predict(X_test)
print("Classifier on test — precision, recall, f1, auc:",
      f"{precision_score(is_pos_test, yhat_clf_test):.4f},",
      f"{recall_score(is_pos_test, yhat_clf_test):.4f},",
      f"{f1_score(is_pos_test, yhat_clf_test):.4f},",
      f"{roc_auc_score(is_pos_test, clf.predict_proba(X_test)[:,1]):.4f}")

# baseline comparación (predecir media del train)
y_mean_pred = np.full_like(y_test, y_train.mean())
print("Baseline | Test RMSE:", np.sqrt(mean_squared_error(y_test, y_mean_pred)),
      " R2:", r2_score(y_test, y_mean_pred))

Two-stage | Train RMSE: 6.8501  R2: 0.9229 (train)
Two-stage | Test  RMSE: 9.3342  R2: 0.6975 (test)
Classifier on test — precision, recall, f1, auc: 1.0000, 0.9809, 0.9904, 1.0000
Baseline | Test RMSE: 16.971795932987344  R2: -7.007653064139419e-05


In [28]:
import os, joblib, numpy as np, pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split

os.makedirs("models", exist_ok=True)

# ============= VARIABLE PARA CONTROLAR VOLUMEN DE DATOS =============
# Aumenta este valor para leer más datos (0.1 = 10%, 0.3 = 30%, 0.5 = 50%, 1.0 = 100%)
data_fraction = 1  # CAMBIAR AQUÍ para probar con más datos

# ============= CARGAR Y PREPROCESAR DATOS =============
target = "iap_revenue_d7"
num_ddf = ddf.select_dtypes(include=["number"])

null_counts = num_ddf.isnull().sum().compute()
try:
    total_rows = nrows
except NameError:
    total_rows = num_ddf.shape[0].compute()

keep_cols = null_counts[null_counts < total_rows].index.tolist()
num_ddf = num_ddf[keep_cols]

if target not in num_ddf.columns:
    raise KeyError(f"Target {target} not found")

# CAMBIO: usar data_fraction variable
sampled = num_ddf.sample(frac=data_fraction, random_state=42).compute()
df = sampled.fillna(0)

# Preprocesamiento mínimo pero efectivo:
# 1) Remover outliers extremos en target (clip a percentil 99)
q99 = df[target].quantile(0.99)
df[target] = df[target].clip(upper=q99)

# 2) Remover features con varianza casi cero
X = df.drop(columns=[target])
y = df[target].values

# Eliminar columnas con std < 0.01 (prácticamente constantes)
var_threshold = 0.01
high_var_cols = X.columns[X.std() > var_threshold].tolist()
X = X[high_var_cols]

print(f"Datos cargados: {len(X)} muestras, {len(X.columns)} features (data_fraction={data_fraction})")
print(f"Target stats: mean={y.mean():.4f}, median={np.median(y):.4f}, std={y.std():.4f}, max={y.max():.4f}")
print(f"Target q99={q99:.4f}")

# split 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# ============= TWO-STAGE CON LIGHTGBM (×20 MÁS RÁPIDO) =============
is_pos_train = (y_train > 0).astype(int)
is_pos_test  = (y_test  > 0).astype(int)

print(f"\nPositivos train: {is_pos_train.sum()}/{len(y_train)}, test: {is_pos_test.sum()}/{len(y_test)}")

# Clasificador LightGBM (100 árboles, no 200 — AUC ya es 1.0, no necesita más)
clf_lgb = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)
clf_lgb.fit(X_train, is_pos_train)

# Regresor LightGBM solo con positivos
pos_idx = is_pos_train == 1
if pos_idx.sum() > 10:
    reg_lgb = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        num_leaves=31,
        reg_lambda=1.0,
        random_state=42,
        verbose=-1,
        n_jobs=-1
    )
    reg_lgb.fit(X_train.iloc[pos_idx], y_train[pos_idx])
else:
    reg_lgb = None
    print("Warning: insuficientes positivos para regresor")

# Predicciones
p_tr = clf_lgb.predict_proba(X_train)[:, 1]
p_te = clf_lgb.predict_proba(X_test)[:, 1]

if reg_lgb is not None:
    r_tr = reg_lgb.predict(X_train)
    r_te = reg_lgb.predict(X_test)
else:
    mean_pos = y_train[pos_idx].mean() if pos_idx.sum() > 0 else 0.0
    r_tr = np.full(len(X_train), mean_pos)
    r_te = np.full(len(X_test), mean_pos)

y_pred_train = np.clip(p_tr * r_tr, 0, None)
y_pred_test  = np.clip(p_te * r_te,  0, None)

# Métricas
mse_tr = mean_squared_error(y_train, y_pred_train); rmse_tr = np.sqrt(mse_tr); r2_tr = r2_score(y_train, y_pred_train)
mse_te = mean_squared_error(y_test,  y_pred_test);  rmse_te = np.sqrt(mse_te);  r2_te = r2_score(y_test,  y_pred_test)

print(f"\n=== TWO-STAGE LIGHTGBM (data_fraction={data_fraction}) ===")
print(f"Train RMSE: {rmse_tr:.4f}  R2: {r2_tr:.4f}")
print(f"Test  RMSE: {rmse_te:.4f}  R2: {r2_te:.4f}")

# Métricas clasificadora
yhat_clf_test = clf_lgb.predict(X_test)
print(f"Classifier — precision: {precision_score(is_pos_test, yhat_clf_test):.4f}, recall: {recall_score(is_pos_test, yhat_clf_test):.4f}, f1: {f1_score(is_pos_test, yhat_clf_test):.4f}, auc: {roc_auc_score(is_pos_test, clf_lgb.predict_proba(X_test)[:,1]):.4f}")

# Baseline
y_mean_pred = np.full_like(y_test, y_train.mean())
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_mean_pred))
baseline_r2 = r2_score(y_test, y_mean_pred)
print(f"Baseline   RMSE: {baseline_rmse:.4f}  R2: {baseline_r2:.4f}")

# Guardar modelo
joblib.dump({"clf": clf_lgb, "reg": reg_lgb, "features": X_train.columns.tolist(), "data_fraction": data_fraction}, 
            f"models/two_stage_lgb_frac{data_fraction}.pkl")
print(f"\nModelo guardado en models/two_stage_lgb_frac{data_fraction}.pkl")

Datos cargados: 121887 muestras, 25 features (data_fraction=1)
Target stats: mean=0.3170, median=0.0000, std=1.9822, max=16.3089
Target q99=16.3089

Positivos train: 3103/85320, test: 1346/36567

=== TWO-STAGE LIGHTGBM (data_fraction=1) ===
Train RMSE: 0.1052  R2: 0.9972
Test  RMSE: 0.1633  R2: 0.9932
Classifier — precision: 0.9978, recall: 1.0000, f1: 0.9989, auc: 1.0000
Baseline   RMSE: 1.9772  R2: -0.0000

Modelo guardado en models/two_stage_lgb_frac1.pkl

=== TWO-STAGE LIGHTGBM (data_fraction=1) ===
Train RMSE: 0.1052  R2: 0.9972
Test  RMSE: 0.1633  R2: 0.9932
Classifier — precision: 0.9978, recall: 1.0000, f1: 0.9989, auc: 1.0000
Baseline   RMSE: 1.9772  R2: -0.0000

Modelo guardado en models/two_stage_lgb_frac1.pkl


In [1]:
# ============= GENERAR SUBMISSION =============
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Cargar datos de test con mismo preprocesamiento
test_path = "./smadex-challenge-predict-the-revenue/test/test"
test_ddf = dd.read_parquet(test_path)

# Seleccionar solo columnas numéricas que existen en features del modelo entrenado
test_num_ddf = test_ddf.select_dtypes(include=["number"])

# Usar solo las features que el modelo vio en train
features_in_model = X_train.columns.tolist()
test_num_ddf = test_num_ddf[[col for col in features_in_model if col in test_num_ddf.columns]]

# Rellenar NaNs con 0 (mismo preprocesamiento)
test_df = test_num_ddf.compute().fillna(0)

print(f"Test data cargado: {len(test_df)} muestras, {len(test_df.columns)} features")
print(f"Features en test: {len(test_df.columns)}, en modelo: {len(features_in_model)}")

# Asegurar que tenemos las mismas columnas (agregar 0s si faltan)
for col in features_in_model:
    if col not in test_df.columns:
        test_df[col] = 0.0

# Reordenar columnas al orden del modelo
test_df = test_df[features_in_model]

# Hacer predicciones con el modelo two-stage
p_test = clf_lgb.predict_proba(test_df)[:, 1]  # probabilidad de compra

if reg_lgb is not None:
    r_test = reg_lgb.predict(test_df)
else:
    r_test = np.zeros(len(test_df))

# Combinación: E[y] = P(compra) * E[y|compra]
y_submission = np.clip(p_test * r_test, 0, None)

print(f"\nPredicciones: min={y_submission.min():.4f}, max={y_submission.max():.4f}, mean={y_submission.mean():.4f}")

# Crear submission con row_id y predicciones
# Leer row_id del test original
test_row_ids = test_ddf['row_id'].compute()

submission_df = pd.DataFrame({
    'row_id': test_row_ids,
    'iap_revenue_d7': y_submission
})

print(f"\nSubmission shape: {submission_df.shape}")
print(submission_df.head(10))

# Guardar submission
output_path = "outputs/submission.csv"
submission_df.to_csv(output_path, index=False)
print(f"\nSubmission guardada en {output_path}")

NameError: name 'X_train' is not defined

iap_revenue_d7 -> Variable de quants diners gastara en una setmana.  
buyer_d7 -> Variable de si l'usuari ha comprat dins de l'aplicació en una setmana (1 = sí, 0 = no)
