## 1. Load & Explore Data

In [1]:
import glob
import os

import dask
import dask.dataframe as dd
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

# Keep string columns as object dtype instead of dask's pyarrow-backed strings.
dask.config.set({"dataframe.convert-string": False})

# Load train data (lazily), restricted by the `datetime` filters below.
dataset_path = "./smadex-challenge-predict-the-revenue/train/train"
filters = [("datetime", ">=", "2025-10-01-00-00"), ("datetime", "<", "2025-10-02-00-00")]
ddf = dd.read_parquet(dataset_path, filters=filters)

# Evaluate every summary statistic in a single dask.compute call so the
# parquet data is scanned once, instead of once per .compute()/len() call.
revenue = ddf['iap_revenue_d7']
n_rows, n_zeros, target_stats = dask.compute(
    ddf.shape[0],
    (revenue == 0).sum(),
    revenue.describe(),
)

print(f"Dataset: {n_rows} rows, {len(ddf.columns)} columns")
print(f"Target stats (iap_revenue_d7):")
print(target_stats)
print(f"\nZeros: {n_zeros} ({n_zeros / n_rows * 100:.1f}%)")

Dataset: 3347176 rows, 85 columns
Target stats (iap_revenue_d7):
count    3.347176e+06
mean     1.902049e+00
std      6.187249e+02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      6.966221e+05
Name: iap_revenue_d7, dtype: float64
count    3.347176e+06
mean     1.902049e+00
std      6.187249e+02
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      6.966221e+05
Name: iap_revenue_d7, dtype: float64

Zeros: 3252023 (97.2%)

Zeros: 3252023 (97.2%)


## 2. Feature Engineering (numeric features, variance-filtered)

In [None]:
# Load a small sample into pandas for quick exploration.
sample_frac = 0.15  # 15% for fast iteration
df = ddf.sample(frac=sample_frac, random_state=42).compute()

print(f"Sample: {len(df)} rows")
print(f"Columns: {list(df.columns)[:15]}")

# === PREPARE FEATURES ===
target = "iap_revenue_d7"
y = df[target].values  # NumPy array (not a pandas Series)

# Select the base numeric columns, excluding the target and the row identifier.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c != target and c not in ['row_id']]
# NOTE(review): the column list printed by this cell contains other outcome
# columns (e.g. iap_revenue_d14, iap_revenue_d28, buyer_d7) that end up in
# numeric_cols. They look like post-install labels that could leak the target
# and may not exist at inference time -- confirm and exclude them if so.

print(f"\nNumeric features ({len(numeric_cols)}): {numeric_cols[:10]}...")

# Fill missing values with 0. This does not drop all-NaN columns directly:
# they become constant zeros and are removed by the variance filter below.
X = df[numeric_cols].fillna(0)

# Remove near-constant columns (standard deviation at or below the threshold).
var_threshold = 0.01
high_var_cols = X.columns[X.std() > var_threshold].tolist()
X = X[high_var_cols]

# 99th percentile of the positive targets (analysis only; y is not modified).
# y is a NumPy array, which has no .quantile method, so use np.quantile.
positive_revenue = y[y > 0]
q99 = np.quantile(positive_revenue, 0.99) if positive_revenue.size else 0
print(f"\n99th percentile of positive revenue: {q99:.2f}")
print(f"Feature shape: {X.shape}")

Sample: 502077 rows
Columns: ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28', 'registration', 'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14', 'retention_d1']

Numeric features (26): ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d14', 'iap_revenue_d28', 'registration']...


## 3. Transform Target (LOG1P)

In [None]:
# Log-transform the target: squared error on log1p(y) is the MSLE objective.
y_log = np.log1p(y)

print(f"Original y: min={y.min():.2f}, max={y.max():.2f}, mean={y.mean():.2f}, median={np.median(y):.2f}")
print(f"Log-transformed y: min={y_log.min():.4f}, max={y_log.max():.4f}, mean={y_log.mean():.4f}, median={np.median(y_log):.4f}")

# Train/val split (80/20). Features, the log target and the raw target are
# split in ONE call so all three are guaranteed to share the same row
# partition, instead of relying on two calls with a matching random_state.
# The raw-scale y_train / y_val are kept for MSLE evaluation.
X_train, X_val, y_train_log, y_val_log, y_train, y_val = train_test_split(
    X, y_log, y, test_size=0.2, random_state=42
)

print(f"\nTrain: {len(X_train)} | Val: {len(X_val)}")

## 4. Train LightGBM (Optimized for MSLE)

In [None]:
# === LightGBM MODEL IN LOG-SPACE ===
# L2 loss on the log1p-transformed target corresponds to optimizing MSLE.
model = lgb.LGBMRegressor(
    objective='regression_l2',
    metric='mse',
    num_leaves=31,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=300,
    subsample=0.8,
    # Row subsampling is only applied when subsample_freq > 0; with the
    # default of 0 the subsample=0.8 setting above is silently ignored.
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    random_state=42,
    verbose=-1,
    n_jobs=-1
)

# Train with early stopping on the validation split.
# Note: the same split is scored in the evaluation section, so the reported
# validation MSLE is slightly optimistic with respect to truly unseen data.
print("Training LightGBM on log-transformed target...")
model.fit(
    X_train, y_train_log,
    eval_set=[(X_val, y_val_log)],
    callbacks=[
        lgb.log_evaluation(period=50),           # log the eval metric every 50 rounds
        lgb.early_stopping(stopping_rounds=30)   # stop after 30 rounds without improvement
    ]
)

print("Training completed!")

## 5. Evaluation & MSLE Metrics

In [None]:
# Score both splits in log-space (train first, then validation).
y_pred_log_train, y_pred_log_val = (
    model.predict(features_part) for features_part in (X_train, X_val)
)

# Invert the log1p transform and floor at zero (revenue cannot be negative).
y_pred_train = np.clip(np.expm1(y_pred_log_train), 0, None)
y_pred_val = np.clip(np.expm1(y_pred_log_val), 0, None)

# MSLE on the original revenue scale.
msle_train = mean_squared_log_error(y_train, y_pred_train)
msle_val = mean_squared_log_error(y_val, y_pred_val)

print(f"=== MSLE Metrics ===")
print(f"Train MSLE: {msle_train:.4f}")
print(f"Val MSLE:   {msle_val:.4f}")

# Summarize the distribution of the back-transformed predictions.
n_val = len(y_pred_val)
n_val_nonzero = (y_pred_val > 0).sum()
print(f"\n=== Prediction Distribution ===")
print(f"Train predictions: min={y_pred_train.min():.2f}, max={y_pred_train.max():.2f}, mean={y_pred_train.mean():.2f}")
print(f"Val predictions:   min={y_pred_val.min():.2f}, max={y_pred_val.max():.2f}, mean={y_pred_val.mean():.2f}")
print(f"Val non-zero preds: {n_val_nonzero} / {n_val} ({n_val_nonzero / n_val * 100:.1f}%)")

# Rank features by the model's importance scores, highest first.
print(f"\n=== Top 10 Features ===")
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)
print(importance_df.head(10))

## 6. Optimization: Clipping & Calibration

In [None]:
# === UPPER CLIPPING ===
# Cap validation predictions at the 99th percentile of the train predictions
# to avoid extreme values; a small positive floor is applied as well.
q99_pred = np.percentile(y_pred_train, 99)
y_pred_val_clipped = y_pred_val.clip(0.001, q99_pred)
msle_val_clipped = mean_squared_log_error(y_val, y_pred_val_clipped)

relative_gain_pct = (msle_val - msle_val_clipped) / msle_val * 100
print(f"Val MSLE (raw): {msle_val:.4f}")
print(f"Val MSLE (clipped at p99={q99_pred:.2f}): {msle_val_clipped:.4f}")
print(f"Improvement: {relative_gain_pct:.2f}%")

# === MINIMUM PREDICTION TO FORCE NON-ZEROS ===
# If many predictions are zero, enforce a floor on every prediction.
min_pred = 0.01  # predict at least $0.01
y_pred_val_minclip = y_pred_val.clip(min_pred, q99_pred)
msle_val_minclip = mean_squared_log_error(y_val, y_pred_val_minclip)
print(f"Val MSLE (min={min_pred}, max={q99_pred:.2f}): {msle_val_minclip:.4f}")

## 7. Save Model

In [None]:
# Persist the fitted model together with everything needed at inference time:
# the feature order and the clipping bounds applied to predictions.
model_path = "models/msle_lgb_log_optimized.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

model_data = dict(
    model=model,
    features=X_train.columns.tolist(),
    q99_pred=q99_pred,
    min_pred=min_pred,
)

joblib.dump(model_data, model_path)
print(f"Model saved: {model_path}")

## 8. Generate Submission

In [None]:
# Load the most recently written model bundle. Selecting by modification time
# (rather than by lexicographic file name) is what "latest" actually means.
model_files = glob.glob("models/msle_lgb*.pkl")
if not model_files:
    raise FileNotFoundError(
        "No model bundle matching models/msle_lgb*.pkl; run the 'Save Model' cell first."
    )
model_path = max(model_files, key=os.path.getmtime)
print(f"Loading: {model_path}")
# NOTE: joblib.load unpickles arbitrary objects -- only load bundles that were
# produced by this notebook, never files from an untrusted source.
m = joblib.load(model_path)

model = m["model"]
features = m["features"]
q99_pred = m["q99_pred"]
min_pred = m["min_pred"]

# Load test (lazily)
test_path = "./smadex-challenge-predict-the-revenue/test/test"
test_ddf = dd.read_parquet(test_path)

# Work out which training features exist in the test set. Missing ones are
# filled with 0.0 below; report them so the substitution is not silent.
present = [c for c in features if c in test_ddf.columns]
missing = [c for c in features if c not in test_ddf.columns]
if missing:
    print(f"WARNING: {len(missing)} training features missing from test, filled with 0.0: {missing}")

# Read row ids and features in ONE compute so they are guaranteed to be
# row-aligned and the parquet data is only scanned once.
test_pdf = test_ddf[["row_id"] + [c for c in present if c != "row_id"]].compute()
test_row_ids = test_pdf["row_id"].to_numpy()

# Prepare test data with the same columns, in the same order, as training.
test_df = test_pdf[present].copy()
for c in missing:
    test_df[c] = 0.0
test_df = test_df[features].fillna(0)

print(f"Test data: {test_df.shape}")

# Predict in log-space, invert log1p, then apply the saved clipping bounds.
y_pred_log = model.predict(test_df)
y_pred = np.expm1(y_pred_log).clip(min_pred, q99_pred)
assert len(y_pred) == len(test_row_ids), "prediction / row_id length mismatch"

print(f"\nPredictions: min={y_pred.min():.4f}, max={y_pred.max():.4f}, mean={y_pred.mean():.4f}")
print(f"Non-zero: {(y_pred > 0).sum()} / {len(y_pred)} ({(y_pred > 0).sum() / len(y_pred) * 100:.1f}%)")

# Save submission
submission_df = pd.DataFrame({
    "row_id": test_row_ids,
    "iap_revenue_d7": y_pred
})

output_path = "outputs/submission_msle_optimized.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission_df.to_csv(output_path, index=False)

print(f"\nSubmission saved: {output_path}")
print(submission_df.head(10))