## 1. Load Data

In [None]:
import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

# Turn off dask's `dataframe.convert-string` option before any frame is built.
dask.config.set({"dataframe.convert-string": False})

# Training parquet location and the datetime window to read
# (lower bound inclusive, upper bound exclusive); adjust the bounds as needed.
dataset_path = "./smadex-challenge-predict-the-revenue/train/train"
filters = [
    ("datetime", ">=", "2025-10-01-00-00"),
    ("datetime", "<", "2025-10-10-00-00"),
]

ddf = dd.read_parquet(dataset_path, filters=filters)

# The row count needs an explicit .compute(); the column list is read once
# and reused for both summary lines.
train_row_count = ddf.shape[0].compute()
train_columns = list(ddf.columns)

print(f"Train dataset: {train_row_count} rows, {len(train_columns)} columns")
print(f"Columns: {train_columns[:10]}...")

## 2. Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Preprocessing: numeric-only feature matrix, outlier-clipped target, and a
# 70/30 train/test split. Produces X_train, X_test, y_train, y_test.
target = "iap_revenue_d7"
data_fraction = 1.0  # Change to 0.1, 0.5 etc. for faster iteration

# Select numeric columns only
num_ddf = ddf.select_dtypes(include=["number"])

# Remove columns that are entirely NaN (null count equals the row count)
null_counts = num_ddf.isnull().sum().compute()
total_rows = num_ddf.shape[0].compute()
keep_cols = null_counts[null_counts < total_rows].index.tolist()
num_ddf = num_ddf[keep_cols]

if target not in num_ddf.columns:
    raise KeyError(f"Target '{target}' not found in numeric columns")

# Sample and fill NaN with 0 (minimal preprocessing)
sampled = num_ddf.sample(frac=data_fraction, random_state=42).compute()
df = sampled.fillna(0)

# Clip target outliers at the 99th percentile.
# When fewer than 1% of rows have revenue, the overall 99th percentile is 0 and
# clipping at it would erase every positive target. In that case fall back to
# the 99th percentile of the strictly positive values; if there are no
# positive values at all, leave the target untouched.
q99 = df[target].quantile(0.99)
if not q99 > 0:
    positive_revenue = df.loc[df[target] > 0, target]
    if not positive_revenue.empty:
        q99 = positive_revenue.quantile(0.99)
if q99 > 0:
    df[target] = df[target].clip(upper=q99)

# Remove near-constant features: keep columns whose standard deviation
# exceeds the threshold.
X = df.drop(columns=[target])
y = df[target].values

var_threshold = 0.01
high_var_cols = X.columns[X.std() > var_threshold].tolist()
X = X[high_var_cols]

print(f"Processed: {len(X)} samples, {len(X.columns)} features")
print(f"Target: min={y.min():.2f}, max={y.max():.2f}, mean={y.mean():.2f}, q99={q99:.2f}")

# Train/test split (70/30)
# NOTE(review): the clip threshold and the std filter above are computed on
# the full sample before splitting, so the test targets are clipped too.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Train: {len(X_train)} samples | Test: {len(X_test)} samples")

## 3. Train Two-Stage LightGBM Model

In [None]:
import os
import joblib
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, r2_score, precision_score, recall_score, f1_score, roc_auc_score

# Two-stage model: a classifier estimates P(buyer), a regressor fitted only on
# buyers estimates revenue given a purchase, and their product is the final
# prediction. The fitted pair is saved to models/two_stage_lgb.pkl.
os.makedirs("models", exist_ok=True)

# Prepare labels: 1 when the revenue target is strictly positive
is_pos_train = (y_train > 0).astype(int)
is_pos_test = (y_test > 0).astype(int)

print(f"Positive samples: train {is_pos_train.sum()}/{len(y_train)}, test {is_pos_test.sum()}/{len(y_test)}")

# === CLASSIFIER: Predict buyer (yes/no) ===
clf_lgb = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    num_leaves=31,
    random_state=42,
    verbose=-1,
    n_jobs=-1,
)
clf_lgb.fit(X_train, is_pos_train)

# === REGRESSOR: Predict revenue for positive samples ===
# Only fitted when there are more than 10 positive training rows; otherwise
# the regressor is None and the revenue estimate falls back to zeros below.
pos_idx = is_pos_train == 1
if pos_idx.sum() > 10:
    reg_lgb = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        num_leaves=31,
        reg_lambda=1.0,
        random_state=42,
        verbose=-1,
        n_jobs=-1,
    )
    reg_lgb.fit(X_train.iloc[pos_idx], y_train[pos_idx])
else:
    reg_lgb = None

# === PREDICTIONS ===
p_tr = clf_lgb.predict_proba(X_train)[:, 1]
p_te = clf_lgb.predict_proba(X_test)[:, 1]

if reg_lgb is not None:
    r_tr = reg_lgb.predict(X_train)
    r_te = reg_lgb.predict(X_test)
else:
    r_tr = np.zeros(len(X_train))
    r_te = np.zeros(len(X_test))

# Combine: E[revenue] = P(buyer) * E[revenue | buyer]; negatives are floored at 0
y_pred_train = np.clip(p_tr * r_tr, 0, None)
y_pred_test = np.clip(p_te * r_te, 0, None)

# === METRICS ===
mse_tr = mean_squared_error(y_train, y_pred_train)
rmse_tr = np.sqrt(mse_tr)
r2_tr = r2_score(y_train, y_pred_train)

mse_te = mean_squared_error(y_test, y_pred_test)
rmse_te = np.sqrt(mse_te)
r2_te = r2_score(y_test, y_pred_test)

print("\n=== Train Results ===")
print(f"RMSE: {rmse_tr:.4f} | R²: {r2_tr:.4f}")
print("\n=== Test Results ===")
print(f"RMSE: {rmse_te:.4f} | R²: {r2_te:.4f}")

# Classifier metrics
yhat_clf_test = clf_lgb.predict(X_test)
print("\n=== Classifier Metrics (Test) ===")
print(f"Precision: {precision_score(is_pos_test, yhat_clf_test):.4f}")
print(f"Recall: {recall_score(is_pos_test, yhat_clf_test):.4f}")
print(f"F1: {f1_score(is_pos_test, yhat_clf_test):.4f}")
# Reuse the test probabilities computed above instead of scoring X_test again.
print(f"AUC: {roc_auc_score(is_pos_test, p_te):.4f}")

# Save model together with the training feature order
model_path = "models/two_stage_lgb.pkl"
joblib.dump(
    {"clf": clf_lgb, "reg": reg_lgb, "features": X_train.columns.tolist()},
    model_path,
)
print(f"\nModel saved: {model_path}")

## 4. Generate Submission

In [None]:
import os
import glob
import joblib
import dask.dataframe as dd
import pandas as pd
import numpy as np

# Generate the submission: load the saved two-stage model, score the test
# parquet, and write row_id / iap_revenue_d7 to outputs/submission.csv.

# Load latest saved model
# NOTE: "latest" means the last matching path in lexicographic order.
model_files = sorted(glob.glob("models/two_stage_lgb*.pkl"))
if not model_files:
    raise FileNotFoundError("No model found. Run training cell first.")

model_path = model_files[-1]
print(f"Loading model: {model_path}")
# joblib.load unpickles the file; only load artifacts written by the training cell.
m = joblib.load(model_path)

clf = m["clf"]
reg = m["reg"]
features = m["features"]

# Load test data
test_path = "./smadex-challenge-predict-the-revenue/test/test"
print(f"Reading test data: {test_path}")
test_ddf = dd.read_parquet(test_path)

# Work out which training features the test set actually provides
available_cols = set(test_ddf.columns)
present_features = [c for c in features if c in available_cols]
missing_features = [c for c in features if c not in available_cols]
if missing_features:
    print(
        f"Warning: {len(missing_features)} feature(s) missing from test data, "
        f"filled with 0: {missing_features[:10]}"
    )

# Materialize row_id and the features in ONE compute so the ids and the
# feature rows are guaranteed to be in the same order, and the parquet is read
# only once. row_id is de-duplicated in case it is also a model feature.
load_cols = ["row_id"] + [c for c in present_features if c != "row_id"]
test_raw = test_ddf[load_cols].compute()

# Extract row_id as a plain array so it pairs positionally with predictions
test_row_ids = test_raw["row_id"].to_numpy()

# Reorder to the training feature order, add missing features as 0, fill NaN
test_df = test_raw.reindex(columns=features, fill_value=0.0).fillna(0)

print(f"Test data: {test_df.shape[0]} rows, {test_df.shape[1]} features")

# Predict
p_test = clf.predict_proba(test_df)[:, 1]  # P(buyer)
r_test = reg.predict(test_df) if reg is not None else np.zeros(len(test_df))  # E[revenue|buyer]
y_submission = np.clip(p_test * r_test, 0, None)

print(f"Predictions: min={y_submission.min():.4f}, max={y_submission.max():.4f}, mean={y_submission.mean():.4f}")

# Save submission
submission_df = pd.DataFrame({
    "row_id": test_row_ids,
    "iap_revenue_d7": y_submission,
})

output_path = "outputs/submission.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission_df.to_csv(output_path, index=False)

print(f"\nSubmission saved: {output_path}")
print(submission_df.head(10))