# 00 — EDA & Data Health Checks

In [None]:
import os, json, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.config import load_config
from src.schemas import PATHS, get_schema
from src.data_io import (
    load_train_sessions, load_test_sessions, 
    load_content_metadata, load_content_price_rate_review, load_content_search_log, load_content_sitewide_log, load_content_top_terms_log,
    load_user_metadata, load_user_search_log, load_user_sitewide_log, load_user_top_terms_log, load_user_fashion_search_log, load_user_fashion_sitewide_log,
    load_term_search_log, build_time_range_filter, memory_mb
)

plt.rcParams["figure.figsize"] = (9, 5)
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

cfg = load_config("configs/params.yaml")
data_dir = Path(cfg.paths.data_dir)
figures_dir = Path("runs/eda/images"); figures_dir.mkdir(parents=True, exist_ok=True)

def exists(key: str) -> bool:
    """Tell whether the file registered under ``key`` in ``PATHS`` is present below ``data_dir``."""
    target = data_dir / PATHS[key]
    return target.exists()

def safe_load(fn, key, **kwargs):
    """Best-effort loader: call ``fn(data_dir, **kwargs)`` if the file for ``key`` exists.

    A missing file or any exception raised by ``fn`` is reported with a
    printed message and results in ``None`` instead of an error, so the
    rest of the notebook can keep running on whatever tables are available.
    """
    if exists(key):
        try:
            return fn(data_dir, **kwargs)
        except Exception as err:
            print(f"[ERROR] loading {key}: {err}")
            return None
    print(f"[SKIP] {key}: file not found at {data_dir / PATHS[key]}")
    return None

def quick_info(df: pd.DataFrame, name: str):
    """Print shape and memory footprint of ``df`` and display its first rows and dtypes.

    When ``df`` is None only a ``"<name>: None"`` line is printed.
    """
    if df is not None:
        print(f"{name}: shape={df.shape}, mem={memory_mb(df):.2f} MB")
        display(df.head(3))
        display(df.dtypes)
    else:
        print(f"{name}: None")

## 1. Load core tables

In [None]:
# Core tables: sessions plus content / user metadata.
train = safe_load(load_train_sessions, "train_sessions")
test = safe_load(load_test_sessions, "test_sessions")
content_meta = safe_load(load_content_metadata, "content_metadata")
user_meta = safe_load(load_user_metadata, "user_metadata")

# Optional logs (may be large; load minimal columns for EDA)
content_site = safe_load(
    load_content_sitewide_log,
    "content_sitewide_log",
    columns=["date", "total_click", "total_cart", "total_fav", "total_order", "content_id_hashed"],
)
user_site = safe_load(
    load_user_sitewide_log,
    "user_sitewide_log",
    columns=["ts_hour", "total_click", "total_cart", "total_fav", "total_order", "user_id_hashed"],
)
term_log = safe_load(load_term_search_log, "term_search_log")

# Quick overview of every core table, in load order.
for name, df in {
    "train": train,
    "test": test,
    "content_meta": content_meta,
    "user_meta": user_meta,
}.items():
    quick_info(df, name)

## 2. Time ranges & timezone sanity

In [None]:
import pandas as pd

def time_summary(df, ts_col, name):
    """Print timezone, min, max and row count for a timestamp column.

    Args:
        df: DataFrame to inspect, or None when the table was not loaded.
        ts_col: Name of the timestamp column to summarise.
        name: Label used in the printed line.

    Returns:
        The ``df[ts_col]`` Series, or None (after printing ``[SKIP]``) when
        ``df`` is None or does not contain ``ts_col``.
    """
    if df is None or ts_col not in df.columns:
        print(f"[SKIP] {name}")
        return None
    series = df[ts_col]
    # Evaluating ``series.dt`` on a non-datetimelike column raises
    # AttributeError, so probe it through getattr; otherwise the "NA"
    # fallback could never be reached and the summary would crash instead.
    dt_accessor = getattr(series, "dt", None)
    tz = str(dt_accessor.tz) if hasattr(dt_accessor, "tz") else "NA"
    print(f"{name}: tz={tz}, min={series.min()}, max={series.max()}, rows={len(df):,}")
    return series

# Summarise the session timestamps of both splits (None when a table is unavailable).
tr_train = time_summary(train, "ts_hour", "train.ts_hour")
tr_test = time_summary(test, "ts_hour", "test.ts_hour")

# Daily row counts, one figure per split, saved under figures_dir.
if tr_train is not None:
    (
        tr_train.dt.date
        .value_counts()
        .sort_index()
        .plot(kind="line", title="Train daily row counts")
    )
    plt.tight_layout()
    plt.savefig(figures_dir / "train_daily_counts.png")
    plt.show()
if tr_test is not None:
    (
        tr_test.dt.date
        .value_counts()
        .sort_index()
        .plot(kind="line", title="Test daily row counts")
    )
    plt.tight_layout()
    plt.savefig(figures_dir / "test_daily_counts.png")
    plt.show()

# Non-overlap heuristic: the ranges overlap unless one split ends before the other starts.
if tr_train is not None and tr_test is not None:
    print(
        "Time overlap:",
        not (tr_train.max() < tr_test.min() or tr_test.max() < tr_train.min()),
    )

## 3. Label prevalence & session readiness for AUC

In [None]:
if train is not None:
    # Row-level positive rates for both labels.
    click_rate = float(train["clicked"].eq(1).mean())
    order_rate = float(train["ordered"].eq(1).mean())
    print(f"Row-level click_rate={click_rate:.4f}, order_rate={order_rate:.4f}")

    # Session-level: fraction of sessions with at least one positive (needed for AUC per-session)
    sess = (
        train.groupby("session_id")
        .agg(
            any_click=("clicked", "max"),
            any_order=("ordered", "max"),
            n_items=("content_id_hashed", "count"),
        )
        .reset_index()
    )
    print(f"Sessions: total={len(sess):,}, with_click={sess.any_click.mean():.4f}, with_order={sess.any_order.mean():.4f}")

    # Histogram of how many items each session contains.
    sess["n_items"].plot(kind="hist", bins=50, title="Items per session (train)")
    plt.tight_layout()
    plt.savefig(figures_dir / "items_per_session.png")
    plt.show()

## 4. Nulls, dtypes, and candidate keys

In [None]:
def null_summary(df, name):
    """Display the 20 columns of ``df`` with the highest share of missing values.

    Prints ``[SKIP] <name>`` and returns when ``df`` is None.
    """
    if df is not None:
        ratios = df.isna().mean()
        ranked = ratios.sort_values(ascending=False)
        print(f"Null ratio (top 20) — {name}")
        display(ranked.head(20).to_frame("null_ratio"))
    else:
        print(f"[SKIP] {name}")

# Null-ratio overview for each core table; tables that failed to load are
# reported as [SKIP] by null_summary.
null_summary(train, "train")
null_summary(test, "test")
null_summary(content_meta, "content_meta")
null_summary(user_meta, "user_meta")

# Candidate uniqueness: (session_id, content_id_hashed) in train.
# Counts rows whose key pair repeats an earlier row; 0 means the pair is unique.
if train is not None:
    key_dupes = train.duplicated(subset=["session_id","content_id_hashed"]).sum()
    print(f"Train candidate key dupes (session_id, content_id_hashed): {key_dupes}")

## 5. Cardinality of categorical-like columns

In [None]:
def cardinality(df, cols, name):
    """Display the distinct-value count of each listed column, largest first.

    Columns from ``cols`` that are not in ``df`` are ignored; nulls are not
    counted as a value. Prints ``[SKIP] <name>`` and returns when ``df`` is None.
    """
    if df is None:
        print(f"[SKIP] {name}")
        return
    counts = [
        (col, df[col].nunique(dropna=True))
        for col in cols
        if col in df.columns
    ]
    table = pd.DataFrame(counts, columns=["column", "nunique"])
    table = table.sort_values("nunique", ascending=False)
    print(name)
    display(table)

# Distinct-value counts for the id / term columns of train and the category
# columns of the content metadata; columns absent from a table are ignored
# by cardinality.
cardinality(train, ["search_term_normalized","user_id_hashed","content_id_hashed","session_id"], "Train cardinalities")
cardinality(content_meta, ["level1_category_name","level2_category_name","leaf_category_name","content_id_hashed"], "Content meta cardinalities")

## 6. Scaled [0,1] columns sanity (logs)

In [None]:
def check_scaled01(df, cols, name):
    """Report whether the given columns of ``df`` stay within the [0, 1] range.

    Args:
        df: DataFrame to inspect, or None when the table was not loaded.
        cols: Column names expected to hold values scaled to [0, 1].
        name: Label used in the printed messages.

    Prints ``[SKIP]`` when ``df`` is None, a ``[WARN]`` line listing the
    (min, max) of every out-of-range column, or an ``[OK]`` line counting the
    columns that were actually inspected. Requested columns that are absent
    from ``df`` are reported separately rather than being counted as OK.
    """
    if df is None:
        print(f"[SKIP] {name}")
        return
    present = [c for c in cols if c in df.columns]
    missing = [c for c in cols if c not in df.columns]
    bad = {}
    for c in present:
        mn, mx = df[c].min(skipna=True), df[c].max(skipna=True)
        # A NaN bound compares False on both sides, so such a column is not flagged.
        if (mn is not None and mn < 0) or (mx is not None and mx > 1):
            bad[c] = (mn, mx)
    if missing:
        # Absent columns were never inspected, so they must not be reported as OK.
        print(f"[WARN] Columns not found in {name} (not checked): {missing}")
    if bad:
        print(f"[WARN] Out-of-[0,1] values in {name}:", bad)
    elif present or not missing:
        print(f"[OK] All {len(present)} columns within [0,1] in {name}")

# Range check on the aggregate columns of the two sitewide logs; a log that
# was not loaded is reported as [SKIP] by check_scaled01.
check_scaled01(content_site, ["total_click","total_cart","total_fav","total_order"], "content_sitewide_log")
check_scaled01(user_site, ["total_click","total_cart","total_fav","total_order"], "user_sitewide_log")
# The term log is only checked (and only mentioned in the output) when it was loaded.
if term_log is not None:
    check_scaled01(term_log, ["total_search_impression","total_search_click"], "term_search_log")

## 7. Basic distribution plots

In [None]:
# Rating and price distributions are plotted from the price/rate/review table
# loaded here; content_meta is not used for them. (A former no-op check on
# content_meta.columns was removed: its body was only `pass`.)
prr = safe_load(load_content_price_rate_review, "content_price_rate_review", columns=[
    "update_date","original_price","selling_price","discounted_price","content_rate_avg","content_rate_count","content_id_hashed"
])
if prr is not None:
    # Histogram of non-null selling prices.
    prr["selling_price"].dropna().plot(kind="hist", bins=50, title="Selling price distribution")
    plt.tight_layout()
    plt.savefig(figures_dir / "selling_price_hist.png")
    plt.show()
    # Histogram of non-null average ratings.
    prr["content_rate_avg"].dropna().plot(kind="hist", bins=50, title="Rating average distribution")
    plt.tight_layout()
    plt.savefig(figures_dir / "rating_avg_hist.png")
    plt.show()

## 8. Search term & CTR glimpses

In [None]:
if term_log is not None:
    df = term_log.copy()
    # Row-level CTR; the small epsilon avoids division by zero on rows with
    # no impressions. It is stored on df but not used further in this cell.
    df["ctr"] = df["total_search_click"] / (df["total_search_impression"] + 1e-9)
    # Top 20 terms by impression
    top = (df.groupby("search_term_normalized")["total_search_impression"]
             .sum().sort_values(ascending=False).head(20))
    display(top.to_frame("imp_sum"))
    # Plot the impression totals themselves, indexed by term. The previous
    # code plotted a frame holding only the term strings, which gives pandas
    # no numeric column to draw.
    top.plot(kind="bar", title="Top-20 terms (by impression)")
    plt.tight_layout()
    plt.savefig(figures_dir / "top_terms_bar.png")
    plt.show()

## 9. Leakage risk checklist (static)

In [None]:
# Static reminders about temporal / target leakage; printed one per line.
checks = [
    "- All joins in features must use logs up to and including `ts_hour` (no future rows).",
    "- Target encoding must be fit in a fold-out-of-fold manner (no same-fold target leakage).",
    "- Do not use row-level labels from the same session to build features for other items within that session.",
    "- Ensure CV uses a purge gap of at least 24h between train and validation splits.",
    "- Verify no columns (e.g., explicit future price updates) reference times after `ts_hour`.",
]
print(*checks, sep="\n")