# Popularity/Recency on training set

In [1]:
import pandas as pd
import numpy as np
from itertools import product

# ── CONFIG ────────────────────────────────────────────────────
# Input files, relative to the notebook's working directory.
TRAIN_PATH = "../datasets/train_clicks.parquet"
VAL_PATH = "../datasets/valid_clicks.parquet"
TEST_PATH = "../datasets/test_clicks.parquet"
META_PATH = "../datasets/articles_metadata.csv"  # or your parquet copy

# Hyperparameter grid: every (half-life, beta) pair is scored on the validation split.
HALF_LIFE_DAYS = [100_000, 3, 6.0, 8.0]  # popularity half-life in days; 100_000 ≈ no decay
BETA_VALUES = [0.0, 0.001, 5.0, 7.0]  # freshness boost: fresh articles score × (1 + beta)

# Settings held fixed across the grid.
FRESH_WINDOW_DAYS = 1  # an article is "fresh" if created within this many days of the last training click
TOP_M = 500  # number of top-scoring candidate articles kept
RECALL_K = 10  # K used for Recall@K


In [2]:
# 1) Load train / val / test
train_df, val_df, test_df = (
    pd.read_parquet(split_path)
    for split_path in (TRAIN_PATH, VAL_PATH, TEST_PATH)
)

# Click times are read as epoch milliseconds; convert each split's column
# to pandas datetimes so later cells can do date arithmetic on it.
for df in (train_df, val_df, test_df):
    df["click_timestamp"] = pd.to_datetime(df["click_timestamp"], unit="ms")

print(f"Train: {train_df.shape}, Val: {val_df.shape}, Test: {test_df.shape}")

Train: (2857109, 12), Val: (65536, 12), Test: (65536, 12)


In [3]:
# 2) Load metadata
# Compact unsigned dtypes for the id/count columns keep the frame small.
dtypes = dict(
    article_id="uint32",
    category_id="uint16",
    publisher_id="uint8",
    words_count="uint16",
)

# Read the CSV and convert the creation time (epoch milliseconds) to datetimes.
articles = pd.read_csv(META_PATH, dtype=dtypes).assign(
    created_at_ts=lambda meta: pd.to_datetime(meta["created_at_ts"], unit="ms")
)
print(articles.shape)
articles.head()


(364047, 5)


Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
0,0,0,2017-12-13 05:53:39,0,168
1,1,1,2014-07-14 12:45:36,0,189
2,2,1,2014-08-22 00:35:06,0,250
3,3,1,2014-08-19 17:11:53,0,230
4,4,1,2014-08-03 13:06:11,0,162


In [4]:
import numpy as np

def compute_scores(train_df, articles, half_life_days, beta, fresh_window_days, top_m=500):
    """Rank articles by time-decayed click popularity with a freshness boost.

    Each click is weighted by ``0.5 ** (age / half_life)``, where ``age`` is
    measured back from the most recent click in ``train_df``. Weights are
    summed per article, and articles created within ``fresh_window_days`` of
    that reference time have their score multiplied by ``1 + beta``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Click log with ``click_article_id`` and datetime ``click_timestamp``
        columns.
    articles : pandas.DataFrame
        Article metadata with ``article_id`` and datetime ``created_at_ts``
        columns.
    half_life_days : float
        Popularity half-life in days. Must be positive; ``float("inf")``
        (or any very large value) disables the decay.
    beta : float
        Freshness boost; fresh articles are scored ``pop_score * (1 + beta)``.
    fresh_window_days : float
        Maximum article age, in days, for an article to count as fresh.
    top_m : int, default 500
        Number of highest-scoring articles to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``article_id`` (int) and ``final_score``, sorted by
        descending score, with at most ``top_m`` rows.

    Raises
    ------
    ValueError
        If ``half_life_days`` is not strictly positive (NaN included).
    """
    # `not x > 0` (rather than `x <= 0`) also rejects NaN.
    if not half_life_days > 0:
        raise ValueError(f"half_life_days must be positive, got {half_life_days!r}")

    # 1) global reference time: the most recent click in the training data
    now = train_df["click_timestamp"].max()

    # 2) decay constant (per second). Plain float arithmetic is used because
    #    pd.Timedelta(days=...) overflows a little above 100_000 days and
    #    cannot represent an infinite half-life.
    seconds_per_day = 86_400.0
    decay_rate = np.log(2) / (half_life_days * seconds_per_day)

    # 3) per-click weights
    ages = (now - train_df["click_timestamp"]).dt.total_seconds()
    weights = np.exp(-decay_rate * ages)

    # 4) aggregate per article; grouping the weight Series directly avoids
    #    copying the whole click frame just to attach one column
    pop = (
        weights.groupby(train_df["click_article_id"])
               .sum()
               .rename("pop_score")
               .to_frame()
    )

    # 5) join publication times (articles absent from the metadata get NaT)
    pop = pop.join(
        articles.set_index("article_id")[["created_at_ts"]],
        how="left"
    )

    # 6) apply freshness boost; a NaT creation time compares False, so such
    #    articles are never treated as fresh
    is_fresh = (now - pop["created_at_ts"]) <= pd.Timedelta(days=fresh_window_days)
    pop["final_score"] = pop["pop_score"] * (1 + beta * is_fresh.astype(float))

    # 7) build cleaned candidates DF, best score first
    top_series = pop["final_score"].nlargest(top_m)
    cands = pd.DataFrame({
        "article_id": top_series.index.astype(int),
        "final_score": top_series.values
    })
    return cands


def recall_at_k(cands, articles, holdout_df, K):
    """Recall@K of a single global candidate ranking against held-out clicks.

    For every row of ``holdout_df`` the ranked candidates are filtered to the
    articles already published at that row's click time, the first ``K``
    survivors form the recommendation list, and the row counts as a hit when
    its ``click_article_id`` is in that list.

    The cutoff is each row's own ``click_timestamp``, so a user who appears in
    several holdout rows is judged against what was published at the time of
    each individual click.

    Parameters
    ----------
    cands : pandas.DataFrame
        Candidates ordered best-first; must contain an ``article_id`` column.
    articles : pandas.DataFrame
        Metadata with ``article_id`` and datetime ``created_at_ts`` columns.
        Candidates missing from it are treated as published at the Unix
        epoch, i.e. always eligible.
    holdout_df : pandas.DataFrame
        Held-out clicks with ``click_article_id`` and ``click_timestamp``.
    K : int
        Length of the recommendation list; values <= 0 yield no hits.

    Returns
    -------
    float
        Fraction of holdout rows that were hits, or NaN when ``holdout_df``
        is empty.
    """
    assert "article_id" in cands.columns, "cands missing article_id"

    n_rows = len(holdout_df)
    if n_rows == 0:
        return float("nan")

    art_ids = cands["article_id"].tolist()

    # Publication dates are only needed for the candidates, so look them up
    # once instead of building a dict over the full metadata table.
    known = articles.loc[articles["article_id"].isin(art_ids)]
    pub_dates = known.set_index("article_id")["created_at_ts"].to_dict()
    epoch = pd.Timestamp(0)
    ranked = [(a, pub_dates.get(a, epoch)) for a in art_ids]
    cand_set = set(art_ids)

    true_ids = holdout_df["click_article_id"].tolist()
    cutoffs = holdout_df["click_timestamp"].tolist()

    hits = 0
    for true_a, cutoff in zip(true_ids, cutoffs):
        # An article that is not a candidate at all can never be a hit.
        if true_a not in cand_set:
            continue
        n_valid = 0
        for art, published in ranked:
            # Stop once K eligible candidates have been seen.
            if n_valid >= K:
                break
            # keep only articles published ≤ cutoff (NaT compares False)
            if published <= cutoff:
                if art == true_a:
                    hits += 1
                    break
                n_valid += 1
    return hits / n_rows


In [5]:
from itertools import product

# Grid search: score every (half-life, beta) pair on the validation split.
metric_col = f"recall@{RECALL_K}"
param_grid = list(product(HALF_LIFE_DAYS, BETA_VALUES))
n_entries = len(param_grid)

results = []
for entry_no, (h, beta) in enumerate(param_grid, start=1):
    print(f"Entry {entry_no}/{n_entries}: h={h}, β={beta}")

    # Candidates come from the training clicks only; recall is measured on validation.
    cands = compute_scores(train_df, articles, h, beta, FRESH_WINDOW_DAYS, top_m=TOP_M)
    recall = recall_at_k(cands, articles, val_df, RECALL_K)

    results.append({"half_life_days": h, "beta": beta, metric_col: recall})
    print(f" → Recall@{RECALL_K}: {recall:.4f}\n")

# One row per grid entry; the best row is the one with the highest recall.
df_res = pd.DataFrame(results)
best = df_res.sort_values(metric_col, ascending=False).iloc[0]
print("Best hyperparameters:\n", best)


Entry 1/16: h=100000, β=0.0
 → Recall@10: 0.0214

Entry 2/16: h=100000, β=0.001
 → Recall@10: 0.0214

Entry 3/16: h=100000, β=5.0
 → Recall@10: 0.0214

Entry 4/16: h=100000, β=7.0
 → Recall@10: 0.0214

Entry 5/16: h=3, β=0.0
 → Recall@10: 0.1470

Entry 6/16: h=3, β=0.001
 → Recall@10: 0.1470

Entry 7/16: h=3, β=5.0
 → Recall@10: 0.1470

Entry 8/16: h=3, β=7.0
 → Recall@10: 0.1470

Entry 9/16: h=6.0, β=0.0
 → Recall@10: 0.0894

Entry 10/16: h=6.0, β=0.001
 → Recall@10: 0.0894

Entry 11/16: h=6.0, β=5.0
 → Recall@10: 0.0894

Entry 12/16: h=6.0, β=7.0
 → Recall@10: 0.0894

Entry 13/16: h=8.0, β=0.0
 → Recall@10: 0.0571

Entry 14/16: h=8.0, β=0.001
 → Recall@10: 0.0571

Entry 15/16: h=8.0, β=5.0
 → Recall@10: 0.0571

Entry 16/16: h=8.0, β=7.0
 → Recall@10: 0.0571

Best hyperparameters:
 half_life_days    3.000000
beta              0.000000
recall@10         0.147034
Name: 4, dtype: float64


## 3.2 Combine training and validation sets and evaluate on the test set

In [6]:
import pandas as pd

# Cell: Final Evaluation on Test Set

# 1) Reload the splits from the paths declared in the CONFIG cell, so the
#    final evaluation always reads the same files as the hyperparameter search.
train_df = pd.read_parquet(TRAIN_PATH)
val_df   = pd.read_parquet(VAL_PATH)
test_df  = pd.read_parquet(TEST_PATH)

# Ensure timestamps are datetime (the raw column is read as epoch milliseconds)
for df in (train_df, val_df, test_df):
    df["click_timestamp"] = pd.to_datetime(df["click_timestamp"], unit="ms")

In [7]:
# 2) Reload metadata from the path declared in the CONFIG cell, so this
#    section cannot drift from the file used during the hyperparameter search.
articles = pd.read_csv(
    META_PATH,
    dtype={
        "article_id": "uint32",
        "category_id": "uint16",
        "publisher_id": "uint8",
        "words_count": "uint16"
    }
)
# Creation times are read as epoch milliseconds; convert to datetimes.
articles["created_at_ts"] = pd.to_datetime(articles["created_at_ts"], unit="ms")

In [8]:
# 3) Combine train + validation
# Stack the validation clicks under the training clicks and renumber the rows,
# so the final model is fitted on everything except the test split.
frames_to_stack = [train_df, val_df]
train_plus_val = pd.concat(frames_to_stack, ignore_index=True)

In [9]:
# 4) Compute scores with best hyperparameters (h=3 days, β=0)
# These two values are copied by hand from the grid-search output above.
final_params = dict(
    half_life_days=3,
    beta=0.0,
    fresh_window_days=FRESH_WINDOW_DAYS,
    top_m=TOP_M,
)
cands = compute_scores(train_plus_val, articles, **final_params)

In [10]:
# 5) Evaluate on test set
# Candidates were built from train + validation; the test split is only used here.
test_recall = recall_at_k(cands, articles, holdout_df=test_df, K=RECALL_K)
summary_line = f"Final Test Recall@{RECALL_K} (h=3, β=0): {test_recall:.4f}"
print(summary_line)

Final Test Recall@10 (h=3, β=0): 0.1211
