In [8]:
#!/usr/bin/env python3
import pickle
import pandas as pd
from pathlib import Path
import math
import numpy as np
import json
from datetime import datetime, timezone


# Fixed parameters (edit as needed).
# CSV of article metadata, read with pandas by the article loader.
ARTICLES_PATH = Path("news-portal-user-interactions-by-globocom/articles_metadata.csv")
# Pickled embedding matrix; its rows are filtered with the same mask as the metadata rows.
EMBEDDINGS_PATH = Path("news-portal-user-interactions-by-globocom/articles_embeddings.pickle")
# Directory whose *.csv files hold the click logs.
CLICKS_DIR = Path("news-portal-user-interactions-by-globocom/clicks")
# Directory receiving every artefact written by main().
OUT_DIR = Path("artifacts")
# Half-life, in days, of the exponential decay applied to clicks in user profiles.
HALF_LIFE_DAYS = 30
# Weight of recency in the cold-start score; popularity gets (1 - ALPHA_RECENCY).
ALPHA_RECENCY = 0.6
# Number of articles kept in the cold-start recommendation list.
TOPK_COLD = 5


In [2]:
def load_articles_and_embeddings(articles_path=None, embeddings_path=None, min_words=0):
    """Load article metadata and the embedding matrix, filtered with one shared mask.

    Parameters
    ----------
    articles_path : path-like, optional
        CSV containing at least the ``article_id``, ``created_at_ts`` and
        ``words_count`` columns. When omitted, the module-level ``ARTICLES_PATH``
        is used; it is looked up at call time so that re-running the
        configuration cell takes effect without re-defining this function.
    embeddings_path : path-like, optional
        Pickle file holding an array with one row per CSV row, in the same
        order. When omitted, the module-level ``EMBEDDINGS_PATH`` is used
        (also looked up at call time).
    min_words : int, default 0
        Only articles with ``words_count`` strictly greater than this are kept.

    Returns
    -------
    tuple (pandas.DataFrame, numpy.ndarray)
        The frame has columns ``article_id`` (str), ``pub_ts`` (int64,
        ``created_at_ts // 1000``; assumes the source is in epoch
        milliseconds) and ``words_count``, with a fresh 0..n-1 index.
        The array is float32 and row ``i`` belongs to frame row ``i``.

    Raises
    ------
    ValueError
        If the embedding array does not have exactly one row per CSV row,
        because the positional mask would then pair articles with the wrong
        vectors (or fail with an opaque indexing error).
    """
    if articles_path is None:
        articles_path = ARTICLES_PATH
    if embeddings_path is None:
        embeddings_path = EMBEDDINGS_PATH

    # 1) Article metadata.
    articles_raw = pd.read_csv(articles_path)

    # 2) Embeddings.
    # NOTE(security): pickle.load can execute arbitrary code; only point this
    # at a file obtained from a trusted source.
    with open(embeddings_path, "rb") as fh:
        embeddings = np.asarray(pickle.load(fh))

    # The mask below is positional, so both inputs must be row-aligned.
    if embeddings.ndim < 1 or len(embeddings) != len(articles_raw):
        raise ValueError(
            f"Embeddings shape {embeddings.shape} does not provide one row per "
            f"article ({len(articles_raw)} rows in {articles_path})."
        )

    # 3) Drop empty articles from both objects with the same mask.
    keep = (articles_raw["words_count"] > min_words).to_numpy()
    kept = articles_raw[keep]

    # 4) Keep only the useful columns, with the publication time in seconds.
    articles = pd.DataFrame(
        {
            "article_id": kept["article_id"].astype(str),
            "pub_ts": (kept["created_at_ts"] // 1000).astype("int64"),
            "words_count": kept["words_count"],
        }
    ).reset_index(drop=True)

    return articles, embeddings[keep].astype("float32")


In [3]:
def load_clicks(clicks_dir=None):
    """Read every click CSV of a directory into one normalised frame.

    Parameters
    ----------
    clicks_dir : path-like, optional
        Directory scanned (non-recursively) for ``*.csv`` files, read in
        sorted filename order. Each file must provide the ``user_id``,
        ``click_article_id`` and ``click_timestamp`` columns. When omitted,
        the module-level ``CLICKS_DIR`` is used; it is looked up at call time
        so that re-running the configuration cell takes effect without
        re-defining this function.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` (str), ``article_id`` (str) and ``timestamp``
        (int64, ``click_timestamp // 1000``, i.e. milliseconds to seconds),
        with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the directory contains no CSV file.
    """
    if clicks_dir is None:
        clicks_dir = CLICKS_DIR

    csv_files = sorted(Path(clicks_dir).glob("*.csv"))
    if not csv_files:
        # pd.concat would raise a bare "No objects to concatenate" here.
        raise ValueError(f"No click CSV files found in {Path(clicks_dir)}")

    wanted = ["user_id", "click_article_id", "click_timestamp"]
    raw = pd.concat(
        (pd.read_csv(csv_file, usecols=wanted) for csv_file in csv_files),
        ignore_index=True,
    )

    # Build the result in one go: clean, homogeneous types, and no in-place
    # mutation of a sliced frame.
    return pd.DataFrame(
        {
            "user_id": raw["user_id"].astype(str),
            "article_id": raw["click_article_id"].astype(str),
            "timestamp": (raw["click_timestamp"] // 1000).astype("int64"),  # ms -> s
        }
    )


In [4]:
def main():
    """Build every recommender artefact and write it under ``OUT_DIR``.

    Steps:
      1. Load the filtered articles and their embeddings.
      2. Load the clicks and keep those that refer to a retained article.
      3. Compute per-article popularity (all clicks, and clicks inside a
         window of at most 30 days ending at the newest click), time-decayed
         user profiles (weighted mean of clicked-article embeddings), and a
         cold-start ranking mixing normalised recency and popularity.
      4. Persist the artefacts and a ``build_manifest.json`` summary, which
         is also printed.

    The time reference ("now") is the newest click timestamp in the data,
    not the wall clock.

    Raises
    ------
    ValueError
        If no click refers to a retained article, since neither the time
        reference nor any user profile can be computed.
    """
    # 1) Articles + embeddings
    articles, E = load_articles_and_embeddings()
    id_to_row = {aid: i for i, aid in enumerate(articles["article_id"].tolist())}

    # 2) Clicks
    clicks = load_clicks(CLICKS_DIR)
    clicks = clicks[clicks["article_id"].isin(articles["article_id"])]
    if clicks.empty:
        # Without this guard the failure is "cannot convert float NaN to integer".
        raise ValueError("No click refers to a known article; nothing to build.")

    # --- Time reference ---
    t_min, t_max = int(clicks["timestamp"].min()), int(clicks["timestamp"].max())
    ref_now = t_max
    data_span = max(1, t_max - t_min)

    # 30-day window, capped by the actual span of the data
    thirty_days = 30 * 24 * 3600
    win_sec = min(thirty_days, data_span)
    cutoff = ref_now - win_sec

    # 3a) Popularity (number of clicks per article)
    pop_all = clicks.groupby("article_id").size().rename("pop_all")
    pop_30  = clicks[clicks["timestamp"] >= cutoff].groupby("article_id").size().rename("pop_30d")
    art = articles.merge(pop_all, left_on="article_id", right_index=True, how="left")
    art = art.merge(pop_30,  left_on="article_id", right_index=True, how="left")
    # Articles without clicks come out of the left joins as NaN -> 0.
    art[["pop_all","pop_30d"]] = art[["pop_all","pop_30d"]].fillna(0).astype("int64")

    # 3b) User profiles: decay-weighted mean of the clicked-article embeddings
    user_ids, profiles = [], []
    hl_sec = HALF_LIFE_DAYS * 24 * 3600.0
    lam = math.log(2) / hl_sec if hl_sec > 0 else 0.0
    idx_df = pd.DataFrame({"article_id": art["article_id"], "row": range(len(art))})
    c2 = clicks.merge(idx_df, on="article_id", how="inner")
    # Weights depend only on the click age, so compute them once for all
    # clicks instead of once per user inside the loop.
    age = (ref_now - c2["timestamp"].to_numpy()).astype("float64")
    c2["w"] = np.exp(-lam * age) if lam > 0 else np.ones_like(age)
    for uid, grp in c2.groupby("user_id"):
        rows = grp["row"].to_numpy()
        w = grp["w"].to_numpy()
        # The epsilon keeps the division finite if every weight underflows to 0.
        v = (E[rows].astype("float64") * w[:, None]).sum(axis=0) / (w.sum() + 1e-12)
        user_ids.append(uid)
        profiles.append(v.astype("float32"))
    P = np.vstack(profiles).astype("float32")

    # 3c) Cold-start: min-max normalised recency and popularity, blended
    rec = (art["pub_ts"] - art["pub_ts"].min()) / max(1, (art["pub_ts"].max() - art["pub_ts"].min()))
    pop_base = art["pop_30d"] if art["pop_30d"].max() > 0 else art["pop_all"]
    pop = (pop_base - pop_base.min()) / max(1, (pop_base.max() - pop_base.min()))
    score = ALPHA_RECENCY * rec + (1 - ALPHA_RECENCY) * pop
    top = art.assign(score=score).sort_values(["score","pub_ts","pop_30d","pop_all"], ascending=[False,False,False,False])
    cold5 = top.head(TOPK_COLD)["article_id"].tolist()
    top200 = top.head(200)

    # 4) Save the artefacts (the output directory may not exist yet)
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    art.to_parquet(OUT_DIR / "articles_clean.parquet", index=False)
    np.save(OUT_DIR / "embeddings_clean.npy", E.astype("float32"))

    (OUT_DIR / "id_to_row.json").write_text(json.dumps(id_to_row))
    np.save(OUT_DIR / "user_profiles.npy", P)
    pd.Series(user_ids).to_csv(OUT_DIR / "user_ids.csv", index=False, header=False)
    (OUT_DIR / "user_to_idx.json").write_text(json.dumps({u: i for i, u in enumerate(user_ids)}))
    (OUT_DIR / "cold_start_top5.json").write_text(json.dumps(cold5))
    top200.to_parquet(OUT_DIR / "popular_recent.parquet", index=False)

    manifest = {
        "built_at": int(datetime.now(timezone.utc).timestamp()),
        "articles": int(len(art)),
        "embedding_dim": int(E.shape[1]),
        "users": int(len(user_ids)),
        "half_life_days": HALF_LIFE_DAYS,
        "alpha_recency": ALPHA_RECENCY,
        "topk_cold": TOPK_COLD,
    }
    (OUT_DIR / "build_manifest.json").write_text(json.dumps(manifest, indent=2))
    print(json.dumps(manifest, indent=2))


In [9]:
# Entry point: run the full artefact build when this code is executed directly.
# The guard keeps an import of this code as a module from triggering the build.
if __name__ == "__main__":
    main()


{
  "built_at": 1758666428,
  "articles": 364012,
  "embedding_dim": 250,
  "users": 322897,
  "half_life_days": 30,
  "alpha_recency": 0.6,
  "topk_cold": 5
}
