In [1]:
import pickle
import pandas as pd
from pathlib import Path
import math
import numpy as np
import json
from datetime import datetime, timezone
from sklearn.decomposition import PCA

# --- Input / output locations ---
# Every dataset file lives under one directory; change DATA_DIR to relocate them all.
DATA_DIR = Path("news-portal-user-interactions-by-globocom")
ARTICLES_PATH = DATA_DIR / "articles_metadata.csv"
EMBEDDINGS_PATH = DATA_DIR / "articles_embeddings.pickle"
CLICKS_DIR = DATA_DIR / "clicks"
OUT_DIR = Path("artifacts")

# --- User profiles and cold-start scoring ---
HALF_LIFE_DAYS = 30   # half-life of the exponential decay applied to click age in user profiles
ALPHA_RECENCY = 0.6   # cold-start score = ALPHA_RECENCY * recency + (1 - ALPHA_RECENCY) * popularity
TOPK_COLD = 5         # number of article ids kept in the cold-start list

# Articles are kept only when words_count > MIN_WORDS (0 drops empty articles).
MIN_WORDS = 0

# --- Dimensionality reduction ---
ENABLE_PCA = True
PCA_VARIANCE = 0.95    # explained-variance target; only used when PCA_N_COMPONENTS is None
PCA_N_COMPONENTS = 52  # fixed number of components; takes precedence over PCA_VARIANCE
PCA_DTYPE = "float32"  # embeddings are cast to float32 when set to "float32", otherwise float64

# L2-normalize article vectors and user profiles when True.
NORMALIZE_L2 = True

In [2]:
def l2_normalize(mat: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    """Scale every row of a 2-D array to unit Euclidean (L2) norm.

    Args:
        mat: 2-D array; each row is normalized independently.
        eps: Lower bound applied to the row norms so that all-zero rows
            divide by ``eps`` instead of zero (they stay all-zero).

    Returns:
        Array with the same shape as ``mat``. Floating and complex inputs keep
        their dtype; integer and boolean inputs are returned as float64.
    """
    # Casting the quotient back to an integer dtype would truncate every
    # normalized value (all in [-1, 1]) to 0, so promote such inputs first.
    if not np.issubdtype(mat.dtype, np.inexact):
        mat = mat.astype(np.float64)
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms = np.maximum(norms, eps)
    return (mat / norms).astype(mat.dtype, copy=False)

In [3]:
def load_articles_and_embeddings(articles_path=ARTICLES_PATH, embeddings_path=EMBEDDINGS_PATH, min_words=MIN_WORDS):
    """Load article metadata and embeddings, dropping articles that are too short.

    Args:
        articles_path: CSV with at least ``article_id``, ``created_at_ts`` and
            ``words_count`` columns.
        embeddings_path: Pickle file holding the embedding matrix. The row mask
            computed on the CSV is applied to it positionally, so it is assumed
            to be row-aligned with the CSV (TODO confirm against the dataset).
        min_words: Only rows with ``words_count > min_words`` are kept.

    Returns:
        ``(articles, embeddings)`` where ``articles`` has the columns
        ``article_id`` (str), ``pub_ts`` (int64, ``created_at_ts // 1000``) and
        ``words_count`` with a fresh 0..n-1 index, and ``embeddings`` holds the
        matching rows cast to float32 when ``PCA_DTYPE == "float32"``, float64
        otherwise.
    """
    metadata = pd.read_csv(articles_path)

    # Row filter that removes empty / too-short articles.
    keep = metadata['words_count'] > min_words

    # NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
    with open(embeddings_path, "rb") as handle:
        vectors = pickle.load(handle)  # np.ndarray (N, 250) according to the original note

    # Same mask on both sides keeps table rows and embedding rows in step.
    vectors = vectors[keep.values]
    kept = metadata[keep].copy()

    # presumably milliseconds -> seconds; verify against the dataset description
    kept['pub_ts'] = (kept['created_at_ts'] // 1000).astype('int64')

    kept = kept[['article_id', 'pub_ts', 'words_count']].reset_index(drop=True)
    kept["article_id"] = kept["article_id"].astype(str)

    target_dtype = 'float32' if PCA_DTYPE == "float32" else 'float64'
    return kept, vectors.astype(target_dtype, copy=False)

In [4]:
def load_clicks(clicks_dir=CLICKS_DIR):
    """Load every click CSV of a directory into one normalized DataFrame.

    Args:
        clicks_dir: Directory whose ``*.csv`` files are read (in sorted name
            order). Each file must provide the columns ``user_id``,
            ``click_article_id`` and ``click_timestamp``.

    Returns:
        DataFrame with a 0..n-1 index and the columns ``user_id`` (str),
        ``article_id`` (str, taken from ``click_article_id``) and ``timestamp``
        (int64, ``click_timestamp // 1000``).

    Raises:
        ValueError: If the directory contains no CSV file.
    """
    files = sorted(Path(clicks_dir).glob("*.csv"))
    if not files:
        # pd.concat would raise the same exception type, but without saying which path was empty.
        raise ValueError(f"No click CSV files found in {clicks_dir}")

    usecols = ["user_id", "click_article_id", "click_timestamp"]
    raw = pd.concat([pd.read_csv(p, usecols=usecols) for p in files], ignore_index=True)

    # Build the result in one step instead of assigning columns onto a
    # column-subset of the raw frame (which triggers SettingWithCopyWarning).
    return pd.DataFrame({
        "user_id": raw["user_id"].astype(str),
        "article_id": raw["click_article_id"].astype(str),
        # presumably milliseconds -> seconds; verify against the dataset description
        "timestamp": (raw["click_timestamp"] // 1000).astype("int64"),
    })

In [5]:
def fit_and_apply_pca(E: np.ndarray):
    """Fit a PCA on the embedding matrix and project the matrix onto it.

    The number of components is ``PCA_N_COMPONENTS`` when that constant is set;
    otherwise ``PCA_VARIANCE`` is passed to scikit-learn as a float
    ``n_components`` value.

    Args:
        E: Embedding matrix, one row per article.

    Returns:
        ``(X_pca, pca, report)``: the projected matrix, the fitted ``PCA``
        object, and a JSON-serializable dict with the retained dimension
        (``"K"``), the cumulative explained variance ratio and the per-component
        explained variance ratios.
    """
    if PCA_N_COMPONENTS is None:
        components = float(PCA_VARIANCE)
    else:
        components = PCA_N_COMPONENTS

    model = PCA(n_components=components, svd_solver="full", random_state=0)
    projected = model.fit_transform(E)

    ratios = model.explained_variance_ratio_.astype('float32')
    # Last element of the running sum = total variance explained by the kept components.
    total_explained = np.cumsum(ratios)[-1]

    summary = {
        "K": int(projected.shape[1]),
        "variance_expliquee_cumulee": float(total_explained),
        "variance_par_composante": ratios.tolist()
    }
    return projected, model, summary

In [6]:
def main():
    """Build every recommender artifact and write it under ``OUT_DIR``.

    Steps:
        1. Load the filtered articles and their embeddings.
        2. Optionally reduce the embeddings with PCA and/or L2-normalize them.
        3. Load the clicks restricted to the retained articles, then compute
           per-article popularity (all-time and over a window of at most 30
           days ending at the latest click), time-decayed user profiles, and
           a cold-start ranking mixing recency and popularity.
        4. Write the artifacts: ``articles_clean.parquet``,
           ``embeddings_clean.npy``, ``id_to_row.json``, ``user_profiles.npy``,
           ``user_ids.csv``, ``user_to_idx.json``, ``cold_start_top5.json``,
           ``popular_recent.parquet``, ``build_manifest.json`` and, when PCA is
           enabled, ``pca_model.pkl`` and ``pca_report.json``.

    The manifest is also printed as JSON.

    Raises:
        ValueError: If no click refers to a retained article.
    """
    # Every writer below assumes the output folder exists; create it so a
    # fresh checkout does not fail on the first write.
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # 1) Articles + embeddings (filtered)
    articles, E = load_articles_and_embeddings()
    id_to_row = {aid: i for i, aid in enumerate(articles["article_id"].tolist())}

    # 2) PCA (optional)
    if ENABLE_PCA:
        X_pca, pca, pca_report = fit_and_apply_pca(E)
        if NORMALIZE_L2:
            X_pca = l2_normalize(X_pca)
        E_eff = X_pca
        K_eff = int(E_eff.shape[1])

        # Persist the fitted PCA and its report
        with open(OUT_DIR / "pca_model.pkl", "wb") as f:
            pickle.dump(pca, f, protocol=pickle.HIGHEST_PROTOCOL)
        (OUT_DIR / "pca_report.json").write_text(json.dumps(pca_report, indent=2), encoding="utf-8")
    else:
        # No PCA: keep the raw embeddings, optionally L2-normalized
        E_eff = l2_normalize(E) if NORMALIZE_L2 else E
        K_eff = int(E_eff.shape[1])

    # 3) Clicks (restricted to the retained articles)
    clicks = load_clicks(CLICKS_DIR)
    clicks = clicks[clicks["article_id"].isin(articles["article_id"])]
    if clicks.empty:
        # Without this guard the min/max below are NaN and int() fails with an opaque message.
        raise ValueError("No clicks left after restricting to the retained articles")

    # --- Time reference: "now" is the most recent click in the data ---
    t_min, t_max = int(clicks["timestamp"].min()), int(clicks["timestamp"].max())
    ref_now = t_max
    data_span = max(1, t_max - t_min)

    # 30-day window, capped by the actual time span of the data
    thirty_days = 30 * 24 * 3600
    win_sec = min(thirty_days, data_span)
    cutoff = ref_now - win_sec

    # 3a) Popularity: click counts per article, all-time and inside the window
    pop_all = clicks.groupby("article_id").size().rename("pop_all")
    pop_30 = clicks[clicks["timestamp"] >= cutoff].groupby("article_id").size().rename("pop_30d")
    art = articles.merge(pop_all, left_on="article_id", right_index=True, how="left")
    art = art.merge(pop_30, left_on="article_id", right_index=True, how="left")
    # Articles without any click get a count of 0
    art[["pop_all", "pop_30d"]] = art[["pop_all", "pop_30d"]].fillna(0).astype("int64")

    # 3b) User profiles (in the effective embedding space E_eff)
    user_ids, profiles = [], []
    hl_sec = HALF_LIFE_DAYS * 24 * 3600.0
    # Decay rate such that a click's weight halves every HALF_LIFE_DAYS
    lam = math.log(2) / hl_sec if hl_sec > 0 else 0.0
    idx_df = pd.DataFrame({"article_id": art["article_id"], "row": range(len(art))})
    c2 = clicks.merge(idx_df, on="article_id", how="inner")
    for uid, grp in c2.groupby("user_id"):
        rows = grp["row"].to_numpy()
        age = (ref_now - grp["timestamp"].to_numpy()).astype("float64")
        w = np.exp(-lam * age) if lam > 0 else np.ones_like(age)
        # Weighted mean of the clicked articles' vectors; the epsilon guards a zero weight sum
        v = (E_eff[rows].astype("float64") * w[:, None]).sum(axis=0) / (w.sum() + 1e-12)
        profiles.append(v.astype("float32"))
        user_ids.append(uid)
    P = np.vstack(profiles).astype("float32")
    if NORMALIZE_L2:
        P = l2_normalize(P)

    # 3c) Cold-start: min-max scaled recency and popularity, blended by ALPHA_RECENCY
    rec = (art["pub_ts"] - art["pub_ts"].min()) / max(1, (art["pub_ts"].max() - art["pub_ts"].min()))
    # Fall back to all-time popularity when the window holds no click at all
    pop_base = art["pop_30d"] if art["pop_30d"].max() > 0 else art["pop_all"]
    pop = (pop_base - pop_base.min()) / max(1, (pop_base.max() - pop_base.min()))
    score = ALPHA_RECENCY * rec + (1 - ALPHA_RECENCY) * pop
    top = art.assign(score=score).sort_values(["score", "pub_ts", "pop_30d", "pop_all"],
                                              ascending=[False, False, False, False])
    cold5 = top.head(TOPK_COLD)["article_id"].tolist()
    top200 = top.head(200)

    # 4) Write the artifacts
    art.to_parquet(OUT_DIR / "articles_clean.parquet", index=False)

    # Article embeddings (reduced and/or normalized)
    np.save(OUT_DIR / "embeddings_clean.npy", E_eff.astype("float32"))

    # Index mappings and user profiles
    (OUT_DIR / "id_to_row.json").write_text(json.dumps(id_to_row), encoding="utf-8")
    np.save(OUT_DIR / "user_profiles.npy", P)
    pd.Series(user_ids).to_csv(OUT_DIR / "user_ids.csv", index=False, header=False)
    (OUT_DIR / "user_to_idx.json").write_text(
        json.dumps({u: i for i, u in enumerate(user_ids)}), encoding="utf-8"
    )

    # Cold-start list and popularity ranking
    (OUT_DIR / "cold_start_top5.json").write_text(json.dumps(cold5), encoding="utf-8")
    top200.to_parquet(OUT_DIR / "popular_recent.parquet", index=False)

    # PCA component count for the manifest: None when PCA is disabled (the raw
    # embedding width is not a PCA setting), the configured count when fixed,
    # otherwise the count actually retained for the variance target.
    if not ENABLE_PCA:
        pca_n_components = None
    elif PCA_N_COMPONENTS is not None:
        pca_n_components = PCA_N_COMPONENTS
    else:
        pca_n_components = K_eff

    # Manifest
    manifest = {
        "built_at": int(datetime.now(timezone.utc).timestamp()),
        "articles": int(len(art)),
        "embedding_dim": K_eff,
        "users": int(len(user_ids)),
        "half_life_days": HALF_LIFE_DAYS,
        "alpha_recency": ALPHA_RECENCY,
        "topk_cold": TOPK_COLD,
        "pca_enabled": ENABLE_PCA,
        "pca_variance_target": PCA_VARIANCE if ENABLE_PCA and PCA_N_COMPONENTS is None else None,
        "pca_n_components": pca_n_components,
        "normalized_l2": NORMALIZE_L2
    }
    (OUT_DIR / "build_manifest.json").write_text(json.dumps(manifest, indent=2), encoding="utf-8")
    print(json.dumps(manifest, indent=2))

In [7]:
# Entry point: run the full artifact build when this code is executed directly
# (the manifest printed by main() is the output shown below this cell).
if __name__ == "__main__":
    main()


{
  "built_at": 1759010297,
  "articles": 364012,
  "embedding_dim": 52,
  "users": 322897,
  "half_life_days": 30,
  "alpha_recency": 0.6,
  "topk_cold": 5,
  "pca_enabled": true,
  "pca_variance_target": null,
  "pca_n_components": 52,
  "normalized_l2": true
}
