In [None]:
from pathlib import Path
# pathlib.Path: a more modern, object-oriented alternative to os.path

import pickle
import numpy as np
import pandas as pd
import json

In [None]:
# Root folder of the dataset, given relative to the notebook's working directory.
DATA_DIR = Path("news-portal-user-interactions-by-globocom")
# Sub-folder of DATA_DIR that holds the click CSV files.
CLICK_DIR = DATA_DIR.joinpath("clicks")

In [9]:
# Article metadata; this cell relies on its "article_id" and "words_count" columns.
articles = pd.read_csv(DATA_DIR / "articles_metadata.csv")

# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load this file when it comes from a trusted source.
with open(DATA_DIR / "articles_embeddings.pickle", "rb") as f:
    embeddings = pickle.load(f)

# The boolean mask built from `articles` is applied positionally to `embeddings`,
# which assumes row i of `embeddings` describes row i of `articles`.
# Check the sizes up front so a mismatch fails with an explicit message.
assert len(articles) == len(embeddings), (
    f"metadata has {len(articles)} rows but embeddings has {len(embeddings)}"
)
# `id_to_row` is keyed by article_id: duplicated ids would silently overwrite each other.
assert articles["article_id"].is_unique, "article_id must be unique in the metadata"

# Drop empty articles (no words) from both the metadata and the embedding matrix.
mask_valid = articles["words_count"] > 0
articles_clean = articles[mask_valid].copy()
embeddings_clean = embeddings[mask_valid.to_numpy()]

# Sanity check: metadata and embeddings are still aligned after filtering.
assert len(articles_clean) == len(embeddings_clean)

clean_ids = articles_clean["article_id"].tolist()
# Set of valid article ids, used to filter clicks (O(1) membership tests).
valid_ids = set(clean_ids)
# article_id -> positional row in `embeddings_clean`.
id_to_row = {article_id: row for row, article_id in enumerate(clean_ids)}



In [10]:
# Click CSV files (e.g. clicks_hour_000.csv, ...), sorted by path.
# Only regular *.csv files are kept: iterdir() also yields any other entry of the
# folder (hidden files, checkpoint directories, ...), which must not reach pd.read_csv.
all_click_files = sorted(
    p for p in CLICK_DIR.iterdir()
    if p.is_file() and p.suffix.lower() == ".csv"
)

# SAMPLE: only the first N_SAMPLE_FILES files are processed.
N_SAMPLE_FILES = 24
sample_files = all_click_files[:N_SAMPLE_FILES]
len(sample_files), sample_files[:3]

(24,
 [PosixPath('news-portal-user-interactions-by-globocom/clicks/clicks_hour_000.csv'),
  PosixPath('news-portal-user-interactions-by-globocom/clicks/clicks_hour_001.csv'),
  PosixPath('news-portal-user-interactions-by-globocom/clicks/clicks_hour_002.csv')])

In [11]:
from collections import defaultdict

# Per-user accumulators, filled incrementally while streaming over the click files.
embedding_dim = embeddings_clean.shape[1]
user_sum = defaultdict(lambda: np.zeros(embedding_dim, dtype=np.float32))  # sum of article vectors
user_cnt = defaultdict(int)                                                # number of articles summed
user_seen = defaultdict(set)                                               # distinct article ids credited to the user

# Only the two columns needed to build the profiles are read from each file.
usecols = ["user_id", "click_article_id"]

for path in sample_files:
    df = pd.read_csv(path, usecols=usecols)
    # Keep only clicks that point to valid (non-empty) articles.
    df = df[df["click_article_id"].isin(valid_ids)]
    if df.empty:
        continue

    # Series indexed by user_id whose values are the lists of clicked article ids.
    grouped = df.groupby("user_id")["click_article_id"].apply(list)

    for uid, art_list in grouped.items():
        # Credit each article at most once per user, within this file AND across
        # files: an article already in user_seen[uid] must not be summed or counted
        # again, otherwise n_articles and the mean depend on how clicks are split
        # into files.
        new_ids = [
            aid for aid in set(art_list) - user_seen[uid]
            if aid in id_to_row  # defensive: ignore ids without an embedding row
        ]
        if not new_ids:
            continue
        user_seen[uid].update(new_ids)

        # Add the embedding vectors of the newly seen articles.
        idxs = [id_to_row[aid] for aid in new_ids]
        vecs = embeddings_clean[idxs]  # one row per newly seen article
        user_sum[uid] += vecs.sum(axis=0)
        user_cnt[uid] += len(idxs)

# One record per user: the profile vector is the mean of the summed article vectors.
rows = [
    {
        "user_id": uid,
        "n_articles": cnt,
        "seen_articles": list(user_seen[uid]),  # kept for debugging / later filtering
        "profile_vector": (user_sum[uid] / cnt).astype(np.float32),
    }
    for uid, cnt in user_cnt.items()
    if cnt > 0
]

# Explicit columns keep the schema stable even when no user was collected.
user_profiles = pd.DataFrame(
    rows, columns=["user_id", "n_articles", "seen_articles", "profile_vector"]
)
len(user_profiles), user_profiles.head(3)


(39218,
    user_id  n_articles                   seen_articles  \
 0        0           2                 [68866, 157541]   
 1        1           4  [235840, 160474, 59758, 96663]   
 2        2           2                 [119592, 30970]   
 
                                       profile_vector  
 0  [-0.022445861, -0.9760792, -0.2589676, -0.0532...  
 1  [-0.19884185, -0.96625423, -0.37118137, -0.048...  
 2  [-0.7338902, -0.9666825, -0.12142117, -0.74176...  )