In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import json
from tqdm import tqdm
import pickle
import os
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.feature_extraction.text import TfidfVectorizer

# Relative path (note the space in "KuaiRec 2.0") to the dataset directory.
# It is not referenced by any of the cells visible in this part of the notebook.
DATA_PATH = 'data_final_project/KuaiRec 2.0/data/'


In [None]:

def build_user_profiles(negative_threshold=0.2, negative_attenuation=0.5):
    """
    Build one unit-norm preference vector per user found in ``interactions_train``.

    For every user the profile is computed in four steps:

    1. Weighted centroid of the feature rows of the videos the user interacted
       with positively (``positive_interaction == 1``), weighted by
       ``watch_ratio_clamped``.
    2. Refinement: the same average, but each weight is additionally multiplied
       by the cosine similarity between the video and the centroid.
    3. Negative signal: a ``play_duration``-weighted sum of the feature rows of
       videos with ``watch_ratio_clamped < negative_threshold``, scaled by
       ``negative_attenuation`` and subtracted from the refined vector.
    4. L2 normalisation (skipped when the vector is all zeros).

    Reads the notebook-level globals ``interactions_train``,
    ``video_features_matrix`` and ``video_id_map``.
    NOTE(review): ``video_features_matrix`` is assumed to be a dense 2-D NumPy
    array (it is fancy-indexed with a list of rows and fed to ``np.average``)
    - confirm.

    Users with no usable positive interaction (no positive rows, or none of
    their liked videos present in ``video_id_map``) keep an all-zero profile.

    Args:
        negative_threshold (float): Interactions with a clamped watch ratio
            strictly below this value are treated as negative feedback.
        negative_attenuation (float): Factor applied to the negative vector
            before it is subtracted from the refined positive vector.

    Returns:
        tuple: ``(user_profiles, user_id_map, all_users)`` where
            ``user_profiles`` is an array of shape
            ``(n_users, video_features_matrix.shape[1])``, ``user_id_map`` maps
            each user id to its row in ``user_profiles`` and ``all_users`` is
            the array of unique user ids in first-appearance order.
    """
    print("Building enhanced user profiles...")

    # Split the training interactions into positive and negative feedback.
    positive_interactions = interactions_train[interactions_train['positive_interaction'] == 1]
    negative_interactions = interactions_train[
        interactions_train['watch_ratio_clamped'] < negative_threshold
    ]
    all_users = interactions_train['user_id'].unique()

    # user id -> row index in the profile matrix
    user_id_map = {user_id: idx for idx, user_id in enumerate(all_users)}
    user_profiles = np.zeros((len(all_users), video_features_matrix.shape[1]))

    # Group each frame once instead of re-scanning it for every user, and work
    # on plain arrays so the per-user loop avoids DataFrame.iterrows().
    pos_video_ids = positive_interactions['video_id'].to_numpy()
    pos_ratios = positive_interactions['watch_ratio_clamped'].to_numpy(dtype=float)
    pos_rows_by_user = positive_interactions.groupby('user_id').indices

    neg_video_ids = negative_interactions['video_id'].to_numpy()
    neg_durations = negative_interactions['play_duration'].to_numpy(dtype=float)
    neg_rows_by_user = negative_interactions.groupby('user_id').indices

    # Users without any positive interaction are simply absent from the groups
    # and therefore keep their zero profile.
    for user_id, pos_rows in tqdm(pos_rows_by_user.items(), desc="Users"):
        # Liked videos that have a feature row, with their watch-ratio weight.
        pos_indices, pos_weights = [], []
        for vid, ratio in zip(pos_video_ids[pos_rows], pos_ratios[pos_rows]):
            if vid in video_id_map:
                pos_indices.append(video_id_map[vid])
                pos_weights.append(ratio)

        if not pos_indices:
            # None of the liked videos has features: nothing to average.
            continue

        # Weighted centroid of the liked videos.
        liked_feats = video_features_matrix[pos_indices]
        pos_weights = np.asarray(pos_weights, dtype=float)
        weight_total = pos_weights.sum()
        if np.isfinite(weight_total) and weight_total > 0:
            pos_weights = pos_weights / weight_total
        else:
            # Degenerate weights (all zero / NaN): fall back to a plain mean
            # rather than dividing by zero.
            pos_weights = np.full(len(pos_indices), 1.0 / len(pos_indices))

        centroid = np.average(liked_feats, axis=0, weights=pos_weights)

        # Refinement: re-weight each liked video by its similarity to the centroid.
        sims = cosine_similarity([centroid], liked_feats)[0]
        refined_weights = pos_weights * sims
        refined_total = refined_weights.sum()
        if np.isfinite(refined_total) and not np.isclose(refined_total, 0.0):
            refined = np.average(liked_feats, axis=0, weights=refined_weights)
        else:
            # np.average raises ZeroDivisionError when the weights sum to zero
            # (e.g. zero centroid); keep the unrefined centroid in that case.
            refined = centroid

        # Negative feedback: play-duration-weighted sum of barely watched videos.
        neg_vector = np.zeros_like(refined)
        neg_rows = neg_rows_by_user.get(user_id)
        if neg_rows is not None:
            durations = neg_durations[neg_rows]
            # Denominator covers all of the user's negative rows (as before),
            # computed once instead of once per row.
            total_duration = durations.sum()
            if np.isfinite(total_duration) and total_duration > 0:
                for vid, duration in zip(neg_video_ids[neg_rows], durations):
                    if vid in video_id_map:
                        neg_vector += (duration / total_duration) * video_features_matrix[video_id_map[vid]]
            # else: zero total play time would give NaN weights; skip negatives.
        neg_vector *= negative_attenuation

        # Final profile: refined positive vector minus attenuated negative vector.
        profile = refined - neg_vector

        # L2 normalisation (a zero vector is left untouched).
        norm = np.linalg.norm(profile)
        if norm > 0:
            profile = profile / norm

        user_profiles[user_id_map[user_id]] = profile

    print(f"Built {len(user_id_map)} refined user profiles.")
    return user_profiles, user_id_map, all_users

# Run the profile builder and keep its three return values at notebook scope:
# the profile matrix, the user_id -> row index mapping, and the array of user ids.
# NOTE(review): evaluate_model() later in this notebook reads `user_id_map`, not
# `user_id_map_2` - confirm which set of profiles the evaluation is meant to use.
user_profiles_2, user_id_map_2, unique_users_2 = build_user_profiles()
print("User profiles built successfully!")
# Bare last expression: rendered as the cell's output.
user_profiles_2

In [None]:
from sklearn.metrics import ndcg_score
from tqdm import tqdm

def evaluate_model(k=10):
    """
    Evaluate the recommender with Precision@k, Recall@k and NDCG@k.

    Ground truth is the set of test interactions flagged as positive whose
    user is a key of ``user_id_map`` and whose video is a key of
    ``video_id_map``. For each such user, ``generate_recommendations`` is asked
    for ``k`` items (without excluding already watched videos) and the ranked
    list is compared with the user's ground-truth set.

    Metric definitions (binary relevance):
        * Precision@k = hits / k
        * Recall@k    = hits / number of relevant items
        * NDCG@k      = DCG@k / IDCG@k, where the ideal DCG places
          ``min(k, number of relevant items)`` hits at the top. Normalising
          by the user's full relevant set (rather than only by the hits that
          happen to be in the returned list) keeps a single lucky hit at rank 1
          from scoring a perfect 1.0.

    Reads the notebook-level globals ``user_id_map``, ``video_id_map``,
    ``interactions_test`` and ``generate_recommendations``.

    Args:
        k (int): Top-K cutoff; must be at least 1.

    Returns:
        dict: Mean ``'precision@k'``, ``'recall@k'`` and ``'ndcg@k'`` over the
            evaluated users (NaN when no user could be evaluated), plus
            ``'users_evaluated'``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    print(f"Évaluation à top-{k}...")

    valid_users = set(user_id_map.keys())
    valid_videos = set(video_id_map.keys())

    # Keep only positive test interactions between known users and known videos.
    filtered_test = interactions_test[
        (interactions_test['positive_interaction'] == 1) &
        (interactions_test['user_id'].isin(valid_users)) &
        (interactions_test['video_id'].isin(valid_videos))
    ]

    true_items_per_user = filtered_test.groupby('user_id')['video_id'].apply(set).to_dict()

    # Rank discounts 1 / log2(rank + 1) for ranks 1..k, shared by DCG and IDCG.
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    precision_list = []
    recall_list = []
    ndcg_list = []
    n_skipped = 0

    for user_id, true_items in tqdm(true_items_per_user.items(), desc="Utilisateurs évalués"):
        true_items = set(map(int, true_items))

        # Check the cheap condition before paying for a recommendation call.
        if len(true_items) == 0:
            n_skipped += 1
            continue

        # Generate the recommendations as (video_id, score) pairs.
        recommended_pairs = generate_recommendations(user_id, top_n=k, exclude_watched=False)

        if not recommended_pairs:
            n_skipped += 1
            continue

        # Keep only the video ids, never more than k of them.
        recommended_ids = [int(vid_id) for vid_id, _ in recommended_pairs][:k]

        # Binary gain per rank; a video repeated in the list only counts once,
        # which keeps the hit count equal to the size of the set intersection.
        gains = []
        already_seen = set()
        for vid in recommended_ids:
            is_hit = vid in true_items and vid not in already_seen
            already_seen.add(vid)
            gains.append(1.0 if is_hit else 0.0)
        n_hits = sum(gains)

        # Precision@k (always divided by k, even if fewer items were returned).
        precision_list.append(n_hits / k)

        # Recall@k
        recall_list.append(n_hits / len(true_items))

        # NDCG@k; works for any list length from 1 to k.
        dcg = float(np.dot(gains, discounts[:len(gains)]))
        idcg = float(discounts[:min(k, len(true_items))].sum())
        ndcg_list.append(dcg / idcg)

    print(f"Utilisateurs évalués : {len(precision_list)} / {len(true_items_per_user)}")
    print(f"Utilisateurs ignorés : {n_skipped}")

    def _mean_or_nan(values):
        # np.mean([]) warns and returns NaN; return NaN explicitly instead.
        return float(np.mean(values)) if values else float('nan')

    return {
        'precision@k': _mean_or_nan(precision_list),
        'recall@k': _mean_or_nan(recall_list),
        'ndcg@k': _mean_or_nan(ndcg_list),
        'users_evaluated': len(precision_list)
    }


# Evaluate at cutoff 10 and print the three averaged metrics with four decimals.
# The "@10" in the labels is hard-coded: keep it in sync with `k` if the cutoff changes.
results = evaluate_model(k=10)
print(f"Precision@10: {results['precision@k']:.4f}")
print(f"Recall@10:    {results['recall@k']:.4f}")
print(f"NDCG@10:      {results['ndcg@k']:.4f}")

: 