In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from scipy.sparse import csr_matrix, hstack
import pickle

In [2]:
# Charger les données prétraitées
train_data = pd.read_csv('interactions_train.csv')
test_data = pd.read_csv('interactions_test.csv')

  test_data = pd.read_csv('interactions_test.csv')


In [3]:
# Charger les données pour l'extraction de caractéristiques
kuairec_caption_category = pd.read_csv(
    "data_final_project/KuaiRec 2.0/data/kuairec_caption_category.csv",
    engine="python", sep=",", quotechar='"', on_bad_lines='skip'
)
item_categories = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_categories.csv")
item_daily_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_daily_features.csv")
social_network = pd.read_csv("data_final_project/KuaiRec 2.0/data/social_network.csv")
user_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/user_features.csv")


### Création des caractéristiques utilisateurs

In [4]:
# 1.1 Comportement de visionnement moyen par utilisateur
user_behavior = train_data.groupby('user_id').agg({
    'watch_ratio': ['mean', 'std', 'count'],
    'play_duration': ['mean', 'sum'],
    'positive_interaction': 'sum'
}).reset_index()

user_behavior.columns = ['user_id', 'avg_watch_ratio', 'std_watch_ratio', 'interaction_count', 
                        'avg_play_duration', 'total_play_duration', 'positive_interactions']

user_behavior


Unnamed: 0,user_id,avg_watch_ratio,std_watch_ratio,interaction_count,avg_play_duration,total_play_duration,positive_interactions
0,14,1.025346,1.023801,2657,9796.562288,26029466,2014
1,19,0.846335,0.404440,2601,7848.119954,20412960,2213
2,21,0.980583,1.021713,2701,9124.169567,24644382,2273
3,23,1.049398,1.532523,2770,10102.611191,27984233,1569
4,24,0.867872,1.313065,2679,8252.480030,22108394,1361
...,...,...,...,...,...,...,...
1406,7142,0.760655,0.364329,2677,7176.198356,19210683,2139
1407,7147,0.964763,0.469772,2632,8798.145137,23156718,2329
1408,7153,0.827530,1.287836,2672,7654.366392,20452467,1690
1409,7159,0.734130,0.584409,2712,6997.790192,18978007,1581


In [5]:
# 1.2 Préférence de durée de vidéo par utilisateur
user_duration_pref = train_data.groupby('user_id')['video_duration'].agg(['mean', 'std']).reset_index()
user_duration_pref.columns = ['user_id', 'preferred_video_duration', 'video_duration_std']

user_duration_pref

Unnamed: 0,user_id,preferred_video_duration,video_duration_std
0,14,13795.019195,19551.679058
1,19,13758.162630,19158.136638
2,21,13807.367642,19589.560121
3,23,13762.223105,18986.906220
4,24,13910.120941,19154.255695
...,...,...,...
1406,7142,14259.753455,20522.749633
1407,7147,13558.195289,19755.487318
1408,7153,13988.272455,18978.253984
1409,7159,14219.019912,20381.720369


In [6]:
# 1.3 Fusionner avec les caractéristiques utilisateurs du dataset original
user_features_subset = user_features[['user_id', 'user_active_degree', 'follow_user_num', 
                                     'fans_user_num', 'friend_user_num', 'register_days']]


# Fusionner toutes les caractéristiques utilisateurs
user_features_enriched = user_behavior.merge(user_duration_pref, on='user_id', how='left')
user_features_enriched = user_features_enriched.merge(user_features_subset, on='user_id', how='left')

user_features_enriched

Unnamed: 0,user_id,avg_watch_ratio,std_watch_ratio,interaction_count,avg_play_duration,total_play_duration,positive_interactions,preferred_video_duration,video_duration_std,user_active_degree,follow_user_num,fans_user_num,friend_user_num,register_days
0,14,1.025346,1.023801,2657,9796.562288,26029466,2014,13795.019195,19551.679058,full_active,73,6,1,279
1,19,0.846335,0.404440,2601,7848.119954,20412960,2213,13758.162630,19158.136638,full_active,3,2,1,146
2,21,0.980583,1.021713,2701,9124.169567,24644382,2273,13807.367642,19589.560121,high_active,63,0,0,154
3,23,1.049398,1.532523,2770,10102.611191,27984233,1569,13762.223105,18986.906220,full_active,94,1,0,261
4,24,0.867872,1.313065,2679,8252.480030,22108394,1361,13910.120941,19154.255695,full_active,59,3,0,298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1406,7142,0.760655,0.364329,2677,7176.198356,19210683,2139,14259.753455,20522.749633,full_active,6,0,0,268
1407,7147,0.964763,0.469772,2632,8798.145137,23156718,2329,13558.195289,19755.487318,full_active,99,1,0,297
1408,7153,0.827530,1.287836,2672,7654.366392,20452467,1690,13988.272455,18978.253984,full_active,7,0,0,544
1409,7159,0.734130,0.584409,2712,6997.790192,18978007,1581,14219.019912,20381.720369,full_active,9,0,0,86


### Création des caractéristiques des vidéos

In [7]:
# 2.1 Popularité des vidéos basée sur les interactions
video_popularity = train_data.groupby('video_id').agg({
    'watch_ratio': ['mean', 'count'],
    'positive_interaction': 'sum'
}).reset_index()

video_popularity.columns = ['video_id', 'avg_watch_ratio', 'view_count', 'positive_interactions']
video_popularity['popularity_score'] = video_popularity['positive_interactions'] / video_popularity['view_count']

video_popularity

Unnamed: 0,video_id,avg_watch_ratio,view_count,positive_interactions,popularity_score
0,103,0.781970,1249,979,0.783827
1,109,1.044293,1340,1187,0.885821
2,120,1.411307,1358,1241,0.913844
3,122,0.846462,1282,1066,0.831513
4,128,0.774669,1370,1020,0.744526
...,...,...,...,...,...
2988,10377,1.017218,15,12,0.800000
2989,10378,1.344247,1235,1138,0.921457
2990,10382,1.449722,414,393,0.949275
2991,10385,1.647607,1,1,1.000000


In [8]:
# 2.2 Fusionner avec les caractéristiques des vidéos du dataset original
# Agréger les métriques d'engagement quotidiennes en moyennes par vidéo
video_engagement = item_daily_features.groupby('video_id').agg({
    'play_cnt': 'mean', 
    'like_cnt': 'mean',
    'comment_cnt': 'mean',
    'share_cnt': 'mean',
    'play_user_num': 'mean',
    'like_user_num': 'mean',
    'video_duration': 'first'  # Prendre la première valeur car elle est constante par vidéo
}).reset_index()

video_engagement

Unnamed: 0,video_id,play_cnt,like_cnt,comment_cnt,share_cnt,play_user_num,like_user_num,video_duration
0,0,6534.777778,388.777778,7.111111,1.666667,5042.587302,385.634921,5966.0
1,1,2248.682540,131.222222,0.936508,1.015873,1970.412698,126.412698,
2,2,10638.857143,50.476190,0.476190,0.301587,6169.333333,49.857143,8000.0
3,3,145.031746,14.936508,0.111111,0.412698,124.063492,14.761905,
4,4,11.730769,0.057692,0.000000,0.000000,10.596154,0.057692,18000.0
...,...,...,...,...,...,...,...,...
10723,10723,214.000000,24.000000,0.000000,0.000000,157.000000,24.000000,4833.0
10724,10724,965.000000,264.000000,29.000000,1.000000,856.000000,261.000000,54720.0
10725,10725,15487.000000,851.000000,36.000000,3.000000,14672.000000,845.000000,15800.0
10726,10726,7859.000000,44.000000,0.000000,1.000000,7480.000000,44.000000,5132.0


In [10]:
# 2.3 Ajouter les informations de catégorie (si disponibles)
if not kuairec_caption_category.empty:
    # Extraire les catégories principales
    video_categories = kuairec_caption_category[['video_id', 'first_level_category_name']]
    # Encoder en one-hot les catégories principales
    enc = OneHotEncoder(sparse=False)
    category_onehot = enc.fit_transform(video_categories[['first_level_category_name']])
    category_cols = [f'category_{i}' for i in range(category_onehot.shape[1])]
    category_df = pd.DataFrame(category_onehot, columns=category_cols)
    category_df['video_id'] = video_categories['video_id'].values
    
    # Fusionner avec les caractéristiques des vidéos
    video_features = video_popularity.merge(video_engagement, on='video_id', how='left')
    video_features = video_features.merge(category_df, on='video_id', how='left')
else:
    video_features = video_popularity.merge(video_engagement, on='video_id', how='left')

print(f"Caractéristiques vidéo créées: {video_features.shape}")
print(video_features.head())

ValueError: You are trying to merge on int64 and object columns for key 'video_id'. If you wish to proceed you should use pd.concat