In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
import json

In [15]:
# Charger les données
interactions = pd.read_csv("data_final_project/KuaiRec 2.0/data/small_matrix.csv")
kuairec_caption_category = pd.read_csv(
    "data_final_project/KuaiRec 2.0/data/kuairec_caption_category.csv",
    engine="python", sep=",", quotechar='"', on_bad_lines='skip'
)
item_categories = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_categories.csv")
item_daily_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/item_daily_features.csv")
social_network = pd.read_csv("data_final_project/KuaiRec 2.0/data/social_network.csv")
user_features = pd.read_csv("data_final_project/KuaiRec 2.0/data/user_features.csv")


In [17]:
display(interactions.head(5000))

Unnamed: 0,user_id,video_id,play_duration,video_duration,time,date,timestamp,watch_ratio
0,14,148,4381,6067,2020-07-05 05:27:48.378,20200705.0,1.593898e+09,0.722103
1,14,183,11635,6100,2020-07-05 05:28:00.057,20200705.0,1.593898e+09,1.907377
2,14,3649,22422,10867,2020-07-05 05:29:09.479,20200705.0,1.593898e+09,2.063311
3,14,5262,4479,7908,2020-07-05 05:30:43.285,20200705.0,1.593898e+09,0.566388
4,14,8234,4602,11000,2020-07-05 05:35:43.459,20200705.0,1.593899e+09,0.418364
...,...,...,...,...,...,...,...,...
4995,19,5818,7005,9700,2020-08-01 07:29:38.403,20200801.0,1.596238e+09,0.722165
4996,19,7298,8411,12367,2020-08-01 07:29:56.81,20200801.0,1.596238e+09,0.680116
4997,19,5664,8243,17134,2020-08-01 07:30:05.171,20200801.0,1.596238e+09,0.481090
4998,19,839,5975,7400,2020-08-01 07:34:38.324,20200801.0,1.596238e+09,0.807432


In [43]:
# Créer un indicateur d'interaction positive basé sur le temps de visionnement
# Si l'utilisateur regarde au moins 50% de la vidéo, on considère que c'est un feedback positif
interactions['positive_interaction'] = (interactions['watch_ratio'] >= 0.5).astype(int)

In [39]:
social_network['num_friends'] = social_network['friend_list_parsed'].apply(len)

In [32]:
# Diviser les données en ensembles d'entraînement et de test
# Utiliser une division chronologique si possible, sinon utiliser une division aléatoire
if 'timestamp' in interactions.columns:
    # Trier par timestamp
    interactions = interactions.sort_values('timestamp')
    # Prendre les 80% premières interactions pour l'entraînement
    split_idx = int(len(interactions) * 0.8)
    train_data = interactions.iloc[:split_idx]
    test_data = interactions.iloc[split_idx:]
else:
    # Division aléatoire stratifiée par utilisateur
    unique_users = interactions['user_id'].unique()
    train_users, test_users = train_test_split(unique_users, test_size=0.2, random_state=42)
    
    train_data = interactions[interactions['user_id'].isin(train_users)]
    test_data = interactions[interactions['user_id'].isin(test_users)]

print(f"Taille de l'ensemble d'entraînement: {train_data.shape}")
print(f"Taille de l'ensemble de test: {test_data.shape}")

# Sauvegarder les données prétraitées
train_data.to_csv('interactions_train.csv', index=False)
test_data.to_csv('interactions_test.csv', index=False)

Taille de l'ensemble d'entraînement: (3741256, 9)
Taille de l'ensemble de test: (935314, 9)


In [35]:
test_users = test_data['user_id'].unique()
test_videos = interactions['video_id'].unique()

In [36]:
# # Exemple de format de soumission (top 10 vidéos pour chaque utilisateur)
# sample_submission = pd.DataFrame({
#     'user_id': np.repeat(test_users, 10),
#     'video_id': [0] * (len(test_users) * 10),  # Sera remplacé par les vraies prédictions
#     'rank': np.tile(range(1, 11), len(test_users))
# })
# sample_submission.to_csv('sample_submission.csv', index=False)

# print("Prétraitement terminé. Fichiers sauvegardés: interactions_train.csv, interactions_test.csv, sample_submission.csv")

In [42]:
num_users = train_data['user_id'].nunique()
num_videos = train_data['video_id'].nunique()

print(f"Number of unique users in train data: {num_users}")
print(f"Number of unique videos in train data: {num_videos}")

Number of unique users in train data: 1411
Number of unique videos in train data: 2993


In [None]:
# Créer une matrice d'interactions utilisateur-item
def create_interaction_matrix(data, user_col='user_id', item_col='video_id', rating_col='positive_interaction'):
    """
    Crée une matrice d'interactions entre utilisateurs et items.
    """
    interactions = data.groupby([user_col, item_col])[rating_col].sum().unstack().fillna(0)
    return interactions

# Créer la matrice pour l'ensemble d'entraînement
train_matrix = create_interaction_matrix(train_data)
print(f"Dimensions de la matrice d'interactions d'entraînement: {train_matrix.shape}")
print(f"Densité de la matrice: {train_matrix.count().sum() / (train_matrix.shape[0] * train_matrix.shape[1]):.4f}")

# Sauvegarder la matrice d'interactions
train_matrix.to_pickle('train_interaction_matrix.pkl')

Dimensions de la matrice d'interactions d'entraînement: (1411, 2993)
Densité de la matrice: 1.0000


In [38]:
train_matrix

video_id,103,109,120,122,128,130,131,133,136,137,...,10370,10371,10372,10374,10375,10377,10378,10382,10385,10386
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
19,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
21,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
24,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7142,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7147,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7153,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
7159,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
