<a href="https://colab.research.google.com/github/dashatenoff/recsys-vk/blob/main/notebooks/supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised Model


In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from google.colab import drive
import os

In [None]:
# Mount Google Drive so the competition files become visible under /content/drive.
drive.mount('/content/drive')

# Single place to edit if the dataset folder is moved or renamed.
DATA_DIR = '/content/drive/MyDrive/VK'

# Interaction logs (train / test) and the side information used as features.
train = pd.read_parquet(os.path.join(DATA_DIR, 'train.parquet'))
test = pd.read_parquet(os.path.join(DATA_DIR, 'test.parquet'))
item_embeddings = pd.read_parquet(os.path.join(DATA_DIR, 'item_embeddings.parquet'))
item_metadata = pd.read_parquet(os.path.join(DATA_DIR, 'item_metadata.parquet'))
user_metadata = pd.read_parquet(os.path.join(DATA_DIR, 'user_metadata.parquet'))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Peek at the first five raw interaction rows to check the schema.
train.head(n=5)

Unnamed: 0,user_id,item_id,place,platform,agent,timespent,like,dislike,share,bookmark,click_on_author,open_comments
0,141827770,160593626,1,1,1,56,False,False,False,False,False,False
1,468779351,327484273,1,0,0,1,False,False,False,False,False,False
2,494341617,402842289,1,0,0,11,False,False,False,False,False,False
3,453833313,102672169,0,0,0,45,False,False,False,False,False,False
4,154047442,139795075,1,0,0,39,False,False,False,False,False,False


In [None]:
# Build the supervised training table in one chain:
#   1. keep the interaction keys and watch time,
#   2. derive the binary target `rating` (True when timespent > 5),
#   3. left-join user attributes, item attributes and item embeddings so that
#      every interaction row is kept even when side information is missing.
train_sup = (
    train[['user_id', 'item_id', 'timespent']]
    .assign(rating=lambda frame: frame['timespent'] > 5)
    .merge(
        user_metadata[['user_id', 'age', 'gender', 'geo']],
        how='left',
        on='user_id',
    )
    .merge(
        item_metadata[['item_id', 'author_id', 'duration']],
        how='left',
        on='item_id',
    )
    .merge(
        item_embeddings[['item_id', 'embedding']],
        how='left',
        on='item_id',
    )
)

# Show one embedding to check its length and value range.
train_sup['embedding'].iloc[0]

array([-0.52490234,  0.065979  ,  0.05831909, -0.01412201, -0.25610352,
       -0.12597656,  0.09906006,  0.13769531,  0.10577393,  0.31152344,
       -0.15002441, -0.13293457,  0.01779175, -0.07501221,  0.05148315,
        0.29614258, -0.0894165 , -0.056427  , -0.10980225,  0.04632568,
       -0.03396606,  0.20874023,  0.18920898,  0.04910278,  0.08551025,
       -0.05654907, -0.02313232, -0.04901123, -0.05541992, -0.02261353,
       -0.06463623, -0.15270996])

In [None]:
# Inspect the joined table: keys, target, user/item attributes, raw embedding.
train_sup.head(n=5)

Unnamed: 0,user_id,item_id,timespent,rating,age,gender,geo,author_id,duration,embedding
0,141827770,160593626,56,True,43,1,78,250935,83,"[-0.52490234375, 0.06597900390625, 0.058319091..."
1,468779351,327484273,1,False,26,1,67,1178975,12,"[-0.390869140625, 0.140869140625, 0.0028190612..."
2,494341617,402842289,11,True,55,2,23,1238991,11,"[-0.478759765625, 0.2003173828125, 0.213500976..."
3,453833313,102672169,45,True,42,2,77,1040966,45,"[-0.28125, -0.1488037109375, 0.30322265625, 0...."
4,154047442,139795075,39,True,27,2,67,361464,14,"[-0.658203125, 0.1685791015625, -0.0888671875,..."


In [None]:
# Expand the array-valued 'embedding' column into a wide frame with one column
# per embedding dimension, aligned to train_sup by index.
emb_df = pd.DataFrame(
    data=train_sup['embedding'].tolist(),
    index=train_sup.index,
)


In [None]:
# Give the expanded embedding columns prefixed names (emb_0, emb_1, ...).
emb_df.columns = [f'emb_{i}' for i in range(emb_df.shape[1])]

# Replace the single array-valued 'embedding' column with one numeric column
# per dimension. errors='ignore', together with dropping any emb_* columns
# attached by a previous run, keeps this cell safe to re-execute: it neither
# raises on the already-removed 'embedding' column nor duplicates emb_* columns.
train_sup = pd.concat(
    [
        train_sup.drop(columns=['embedding', *emb_df.columns], errors='ignore'),
        emb_df,
    ],
    axis=1,
)

In [None]:
# Sanity check: count missing values in the target column.
train_sup['rating'].isna().sum()


np.int64(0)

In [None]:
# User and item attributes taken from the metadata tables.
meta_cols = ['age', 'gender', 'geo', 'author_id', 'duration']

# Every expanded embedding dimension (columns named emb_0, emb_1, ...).
embedding_cols = [name for name in train_sup.columns if name.startswith('emb_')]

feature_cols = meta_cols + embedding_cols

# Feature matrix and 0/1 target derived from the boolean `rating` column.
X = train_sup[feature_cols]
y = train_sup['rating'].astype(int)


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split; stratifying on y keeps the class ratio the same in
# both parts, and the fixed seed makes the split reproducible.
split_kwargs = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_kwargs)

(X_train.shape, X_val.shape)



((3108253, 37), (777064, 37))

# Обучаем LightGBM

In [None]:
import lightgbm as lgb

# Baseline gradient-boosting configuration. class_weight='balanced' reweights
# the two classes; random_state fixes the seed; n_jobs=-1 uses all cores.
lgbm_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'num_leaves': 31,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}

model = lgb.LGBMClassifier(**lgbm_params)


In [None]:
# Fit the classifier on the training part of the split.
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 1639577, number of negative: 1468676
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.207855 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8727
[LightGBM] [Info] Number of data points in the train set: 3108253, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


In [None]:
# predict_proba returns one column per class; the target is 0/1, so column 1
# holds the predicted probability of the positive class (rating == 1).
val_proba = model.predict_proba(X_val)
y_val_pred = val_proba[:, 1]

# ROC-AUC

In [None]:
from sklearn.metrics import roc_auc_score

# Threshold-independent ranking quality of the predicted probabilities
# on the validation split.
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
roc_auc

np.float64(0.6783868775905818)

### Supervised Model: LightGBM (Baseline)

В рамках supervised-подхода была обучена модель **LightGBM** для предсказания бинарного таргета `rating` (просмотр видео более 5 секунд).  
В качестве признаков использовались демографические характеристики пользователя (`age`, `gender`, `geo`), атрибуты видео (`author_id`, `duration`) и контентные эмбеддинги айтемов.

Данные были разбиты на обучающую и валидационную выборки (80/20) со стратификацией по таргету, а при обучении использовался `class_weight='balanced'`. Качество оценивалось на валидации с помощью метрики **ROC-AUC**, которая позволяет проверить наличие обучаемого сигнала независимо от порога классификации.

Полученное значение **ROC-AUC ≈ 0.68** подтверждает, что модель способна различать релевантные и нерелевантные взаимодействия и может быть использована в качестве базового ранкера в рекомендательной системе.

На следующем этапе модель применяется для ранжирования айтемов и используется как компонент гибридной рекомендательной системы.


In [None]:
# For every user, collect the set of item_ids that appear in their training
# interactions. The grouping key must be 'user_id': the previous 'user_it' was
# a typo and raised KeyError because no such column exists in train_sup.
user_candidates = (
    train_sup.groupby('user_id')['item_id'].apply(set)
)

