##### import libs

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import lightfm
import itertools
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

In [2]:
from utils import evaluate, load_data

In [3]:
from feature_processor import like_dislike_group_features

In [4]:
pd.set_option('display.max_rows', 100)

##### read data

In [49]:
user_item_data, user_meta_data, item_meta_data, test_pairs_data = load_data()

In [6]:
user_item_data = user_item_data.merge(
    right=item_meta_data.drop(columns="embeddings"),
    on="item_id",
    how="left",
)

In [7]:
test_pairs_data = test_pairs_data.merge(
    right=item_meta_data.drop(columns="embeddings"),
    on="item_id",
    how="left",
)

## group features

#### views and feedback grouped by user_id

In [8]:
user_group_features = like_dislike_group_features(
    user_item_data=user_item_data,
    group_col="user_id"
)

In [9]:
user_group_features.head(3)

Unnamed: 0_level_0,num_of_views_by_user_id,num_of_likes_by_user_id,ratio_of_likes_by_user_id,num_of_dislikes_by_user_id,ratio_of_dislikes_by_user_id
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,368,25,0.067935,0,0.0
1,203,13,0.064039,0,0.0
2,127,6,0.047244,0,0.0


In [9]:
user_item_data = user_item_data.merge(
    right=user_group_features,
    on="user_id",
    how="left",
)

In [13]:
test_pairs_data = test_pairs_data.merge(
    right=user_group_features,
    on="user_id",
    how="left",
)

#### views and feedback grouped by item_id

In [15]:
item_group_features = like_dislike_group_features(
    user_item_data=user_item_data,
    group_col="item_id"
)

In [16]:
item_group_features.head(3)

Unnamed: 0_level_0,num_of_views_by_item_id,num_of_likes_by_item_id,ratio_of_likes_by_item_id,num_of_dislikes_by_item_id,ratio_of_dislikes_by_item_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,51,3,0.058824,0,0.0
1,43,3,0.069767,0,0.0
2,63,5,0.079365,0,0.0


In [15]:
user_item_data = user_item_data.merge(
    right=item_group_features,
    on="item_id",
    how="left",
)

In [16]:
test_pairs_data = test_pairs_data.merge(
    right=item_group_features,
    on="item_id",
    how="left",
)

#### source_id-specific views and feedback grouped by user_id

In [18]:
user_view_counts_by_source_id = user_item_data.groupby(by="user_id").source_id.apply(lambda x: x.value_counts())
user_like_counts_by_source_id = user_item_data[user_item_data.like==1].groupby(by="user_id").source_id.apply(lambda x: x.value_counts())

и теперь нужно сделать нормальный мерж этих данных в user_item_data

 - BANNED user_viewed_this_source_id - 1/0
 - BANNED user_liked_this_source_id - 1/0
 - DONE user_view_counts_by_source_id - int
 - DONE user_like_counts_by_source_id - int
 - DONE user_view_counts_by_source_id_ratio_to_views - float[0,1] - доля просмотров пользователем этого автора среди всех просмотров пользователя
 - DONE user_like_counts_by_source_id_ratio_to_likes - float[0,1] - доля лайков пользователем этого автора среди всех лайков пользователя
 - DONE user_like_counts_by_source_id_ratio_to_views - float[0,1] - доля лайков среди просмотров пользователем этого атвора 

In [19]:
user_view_counts_by_source_id_dict = user_view_counts_by_source_id.to_dict()

user_item_data["user_view_counts_by_source_id"] = [
    user_view_counts_by_source_id_dict.get((user_id, source_id), 0)
    for user_id, source_id in zip(user_item_data['user_id'], user_item_data['source_id'])
]

In [22]:
test_pairs_data["user_view_counts_by_source_id"] = [
    user_view_counts_by_source_id_dict.get((user_id, source_id), 0)
    for user_id, source_id in zip(test_pairs_data['user_id'], test_pairs_data['source_id'])
]

In [24]:
user_like_counts_by_source_id_dict = user_like_counts_by_source_id.to_dict()

In [None]:

user_item_data["user_like_counts_by_source_id"] = [
    user_like_counts_by_source_id_dict.get((user_id, source_id), 0)
    for user_id, source_id in zip(user_item_data['user_id'], user_item_data['source_id'])
]

In [25]:
test_pairs_data["user_like_counts_by_source_id"] = [
    user_like_counts_by_source_id_dict.get((user_id, source_id), 0)
    for user_id, source_id in zip(test_pairs_data['user_id'], test_pairs_data['source_id'])
]

In [None]:
user_item_data.user_like_counts_by_source_id = user_item_data.user_like_counts_by_source_id - user_item_data.like
user_item_data.user_view_counts_by_source_id = user_item_data.user_view_counts_by_source_id - 1

In [27]:
test_pairs_data.user_like_counts_by_source_id = test_pairs_data.user_like_counts_by_source_id
test_pairs_data.user_view_counts_by_source_id = test_pairs_data.user_view_counts_by_source_id

In [None]:
user_item_data["user_view_counts_by_source_id_ratio_to_views"] = user_item_data["user_view_counts_by_source_id"] / user_item_data["num_of_views_by_user"]
user_item_data["user_like_counts_by_source_id_ratio_to_views"] = user_item_data["user_like_counts_by_source_id"] / user_item_data["user_view_counts_by_source_id"]
user_item_data["user_like_counts_by_source_id_ratio_to_likes"] = user_item_data["user_like_counts_by_source_id"] / user_item_data["num_of_likes_by_user"]

user_item_data["user_view_counts_by_source_id_ratio_to_views"] = user_item_data["user_view_counts_by_source_id_ratio_to_views"].fillna(0)
user_item_data["user_like_counts_by_source_id_ratio_to_views"] = user_item_data["user_like_counts_by_source_id_ratio_to_views"].fillna(0)
user_item_data["user_like_counts_by_source_id_ratio_to_likes"] = user_item_data["user_like_counts_by_source_id_ratio_to_likes"].fillna(0)

In [28]:
test_pairs_data["user_view_counts_by_source_id_ratio_to_views"] = test_pairs_data["user_view_counts_by_source_id"] / test_pairs_data["num_of_views_by_user"]
test_pairs_data["user_like_counts_by_source_id_ratio_to_views"] = test_pairs_data["user_like_counts_by_source_id"] / test_pairs_data["user_view_counts_by_source_id"]
test_pairs_data["user_like_counts_by_source_id_ratio_to_likes"] = test_pairs_data["user_like_counts_by_source_id"] / test_pairs_data["num_of_likes_by_user"]

test_pairs_data["user_view_counts_by_source_id_ratio_to_views"] = test_pairs_data["user_view_counts_by_source_id_ratio_to_views"].fillna(0)
test_pairs_data["user_like_counts_by_source_id_ratio_to_views"] = test_pairs_data["user_like_counts_by_source_id_ratio_to_views"].fillna(0)
test_pairs_data["user_like_counts_by_source_id_ratio_to_likes"] = test_pairs_data["user_like_counts_by_source_id_ratio_to_likes"].fillna(0)

In [19]:
user_item_data = user_item_data.astype({
    "user_like_counts_by_source_id": np.int16,
    "user_view_counts_by_source_id": np.int16,
    "user_view_counts_by_source_id_ratio_to_views": np.float32,
    "user_like_counts_by_source_id_ratio_to_views": np.float32,
    "user_like_counts_by_source_id_ratio_to_likes": np.float32,
})

In [29]:
test_pairs_data = test_pairs_data.astype({
    "user_like_counts_by_source_id": np.int16,
    "user_view_counts_by_source_id": np.int16,
    "user_view_counts_by_source_id_ratio_to_views": np.float32,
    "user_like_counts_by_source_id_ratio_to_views": np.float32,
    "user_like_counts_by_source_id_ratio_to_likes": np.float32,
})

In [30]:
user_item_data.head(3)

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,source_id,duration,...,num_of_likes_by_user,ratio_of_likes_by_user,num_of_dislikes_by_user,ratio_of_dislikes_by_user,num_of_views_by_item,num_of_likes_by_item,ratio_of_likes_by_item,num_of_dislikes_by_item,ratio_of_dislikes_by_item,user_view_counts_by_source_id
0,3810,138979,6,0,0,0,0,0,4278,54,...,3,0.001268,0,0.0,629,37,0.058824,0,0.0,18
1,101874,331160,6,0,0,0,0,0,2049,6,...,0,0.0,0,0.0,50,6,0.12,0,0.0,1
2,150332,73709,11,0,0,0,0,0,16375,16,...,55,0.035347,1,0.000643,6760,52,0.007692,5,0.00074,1


In [31]:
test_pairs_data.head(3)

Unnamed: 0,user_id,item_id,num_of_views_by_user,num_of_likes_by_user,ratio_of_likes_by_user,num_of_dislikes_by_user,ratio_of_dislikes_by_user,num_of_views_by_item,num_of_likes_by_item,ratio_of_likes_by_item,num_of_dislikes_by_item,ratio_of_dislikes_by_item,source_id,duration,user_view_counts_by_source_id,user_like_counts_by_source_id,user_view_counts_by_source_id_ratio_to_views,user_like_counts_by_source_id_ratio_to_views,user_like_counts_by_source_id_ratio_to_likes
0,1,7363,203,13,0.064039,0,0.0,15659,2308,0.147391,1,6.4e-05,5119,9,0,0,0.0,0.0,0.0
1,1,73770,203,13,0.064039,0,0.0,6091,1110,0.182236,1,0.000164,2720,5,0,0,0.0,0.0,0.0
2,1,75700,203,13,0.064039,0,0.0,3126,379,0.121241,1,0.00032,11601,32,0,0,0.0,0.0,0.0


In [21]:
user_item_data.to_parquet("data/user_item_data_w_group_features.parquet")

In [33]:
test_pairs_data.to_parquet("data/test_pairs_data_w_group_features.parquet")

: 

## lag features

In [None]:
def compute_rolling_features(df,
                             group_col,
                             target_col,
                             lags,
                             windows,
                             agg_funcs):
    """
    Computes rolling lag, window and expanding window features for a target column within groups of a specified column.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        group_col (str): Column name to group by.
        target_col (str): Column name for which lagged features are computed.
        lags (list): List of lag steps to compute.
        windows (list): List of window sizes to compute.
        agg_funcs (list): List of functions to be used for aggregation.

    Returns:
        pd.DataFrame: DataFrame with additional rolling features.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    result_df = df.copy()
    
    #########
    result_df.loc[-int(len(result_df)*0.15):].like = 0
    #########

    # Group by the specified column and apply shifting for each lag
    grouped = result_df.groupby(group_col, as_index=False)
    for lag in tqdm(lags):
        result_df[f"{target_col}_lag_{lag}"] = grouped[target_col].shift(lag).fillna(0).astype(np.int8)

    for window in tqdm(windows):
        for agg_func in agg_funcs:
            result_df[f"{target_col}_window_{window}_agg_{agg_func}"] = grouped[target_col].rolling(window=window, min_periods=0).agg(agg_func).like.astype(np.int16)
    
    return result_df

In [65]:
def compute_lag(
        train_df,
        val_df,
        group_col,
        target_col,
        lag,
        ):
    grouped = train_df.groupby(group_col, as_index=False)
    lag_col_name = f"{target_col}_lag_{lag}"
    train_df[lag_col_name] = grouped[target_col].shift(lag).fillna(0).astype(np.int16)
    grouped = train_df.groupby(group_col, as_index=False)
    last_lag_by_user_id = grouped[lag_col_name].agg("last")
    val_df = val_df.merge(
        right=last_lag_by_user_id,
        on=group_col,
        how="left"
    )
    return train_df, val_df

In [54]:
grouped = user_item_data.groupby("user_id", as_index=False)

In [55]:
user_item_data["like_lag_1"] = grouped["like"].shift(1).fillna(0).astype(np.int16)
grouped = user_item_data.groupby("user_id", as_index=False)

In [57]:
last_like_lag_1_by_user_id = grouped.like_lag_1.agg("last")

In [58]:
train_df, val_df = train_test_split(
    user_item_data,
    test_size=0.15,
    random_state=42,
    shuffle=False,
)

In [60]:
val_df = val_df.drop(columns=["like_lag_1"]).merge(
    right=last_like_lag_1_by_user_id,
    on="user_id",
    how="left"
)

In [63]:
train_df.corr()

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,like_lag_1
user_id,1.0,0.001844,-0.04151,0.040867,-0.001346,0.002052,0.007363,0.040796,0.040854
item_id,0.001844,1.0,-0.005022,0.000779,-2.4e-05,0.00084,-8e-06,0.000777,0.002375
timespent,-0.04151,-0.005022,1.0,0.043284,0.005536,0.067186,0.021156,0.042557,-0.025944
like,0.040867,0.000779,0.043284,1.0,-0.004356,0.133063,0.064381,0.995601,0.280107
dislike,-0.001346,-2.4e-05,0.005536,-0.004356,1.0,-0.000258,0.003142,-0.098034,-0.000692
share,0.002052,0.00084,0.067186,0.133063,-0.000258,1.0,0.034109,0.132447,0.038528
bookmarks,0.007363,-8e-06,0.021156,0.064381,0.003142,0.034109,1.0,0.063777,0.020155
explicit,0.040796,0.000777,0.042557,0.995601,-0.098034,0.132447,0.063777,1.0,0.278825
like_lag_1,0.040854,0.002375,-0.025944,0.280107,-0.000692,0.038528,0.020155,0.278825,1.0


In [64]:
val_df.corr()

Unnamed: 0,user_id,item_id,timespent,like,dislike,share,bookmarks,explicit,like_lag_1
user_id,1.0,-9.5e-05,-0.042766,0.03981,-0.001179,0.00052,0.008495,0.03973,0.03227
item_id,-9.5e-05,1.0,-0.005194,0.001415,0.00018,0.001174,-0.000414,0.001391,0.004571
timespent,-0.042766,-0.005194,1.0,0.03916,0.006221,0.064452,0.020738,0.038395,-0.028442
like,0.03981,0.001415,0.03916,1.0,-0.004547,0.127442,0.069273,0.995664,0.221687
dislike,-0.001179,0.00018,0.006221,-0.004547,1.0,-0.000566,0.004025,-0.097547,-9.7e-05
share,0.00052,0.001174,0.064452,0.127442,-0.000566,1.0,0.036677,0.126888,0.028943
bookmarks,0.008495,-0.000414,0.020738,0.069273,0.004025,0.036677,1.0,0.068569,0.018213
explicit,0.03973,0.001391,0.038395,0.995664,-0.097547,0.126888,0.068569,1.0,0.220641
like_lag_1,0.03227,0.004571,-0.028442,0.221687,-9.7e-05,0.028943,0.018213,0.220641,1.0
