In [1]:
import warnings
# Blanket filter: no category, message, or module restriction is given,
# so every warning raised after this cell is suppressed.
warnings.filterwarnings("ignore")

In [9]:
import typing as tp

import dill
import pandas as pd
import numpy as np
from implicit.nearest_neighbours import BM25Recommender, TFIDFRecommender
from rectools import Columns
import scipy as sp

В ноутбуке "HW-3.3" c помощью стратегии валидации по неделям были отобраны несколько моделей с наиболее высокими метриками:

- BM25Recommender с гиперпараметром k = 60
- TFIDFRecommender с гиперпараметром k = 60

Для этих моделей сформированы офлайн-рекомендации, которые показали значения метрики 0.10384918 и 0.09577425 соответственно.

Для формирования онлайн-рекомендаций следует обучить те же архитектуры моделей с такими же гиперпараметрами из библиотеки implicit.

In [3]:
# Load the raw interaction log and rename its columns to the rectools names
# in one chained expression (no in-place mutation of the loaded frame).
interactions = (
    pd.read_csv('../data/kion_train/interactions.csv')
    .rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight})
)

# Parse the timestamp column into real datetime values.
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

# Create train data

In [4]:
# Build contiguous internal ids for users and items.
# *_inv_mapping: internal index -> external id; *_mapping: external id -> internal index.
users_inv_mapping = {idx: user for idx, user in enumerate(interactions['user_id'].unique())}
users_mapping = {user: idx for idx, user in users_inv_mapping.items()}
items_inv_mapping = {idx: item for idx, item in enumerate(interactions['item_id'].unique())}
items_mapping = {item: idx for idx, item in items_inv_mapping.items()}
print(f"Unique users: {len(users_inv_mapping)}")
print(f"Unique items: {len(items_inv_mapping)}")

Unique users: 962179
Unique items: 15706


In [5]:
def get_matrix(
    df: pd.DataFrame,
    user_col: str = Columns.User,
    item_col: str = Columns.Item,
    weight_col: tp.Optional[str] = None,
    users_mapping: tp.Optional[tp.Dict[int, int]] = None,
    items_mapping: tp.Optional[tp.Dict[int, int]] = None
):
    """Build a sparse COO interaction matrix with users as rows and items as columns.

    Parameters
    ----------
    df : pd.DataFrame
        Interactions, one row per (user, item) event.
    user_col, item_col : str
        Names of the columns holding external user / item ids.
    weight_col : str, optional
        Column whose values (cast to float32) become the matrix entries.
        When omitted, every interaction gets weight 1.0.
    users_mapping, items_mapping : dict
        External id -> internal row / column index. Both are required in
        practice: the ``None`` defaults are kept only for signature
        compatibility and fail on the first ``.get`` access.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix whose shape is inferred from the largest row / column index
        present in ``df``.

    Raises
    ------
    ValueError
        If ``df`` contains a user or item id that is absent from the
        corresponding mapping.
    """
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    row_idx = df[user_col].map(users_mapping.get)
    col_idx = df[item_col].map(items_mapping.get)

    # dict.get returns None for unknown ids, which pandas turns into NaN;
    # fail early with a clear message instead of an obscure error while
    # the sparse matrix is being built.
    if row_idx.isna().any() or col_idx.isna().any():
        raise ValueError(
            "df contains user or item ids that are missing from users_mapping/items_mapping"
        )

    return sp.sparse.coo_matrix((weights, (row_idx, col_idx)))

In [6]:
# weight_col is left at its default (None), so every interaction is stored
# with weight 1.0; the `total_dur` column renamed to Columns.Weight is not used.
weight_matrix = get_matrix(
    df=interactions,
    users_mapping=users_mapping,
    items_mapping=items_mapping
)

# Models train

In [7]:
# Both recommenders use K=60, the neighbourhood size selected during the
# weekly validation summarised in the introduction of this notebook.
model_implicit_tfidf = TFIDFRecommender(K=60)
model_implicit_bmp25 = BM25Recommender(K=60)

In [8]:
# Fit on the matrix built by get_matrix, whose rows are users; the progress-bar
# total below matches the number of unique users.
model_implicit_tfidf.fit(weight_matrix)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [10]:
# Serialize the fitted TF-IDF model with dill for the service.
# The target directory must already exist: open() does not create it.
with open("../service/weights/userKNN/tfidf-k60-implicit.dill", "wb") as file:
    dill.dump(model_implicit_tfidf, file)

In [11]:
# Fit the BM25 model on the same matrix (rows are users) as the TF-IDF model.
model_implicit_bmp25.fit(weight_matrix)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [12]:
# Serialize the fitted BM25 model with dill for the service.
# The target directory must already exist: open() does not create it.
with open("../service/weights/userKNN/bmp25-k60-implicit.dill", "wb") as file:
    dill.dump(model_implicit_bmp25, file)

# Reco for one user

In [66]:
# One row per user (index: user_id) holding the list of item ids the user interacted with.
watched = interactions.groupby('user_id')[['item_id']].agg(list)
# Preview the first five and the last five users.
pd.concat([watched.iloc[:5], watched.iloc[-5:]])

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[7102, 14359, 15297, 6006, 9728, 12192]"
1,"[3669, 10440]"
2,"[7571, 3541, 15266, 13867, 12841, 10770, 4475,..."
3,"[12192, 9728, 16406, 15719, 10440, 3475, 2025,..."
4,"[4700, 6317]"
1097553,"[24, 13058, 12463, 12659]"
1097554,"[16361, 496, 1053, 11275, 4580, 1151, 849, 350..."
1097555,"[14703, 140, 9728, 496, 6916, 4662, 4880]"
1097556,[12812]
1097557,"[4151, 3182, 15297]"


In [95]:
def recs_mapper(user, model, user_mapping, user_inv_mapping, k_reco: int = 10, bmp: bool = False):
    """Return the users most similar to ``user`` according to a fitted model.

    Parameters
    ----------
    user
        External id of the query user.
    model
        Fitted model exposing ``similar_items(index, N=...)``. The result is
        iterated as ``(internal_index, similarity)`` pairs.
        NOTE(review): this matches the list-of-tuples return of older
        ``implicit`` releases; confirm the installed version.
    user_mapping : dict
        External user id -> internal index.
    user_inv_mapping : dict
        Internal index -> external user id.
    k_reco : int
        Number of neighbours requested from the model. The query user is
        filtered out afterwards, so fewer rows may be returned.
    bmp : bool
        Selects how the query user is removed from the result: by id when
        True, by dropping every row with ``sim >= 1`` when False.

    Returns
    -------
    pd.DataFrame
        Columns ``sim_user_id`` (external ids) and ``sim`` (similarity), in
        the order returned by the model and with the model's row positions
        kept as the index.
    """
    user_id = user_mapping[user]
    recs = model.similar_items(user_id, N=k_reco)
    result = pd.DataFrame(
        {
            # A distinct loop name avoids shadowing the `user` argument.
            "sim_user_id": [user_inv_mapping[neighbour] for neighbour, _ in recs],
            "sim": [sim for _, sim in recs]
        }
    )

    if bmp:
        return result[result['sim_user_id'] != user]
    return result[~(result['sim'] >= 1)]

In [96]:
# Fixed seed so the sampled users, and every output derived from them below,
# are reproducible on Restart & Run All.
sample_users = interactions[Columns.User].sample(100, random_state=42).tolist()

In [99]:
# Similar users for the first sampled user according to the TF-IDF model.
# bmp is left at False, so rows with sim >= 1 are dropped from the result.
print(sample_users[0])
df_sim = recs_mapper(sample_users[0], model_implicit_tfidf, users_mapping, users_inv_mapping)
df_sim

12861


Unnamed: 0,sim_user_id,sim
1,737239,0.479295
2,1069558,0.427898
3,933371,0.419511
4,409850,0.391727
5,989253,0.384045
6,817636,0.380609
7,1078420,0.372851
8,163595,0.370077
9,1003783,0.368852


In [100]:
# Attach each similar user's watched items (one row per item after explode),
# order by similarity, and keep each item once: the row from the most similar
# user who watched it.
df_sim = (
    df_sim
    .merge(watched, how='left', left_on='sim_user_id', right_on='user_id')
    .explode('item_id')
    .sort_values('sim', ascending=False)
    .drop_duplicates(subset='item_id', keep='first')
)
df_sim

Unnamed: 0,sim_user_id,sim,item_id
0,737239,0.479295,10755
0,737239,0.479295,496
0,737239,0.479295,12324
0,737239,0.479295,10219
0,737239,0.479295,6898
0,737239,0.479295,14476
0,737239,0.479295,13411
0,737239,0.479295,9194
0,737239,0.479295,6404
0,737239,0.479295,14961


In [101]:
# Similar users for the same sampled user according to the BM25 model.
# bmp=True removes the query user by id instead of by a similarity threshold.
print(sample_users[0])
df_sim = recs_mapper(sample_users[0], model_implicit_bmp25, users_mapping, users_inv_mapping, bmp=True)
df_sim

12861


Unnamed: 0,sim_user_id,sim
1,737239,85.368217
2,1006216,75.500227
3,933371,71.779085
4,989253,71.633147
5,1069558,71.248461
6,124735,69.342326
7,1078420,67.839079
8,289854,67.379224
9,409850,66.214999


In [102]:
# Attach each similar user's watched items (one row per item after explode),
# order by similarity, and keep each item once: the row from the most similar
# user who watched it.
df_sim = (
    df_sim
    .merge(watched, how='left', left_on='sim_user_id', right_on='user_id')
    .explode('item_id')
    .sort_values('sim', ascending=False)
    .drop_duplicates(subset='item_id', keep='first')
)
df_sim

Unnamed: 0,sim_user_id,sim,item_id
0,737239,85.368217,10755
0,737239,85.368217,14961
0,737239,85.368217,6404
0,737239,85.368217,9194
0,737239,85.368217,13411
0,737239,85.368217,496
0,737239,85.368217,6898
0,737239,85.368217,10219
0,737239,85.368217,14476
0,737239,85.368217,12324
