In [8]:
# %pip install service

In [9]:
# %pip install rectools

In [10]:
# %pip install implicit

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
import requests

from joblib import Parallel, delayed
from tqdm.auto import tqdm
from scipy.stats import mode
from pprint import pprint
from implicit.nearest_neighbours import ItemItemRecommender, CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings
import dill

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, Serendipity, MeanInvUserFreq, calc_metrics, Precision, Recall
from rectools.model_selection import TimeRangeSplitter

from typing import Dict, List
from collections import Counter

# from service.userknn import UserKnn 

In [12]:
# !wget -q https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip -O data_original.zip
# %unzip -o data_original.zip
# %rm data_original.zip

### Смотрим на данные

In [5]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [6]:
interactions_df = interactions.rename(columns={'total_dur': Columns.Weight,
                                            'last_watch_dt': Columns.Datetime})

interactions_df['datetime'] = pd.to_datetime(interactions_df['datetime'])

In [15]:
interactions = Interactions(interactions_df)

In [16]:
pd.concat([interactions_df.head(), interactions_df.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0
5476246,648596,12225,2021-08-13,76.0,0.0
5476247,546862,9673,2021-04-13,2308.0,49.0
5476248,697262,15297,2021-08-20,18307.0,63.0
5476249,384202,16197,2021-04-19,6203.0,100.0
5476250,319709,4436,2021-08-15,3921.0,45.0


In [17]:
print(f"Interactions dataframe shape: {interactions_df.shape}")
print(f"Unique users in interactions: {interactions_df['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions_df['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [18]:
max_date = interactions_df['datetime'].max()
min_date = interactions_df['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


### users

In [19]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0
3,721985,age_45_54,income_20_40,Ж,0
4,704055,age_35_44,income_60_90,Ж,0
840192,339025,age_65_inf,income_0_20,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1
840194,251008,,,,0
840195,590706,,,Ж,0
840196,166555,age_65_inf,income_20_40,Ж,0


In [20]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


In [21]:
pd.concat([items.head(2), items.tail(2)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
15961,4538,series,Среди камней,Darklands,2019.0,"драмы, спорт, криминал",Россия,0.0,18.0,,"Марк О’Коннор, Конор МакМахон","Дэйн Уайт О’Хара, Томас Кэйн-Бирн, Джудит Родд...",Семнадцатилетний Дэмиен мечтает вырваться за п...,"Среди, камней, 2019, Россия"
15962,3206,series,Гоша,,2019.0,комедии,Россия,0.0,16.0,,Михаил Миронов,"Мкртыч Арзуманян, Виктория Рунцова","Добродушный Гоша не может выйти из дома, чтобы...","Гоша, 2019, Россия"


In [22]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


### Разделяем по фолдам

In [23]:
# Number of splits for time-based cross-validation
N_SPLITS = 3

# Time duration for each test set in the cross-validation
TEST_SIZE = '7D'  # 7 days

# Initializing a generator for time-based cross-validation folds
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)


In [24]:
# Obtain the borders of test folds in the time-based cross-validation
test_fold_borders = cv.get_test_fold_borders(interactions)

### Выбираем модели и метрики

In [25]:
# Dictionary containing various recommendation metrics to evaluate the models
metrics = {
    'map@10': MAP(k=10),                # Mean Average Precision at 10
    'novelty': MeanInvUserFreq(k=10),   # Novelty based on the mean inverse user frequency at 10
    'serendipity': Serendipity(k=10),   # Serendipity at 10
    "precision@10": Precision(k=10),    # Precision at 10
    "recall@10": Recall(k=10),          # Recall at 10
}

# Dictionary containing several simple recommendation models for comparison
models = {
    'tfidf_userknn': TFIDFRecommender(),
    'bm25_userknn': BM25Recommender(),
}

# List of user count values for evaluation
N_users_values = [40, 50]

### Создаем класс с моделью

In [26]:
from typing import Dict
from collections import Counter

import pandas as pd
import numpy as np
import scipy as sp
from implicit.nearest_neighbours import ItemItemRecommender


class UserKnn():
    """Class for fit-perdict UserKNN model 
       based on ItemKNN model from implicit.nearest_neighbours
    """
    
    def __init__(self, model: ItemItemRecommender, N_users: int = 50):
        self.N_users = N_users
        self.model = model
        self.is_fitted = False
        
    def get_mappings(self, train):
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}
        
        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}
    
    def get_matrix(self, df: pd.DataFrame, 
                   user_col: str = 'user_id', 
                   item_col: str = 'item_id', 
                   weight_col: str = None, 
                   users_mapping: Dict[int, int] = None, 
                   items_mapping: Dict[int, int] = None):
    
        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        self.interaction_matrix = sp.sparse.coo_matrix((
            weights, 
            (
                df[item_col].map(self.items_mapping.get),
                df[user_col].map(self.users_mapping.get)
            )
            ))
        
        self.watched = df\
            .groupby(user_col, as_index=False)\
            .agg({item_col: list})\
            .rename(columns={user_col: 'sim_user_id'})
        
        return self.interaction_matrix
        
    def idf(self, n: int, x: float):
        return np.log((1 + n) / (1 + x) + 1)
        
    def _count_item_idf(self, df: pd.DataFrame):
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index', 
                                          columns=['doc_freq']).reset_index()
        item_idf['idf'] = item_idf['doc_freq'].apply(lambda x: self.idf(self.n, x))
        self.item_idf = item_idf 
    
    def fit(self, train: pd.DataFrame):
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train, 
                                              users_mapping=self.users_mapping, 
                                              items_mapping=self.items_mapping)
        
        self.n = train.shape[0]
        self._count_item_idf(train)
        
        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(self, model: ItemItemRecommender, user_mapping: Dict[int, int], 
                              user_inv_mapping: Dict[int, int], N: int):
        def _recs_mapper(user):
            user_id = self.users_mapping[user]
            users, sim = model.similar_items(user_id, N=N)
            return [self.users_inv_mapping[user] for user in users], sim
        return _recs_mapper
    
    def predict(self, test: pd.DataFrame, N_recs: int = 10):
        
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")
        
        mapper = self._generate_recs_mapper(
            model=self.user_knn, 
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        recs = pd.DataFrame({'user_id': test['user_id'].unique()})
        recs['sim_user_id'], recs['sim'] = zip(*recs['user_id'].map(mapper))
        recs = recs.set_index('user_id').apply(pd.Series.explode).reset_index()
        
        recs = recs[~(recs['user_id'] == recs['sim_user_id'])]\
            .merge(self.watched, on=['sim_user_id'], how='left')\
            .explode('item_id')\
            .sort_values(['user_id', 'sim'], ascending=False)\
            .drop_duplicates(['user_id', 'item_id'], keep='first')\
            .merge(self.item_idf, left_on='item_id', right_on='index', how='left')
        
        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1 
        return recs[recs['rank'] <= N_recs][['user_id', 'item_id', 'score', 'rank']]
    
    
    def make_recommendations(self, target_user_id: int, num_recommendations: int = 10):
        # проверка того, что модель натренерована
        if not self.is_fitted:
            raise ValueError("Please train the model before making predictions")

        # создаем внутренние id юзеров
        mapped_user_id = self.user_id_mapping.get(target_user_id)

        # Находим похожих фользователей и степень их похржести 
        similar_users, similarities = self.user_knn.find_similar_users(mapped_user_id, N=self.num_users)

        recommendations = []  # список для рекомендаций
        for sim_user, similarity in zip(similar_users, similarities):
            sim_user_id = self.inverse_user_id_mapping.get(sim_user)
            # Исключаем там где нет рекомендаций и рекомендации только с сами мобой 
            if sim_user_id is not None and sim_user_id != target_user_id:
                # item, которые смотрели похожие юзеры
                watched_items = self.user_history.loc[self.user_history['sim_user_id'] == sim_user_id, 'item_id'].values
                watched_items_flat = np.concatenate(watched_items)
                watched_items_flat = np.unique(watched_items_flat[watched_items_flat != None])

                recommendations.extend(filter(None, map(self.item_mapping.get, watched_items_flat)))

        # возвращаем заданное количество рекомендаций 
        recommendations = list(set(recommendations))[:num_recommendations]
        return recommendations
        
    
    

In [28]:
%%time

results = []

# Generate time-based cross-validation folds
fold_iterator = cv.split(interactions, collect_fold_stats=True)

# итерация по каждому фолду
for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    # трэйн и тест для конкретного фолда
    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    # список уникальных item в данном каталоге
    catalog = df_train[Columns.Item].unique()
    
    for N_users in N_users_values:
        for model_name, model in models.items():
            # создаем и обучанием модель с указанынми параметрамии 
            userknn_model = UserKnn(model=model, N_users=N_users)
            userknn_model.fit(df_train)

            # генируем рекомандации для тестовой выборки
            recos = userknn_model.predict(df_test)

            # рассчитываем метрики 
            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            fold = {"fold": i_fold, "model": model_name, "N_users": N_users}
            fold.update(metric_values)
            results.append(fold)


{'end': Timestamp('2021-08-09 00:00:00'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}




  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]



  0%|          | 0/797423 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-16 00:00:00'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}




  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]



  0%|          | 0/850489 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}




  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]



  0%|          | 0/906071 [00:00<?, ?it/s]

CPU times: user 3h 33s, sys: 1min 40s, total: 3h 2min 14s
Wall time: 7h 2min


In [29]:
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,N_users,precision@10,recall@10,map@10,novelty,serendipity
0,0,tfidf_userknn,40,0.006922,0.035573,0.006773,7.573736,6.1e-05
1,0,bm25_userknn,40,0.003119,0.014228,0.002784,9.199335,9e-05
2,0,tfidf_userknn,50,0.006922,0.035573,0.006773,7.573736,6.1e-05
3,0,bm25_userknn,50,0.003119,0.014228,0.002784,9.199335,9e-05
4,1,tfidf_userknn,40,0.00641,0.033086,0.006279,7.631293,6.6e-05
5,1,bm25_userknn,40,0.003011,0.013753,0.002679,9.28558,0.000102
6,1,tfidf_userknn,50,0.00641,0.033086,0.006279,7.631293,6.6e-05
7,1,bm25_userknn,50,0.003011,0.013753,0.002679,9.28558,0.000102
8,2,tfidf_userknn,40,0.006257,0.031099,0.005949,7.710766,7e-05
9,2,bm25_userknn,40,0.00302,0.013504,0.002633,9.364591,0.000105


In [30]:
model_metrics_mean = df_metrics.groupby(['model', 'N_users']).mean()[metrics.keys()]
model_metrics_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,map@10,novelty,serendipity,precision@10,recall@10
model,N_users,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bm25_userknn,40,0.002699,9.283169,9.9e-05,0.00305,0.013828
bm25_userknn,50,0.002699,9.283169,9.9e-05,0.00305,0.013828
tfidf_userknn,40,0.006334,7.638598,6.5e-05,0.006529,0.033253
tfidf_userknn,50,0.006334,7.638598,6.5e-05,0.006529,0.033253


In [31]:
userknn = UserKnn(model=TFIDFRecommender(), N_users=50)
userknn.fit(interactions.df)



  0%|          | 0/962179 [00:00<?, ?it/s]

In [32]:
# Сохраняем модель 
with open('userknn.dill', 'wb') as f:
    dill.dump(userknn, f)

In [10]:
# загрузка модели из сохраненного файла
with open('userknn.dill', 'rb') as f:
    userknn = dill.load(f)

In [13]:
predict = userknn.predict(interactions_df)

In [15]:
predict.head(3)

Unnamed: 0,user_id,item_id,score,rank
2,1097557,3182,6.137841,1
0,1097557,4151,4.111983,2
1,1097557,15297,3.379502,3


формирование результатов для cold users 

In [1]:
from rectools.models.popular import PopularModel

In [7]:
dataset = Dataset.construct(
    interactions_df=interactions_df,
    user_features_df=None,
    item_features_df=None
)

popular_model = PopularModel()
popular_model.fit(dataset)

item_inv = dict(enumerate(interactions_df["item_id"].unique()))
recos_pop = []
for item_pop in popular_model.popularity_list[0]:
    recos_pop.append(item_inv[item_pop])

df_pop_recos = pd.DataFrame({"item_id": recos_pop})

df_pop_recos.to_csv("popular_item.csv", index=False)

In [16]:
df_pop_recos

Unnamed: 0,item_id
0,10440
1,15297
2,9728
3,13865
4,4151
...,...
15701,8076
15702,8954
15703,15664
15704,818
