# Установка библиотек

In [1]:
!pip install rectools

Collecting rectools
  Downloading rectools-0.4.1-py3-none-any.whl (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.0/99.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting implicit<0.8.0,>=0.7.1 (from rectools)
  Downloading implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
Collecting typeguard<3.0.0,>=2.0.1 (from rectools)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Installing collected packages: typeguard, implicit, rectools
Successfully installed implicit-0.7.2 rectools-0.4.1 typeguard-2.13.3


In [2]:
import pandas as pd
import numpy as np
import zipfile as zf

import requests
from tqdm.auto import tqdm

from rectools import Columns
from rectools.models import PopularModel
from rectools.model_selection import TimeRangeSplitter
from rectools.metrics import MAP,Serendipity,MeanInvUserFreq, calc_metrics
from rectools.dataset import Interactions, Dataset

from implicit.nearest_neighbours import CosineRecommender,ItemItemRecommender,BM25Recommender,TFIDFRecommender



# Чтение данных

In [3]:
# Archive with the KION dataset; the link points at a fixed commit of the
# repository, so the downloaded data does not change between runs.
url = (
    'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
)


In [4]:
# Stream the archive to disk in 1 MiB chunks. The context managers close the
# HTTP connection, the output file and the progress bar even if the transfer
# fails half-way.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on 4xx/5xx instead of saving an error page as 'kion.zip'.
    req.raise_for_status()
    # A missing Content-Length becomes None so tqdm shows an open-ended counter
    # rather than a bar with a total of zero.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0)) or None
    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [5]:


# Unpack the downloaded archive into the working directory; the `with` block
# closes the archive handle even if extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()


In [6]:
# Load the interaction log and rename the dataset-specific columns to the
# column names exposed by rectools' `Columns` in one chained expression.
interactions_df = (
    pd.read_csv('data_original/interactions.csv', parse_dates=["last_watch_dt"])
    .rename(columns={'last_watch_dt': Columns.Datetime, 'total_dur': Columns.Weight})
)

In [7]:
# User and item tables from the extracted archive, read with default options.
users, items = (
    pd.read_csv(csv_path)
    for csv_path in ('data_original/users.csv', 'data_original/items.csv')
)

In [8]:
# Wrap the renamed frame in rectools' Interactions container; its `.df`
# attribute is the training frame passed to the model further down.
interactions = Interactions(interactions_df)
# Preview the first rows as a quick check of the renamed columns.
interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


# Доработка модели

In [15]:
from typing import Dict
from collections import Counter

import pandas as pd
import numpy as np
import scipy as sp
from rectools.models import PopularModel


class UserKnn():
    """User-based kNN recommender built on an item-based kNN model.

    The wrapped model comes from ``implicit.nearest_neighbours``. It is fitted
    on an item x user matrix, so what the model treats as "items" are really
    users and ``similar_items`` returns the most similar users. Recommendations
    for a user are the items of those neighbours, scored as
    ``similarity * idf(item)``.

    A ``PopularModel`` is fitted next to the kNN model; ``recommend`` uses it
    to pad short lists and to serve users that were not seen during training.
    """

    def __init__(self, model: "ItemItemRecommender", N_users: int = 50):
        """Store the underlying kNN model and the number of neighbours to use.

        Args:
            model: unfitted item-item recommender from ``implicit``.
            N_users: how many similar users are requested per target user.
        """
        self.N_users = N_users
        self.model = model
        self.is_fitted = False
        self.popular_model = PopularModel()

    def get_mappings(self, train):
        """Build external id <-> dense index mappings for users and items.

        Indices follow the order of first appearance in ``train``.
        """
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(self, df: pd.DataFrame,
                   user_col: str = 'user_id',
                   item_col: str = 'item_id',
                   weight_col: str = None,
                   users_mapping: Dict[int, int] = None,
                   items_mapping: Dict[int, int] = None):
        """Build the sparse item x user interaction matrix and the watch lists.

        Args:
            df: interactions with at least ``user_col`` and ``item_col``.
            user_col: name of the user id column.
            item_col: name of the item id column.
            weight_col: optional column with interaction weights; when it is
                not given every interaction gets weight 1.
            users_mapping: external user id -> column index; defaults to the
                mapping built by ``get_mappings``.
            items_mapping: external item id -> row index; defaults to the
                mapping built by ``get_mappings``.

        Returns:
            ``scipy.sparse.coo_matrix`` with items as rows and users as columns.
            The matrix is also stored in ``self.interaction_matrix`` and the
            per-user item lists in ``self.watched``.
        """
        # Honour explicitly passed mappings; fall back to the fitted ones.
        if users_mapping is None:
            users_mapping = self.users_mapping
        if items_mapping is None:
            items_mapping = self.items_mapping

        if weight_col:
            weights = df[weight_col].astype(np.float32)
        else:
            weights = np.ones(len(df), dtype=np.float32)

        self.interaction_matrix = sp.sparse.coo_matrix((
            weights,
            (
                df[item_col].map(items_mapping.get),
                df[user_col].map(users_mapping.get)
            )
            ))

        # One row per user with the list of items they interacted with; the
        # user column is renamed so it can be joined to neighbours in predict.
        self.watched = df\
            .groupby(user_col, as_index=False)\
            .agg({item_col: list})\
            .rename(columns={user_col: 'sim_user_id'})

        return self.interaction_matrix

    def idf(self, n: int, x: float):
        """Smoothed inverse document frequency: ``log((1 + n) / (1 + x) + 1)``."""
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        """Count interactions per item and store their idf in ``self.item_idf``.

        The resulting frame has columns ``index`` (item id), ``doc_freq`` and
        ``idf``. ``self.n`` must be set before the call.
        """
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index',
                                          columns=['doc_freq']).reset_index()
        # np.log is vectorised, so the whole column is processed in one call.
        item_idf['idf'] = self.idf(self.n, item_idf['doc_freq'])
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        """Fit the kNN model, the item idf table and the popular fallback.

        Args:
            train: interactions frame; it is also passed to
                ``Dataset.construct`` and therefore must satisfy that
                function's column requirements.
        """
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train,
                                              users_mapping=self.users_mapping,
                                              items_mapping=self.items_mapping)

        # n is the total number of interactions in the training frame.
        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.popular_model.fit(Dataset.construct(train))
        self.is_fitted = True

    def _generate_recs_mapper(self, model: "ItemItemRecommender", user_mapping: Dict[int, int],
                              user_inv_mapping: Dict[int, int], N: int):
        """Return a function mapping an external user id to its neighbours.

        The returned callable yields ``(similar_user_ids, similarities)`` where
        the ids are external ids looked up through ``user_inv_mapping``.
        """
        def _recs_mapper(user):
            user_idx = user_mapping[user]
            neighbour_idx, sim = model.similar_items(user_idx, N=N)
            return [user_inv_mapping[idx] for idx in neighbour_idx], sim
        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10):
        """Recommend up to ``N_recs`` items for every known user in ``test``.

        Users that are absent from the training data are skipped, so they
        simply get no rows in the result.

        Args:
            test: frame with a ``user_id`` column.
            N_recs: maximum number of rows returned per user.

        Returns:
            DataFrame with columns ``user_id``, ``item_id``, ``score``, ``rank``.

        Raises:
            ValueError: if the model has not been fitted.
        """
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        mapper = self._generate_recs_mapper(
            model=self.user_knn,
            user_mapping=self.users_mapping,
            user_inv_mapping=self.users_inv_mapping,
            N=self.N_users
        )

        result_columns = ['user_id', 'item_id', 'score', 'rank']

        # Cold users have no index in the similarity model; asking for their
        # neighbours would raise KeyError, so they are filtered out here.
        known_users = [user for user in test['user_id'].unique() if user in self.users_mapping]
        if not known_users:
            return pd.DataFrame(columns=result_columns)

        # Build the (user, neighbour, similarity) table directly in long format
        # so that every column keeps a plain numeric dtype.
        neighbours = [mapper(user) for user in known_users]
        recs = pd.DataFrame({
            'user_id': np.repeat(known_users, [len(sim_users) for sim_users, _ in neighbours]),
            'sim_user_id': [sim_user for sim_users, _ in neighbours for sim_user in sim_users],
            'sim': np.concatenate([np.asarray(sim) for _, sim in neighbours]),
        })

        # Drop the user itself, attach neighbours' items and keep, for every
        # (user, item) pair, the row coming from the most similar neighbour.
        recs = recs[recs['user_id'] != recs['sim_user_id']]\
            .merge(self.watched, on=['sim_user_id'], how='left')\
            .explode('item_id')\
            .sort_values(['user_id', 'sim'], ascending=False)\
            .drop_duplicates(['user_id', 'item_id'], keep='first')\
            .merge(self.item_idf, left_on='item_id', right_on='index', how='left')

        recs['score'] = recs['sim'] * recs['idf']
        recs = recs.sort_values(['user_id', 'score'], ascending=False)
        recs['rank'] = recs.groupby('user_id').cumcount() + 1
        return recs[recs['rank'] <= N_recs][result_columns]

    def recommend(self, user_id, N_recs: int = 10):
        """Return a list of item ids for a single user.

        kNN recommendations come first; when there are fewer than ``N_recs``
        of them (always the case for a user unseen in training) the list is
        padded with the most popular items that are not in it yet.
        """
        user_predict = pd.DataFrame({"user_id": [user_id]})
        user_recommendations = list(self.predict(user_predict, N_recs=N_recs).item_id)
        if len(user_recommendations) >= N_recs:
            return user_recommendations

        unique_items = set(user_recommendations)

        # NOTE(review): popularity_list is translated with this class' own
        # item mapping, which assumes the Dataset built in fit() numbers items
        # in the same first-appearance order - confirm against rectools.
        popular_recommendations = [
            self.items_inv_mapping[item]
            for item in self.popular_model.popularity_list[0][:N_recs]
        ]

        for item in popular_recommendations:
            if item not in unique_items:
                user_recommendations.append(item)
                unique_items.add(item)
                if len(user_recommendations) == N_recs:
                    break
        return user_recommendations

Что сделано:

Добавлена обработка холодных юзеров

Модель возвращает ровно N рекомендаций, а не меньше: недостающие позиции добираются из списка популярных айтемов

In [16]:
# TF-IDF weighted kNN with 50 neighbours per user, trained on the full
# interactions frame.
userknn_model = UserKnn(TFIDFRecommender(), N_users=50)
userknn_model.fit(train=interactions.df)



  0%|          | 0/962179 [00:00<?, ?it/s]

In [17]:
# Smoke check: recommendations for one user taken from the training data.
userknn_model.recommend(user_id=176549, N_recs=10)

[15469, 5518, 12448, 6737, 5482, 10688, 4273, 5695, 7453, 5600]

In [18]:
import pickle

# Persist the fitted model to the mounted Google Drive folder.
with open('/content/drive/MyDrive/Colab Notebooks/Recsys/userknn_model.pkl', 'wb') as model_file:
    pickle.dump(userknn_model, model_file)

In [19]:
# Read the model back from the same file. Unpickling executes code stored in
# the file, so only load pickles produced by a trusted source.
with open('/content/drive/MyDrive/Colab Notebooks/Recsys/userknn_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [20]:
# The restored model is queried with the same user as the in-memory one.
loaded_model.recommend(user_id=176549, N_recs=10)

[15469, 5518, 12448, 6737, 5482, 10688, 4273, 5695, 7453, 5600]