In [None]:
# Switch into the `content` directory (relative to the current directory);
# later cells read and write files such as kion.zip using relative paths.
%cd content

/content


In [None]:
!pip install  rectools implicit requests tqdm

In [None]:
from pprint import pprint

import numpy as np
import pandas as pd

import requests
from tqdm.auto import tqdm
from rectools import Columns
from rectools.dataset import Dataset,Interactions
from rectools.models import ImplicitItemKNNWrapperModel

In [None]:
# Location of the KION dataset archive. The URL points at a specific commit
# hash, so the downloaded file does not change between runs.
url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'

In [None]:
# Stream the archive to kion.zip in 1 MiB chunks so it is never held fully in
# memory. The response, the output file and the progress bar are all managed
# by context managers, so they are released even if the download fails midway.
with requests.get(url, stream=True, timeout=60) as req:
    # Fail fast on HTTP errors instead of saving an error page as kion.zip.
    req.raise_for_status()
    # A missing Content-Length header falls back to 0, exactly as before.
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))

    with open('kion.zip', 'wb') as fd, tqdm(
        desc='kion dataset download',
        total=total_size_in_bytes,
        unit='iB',
        unit_scale=True,
    ) as progress_bar:
        for chunk in req.iter_content(chunk_size=2 ** 20):
            fd.write(chunk)
            progress_bar.update(len(chunk))

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

In [None]:
import zipfile as zf

# Unpack the downloaded archive into the current directory. The context
# manager closes the archive even if extraction raises.
with zf.ZipFile('kion.zip', 'r') as files:
    files.extractall()

In [None]:
# Load the user and item tables. `interactions.csv` is intentionally not read
# here: the next cell loads it with date parsing, so reading it twice would
# only parse the large file a second time for no benefit.
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [None]:
# Read the interactions with `last_watch_dt` parsed as datetimes, then rename
# the timestamp and duration columns to the names defined by rectools' Columns.
interactions = pd.read_csv(
    'data_original/interactions.csv',
    parse_dates=["last_watch_dt"],
).rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)
interactions.head()


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0


In [None]:
# Time-based split: the final 7 days (counted back from the latest
# interaction date) form the test set, everything earlier is the train set.
max_date = interactions['datetime'].max()
split_date = max_date - pd.Timedelta(days=7)

train = interactions.loc[interactions['datetime'] < split_date]
test = interactions.loc[interactions['datetime'] >= split_date]

# Keep only warm users in the test set, i.e. users that also appear in train.
test = test.loc[test['user_id'].isin(train['user_id'].unique())]

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 5)
test: (349088, 5)


In [None]:
# Build a rectools Dataset from the training interactions only; no user or
# item features are supplied. The trailing expression displays the result.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=None,
    item_features_df=None
)
dataset

Dataset(user_id_map=IdMap(external_ids=array([176549, 699317, 656683, ..., 882138, 805174, 648596])), item_id_map=IdMap(external_ids=array([ 9506,  1659,  7107, ..., 13516, 13019, 10542])), interactions=Interactions(df=         user_id  item_id   weight   datetime
0              0        0   4250.0 2021-05-11
1              1        1   8317.0 2021-05-29
2              2        2     10.0 2021-05-09
3              3        3  14483.0 2021-07-05
4              4        0   6725.0 2021-04-30
...          ...      ...      ...        ...
5476244    69627      219   6804.0 2021-08-02
5476245    40052      132    753.0 2021-05-12
5476246   896790      318     76.0 2021-08-13
5476247   206604     2546   2308.0 2021-04-13
5476249     7236     1609   6203.0 2021-04-19

[4985269 rows x 4 columns]), user_features=None, item_features=None)

In [None]:
from typing import Dict
from collections import Counter

import pandas as pd
import numpy as np
import scipy as sp
from implicit.nearest_neighbours import ItemItemRecommender


class UserKnn():
    """Fit/predict wrapper that turns an item-item KNN model into a user-user one.

    The interaction matrix is built with items as rows and users as columns
    and handed to the wrapped model's ``fit``. ``similar_items`` is then
    queried with *user* indices, and the indices it returns are mapped back
    to user ids, i.e. they are treated as similar users. Recommendations for
    a user are the items watched by those similar users, scored as
    ``similarity * idf(item)``.

    Attributes set by ``fit``: ``users_mapping`` / ``users_inv_mapping`` and
    ``items_mapping`` / ``items_inv_mapping`` (external id <-> internal
    index), ``weights_matrix`` / ``interaction_matrix``, ``watched``
    (items per user), ``item_idf`` and ``n`` (number of training rows).

    Annotations that name ``ItemItemRecommender`` are quoted so that they are
    not evaluated when the class is defined.
    """

    def __init__(self, model: "ItemItemRecommender", N_users: int = 50, cold_start_recommender=None):
        """Store the wrapped model and settings.

        Args:
            model: object exposing ``fit(matrix)`` and
                ``similar_items(index, N=...)``.
            N_users: number of neighbours requested per user at predict time.
            cold_start_recommender: optional object with
                ``predict(test_df, N_recs)`` used for users unseen in ``fit``.
        """
        self.N_users = N_users
        self.model = model
        self.cold_start_recommender = cold_start_recommender
        self.is_fitted = False

    def get_mappings(self, train):
        """Build external-id <-> internal-index mappings for users and items.

        Indices follow the order of first appearance in ``train``.
        """
        self.users_inv_mapping = dict(enumerate(train['user_id'].unique()))
        self.users_mapping = {v: k for k, v in self.users_inv_mapping.items()}

        self.items_inv_mapping = dict(enumerate(train['item_id'].unique()))
        self.items_mapping = {v: k for k, v in self.items_inv_mapping.items()}

    def get_matrix(self, df: pd.DataFrame,
                   user_col: str = 'user_id',
                   item_col: str = 'item_id',
                   weight_col: str = None,
                   users_mapping: Dict[int, int] = None,
                   items_mapping: Dict[int, int] = None):
        """Build the sparse item x user interaction matrix and the watched table.

        Args:
            df: interactions with at least ``user_col`` and ``item_col``.
            user_col: name of the user id column.
            item_col: name of the item id column.
            weight_col: optional column with interaction weights; when not
                given every interaction gets weight 1.0.
            users_mapping: user id -> column index; defaults to the mapping
                built by ``get_mappings``.
            items_mapping: item id -> row index; defaults to the mapping
                built by ``get_mappings``.

        Returns:
            The COO matrix, which is also stored as ``self.interaction_matrix``.
            As a side effect ``self.watched`` is set to a frame with one row
            per user (column ``sim_user_id``) and the list of that user's items.
        """
        # Honour explicitly passed mappings; fall back to the fitted ones.
        if users_mapping is None:
            users_mapping = self.users_mapping
        if items_mapping is None:
            items_mapping = self.items_mapping

        if weight_col:
            weights = df[weight_col].astype(np.float64).to_numpy()
        else:
            weights = np.ones(len(df), dtype=np.float64)

        # Rows are items and columns are users (see the class docstring).
        item_idx = df[item_col].map(items_mapping.get).to_numpy()
        user_idx = df[user_col].map(users_mapping.get).to_numpy()
        self.interaction_matrix = sp.sparse.coo_matrix((weights, (item_idx, user_idx)))

        # One row per user with the list of items that user interacted with.
        # The user column is named `sim_user_id` because predict() joins this
        # table onto the *neighbour* ids.
        self.watched = (
            df.groupby(user_col, as_index=False)
            .agg({item_col: list})
            .rename(columns={user_col: 'sim_user_id'})
        )

        return self.interaction_matrix

    def idf(self, n: int, x: float):
        """Return ``log((1 + n) / (1 + x) + 1)``; works on scalars and arrays."""
        return np.log((1 + n) / (1 + x) + 1)

    def _count_item_idf(self, df: pd.DataFrame):
        """Compute per-item document frequency and IDF into ``self.item_idf``.

        The resulting frame has columns ``index`` (item id), ``doc_freq`` and
        ``idf``. Requires ``self.n`` to be set beforehand.
        """
        item_cnt = Counter(df['item_id'].values)
        item_idf = pd.DataFrame.from_dict(item_cnt, orient='index',
                                          columns=['doc_freq']).reset_index()
        # Vectorised: idf() is plain numpy arithmetic, so it accepts a Series.
        item_idf['idf'] = self.idf(self.n, item_idf['doc_freq'])
        self.item_idf = item_idf

    def fit(self, train: pd.DataFrame):
        """Fit on interactions with ``user_id`` and ``item_id`` columns.

        Builds the id mappings, the (unweighted) interaction matrix and the
        item IDF table, then fits the wrapped model on the matrix.
        """
        self.user_knn = self.model
        self.get_mappings(train)
        self.weights_matrix = self.get_matrix(train,
                                              users_mapping=self.users_mapping,
                                              items_mapping=self.items_mapping)

        # `n` in the IDF formula is the number of interaction rows.
        self.n = train.shape[0]
        self._count_item_idf(train)

        self.user_knn.fit(self.weights_matrix)
        self.is_fitted = True

    def _generate_recs_mapper(self, model: "ItemItemRecommender", user_mapping: Dict[int, int],
                              user_inv_mapping: Dict[int, int], N: int):
        """Return a function mapping a user id to ``(similar_user_ids, similarities)``.

        Args:
            model: fitted model exposing ``similar_items(index, N=...)``.
            user_mapping: user id -> internal index.
            user_inv_mapping: internal index -> user id.
            N: number of neighbours to request from the model.
        """
        def _recs_mapper(user):
            user_idx = user_mapping[user]
            neighbour_idx, sim = model.similar_items(user_idx, N=N)
            return [user_inv_mapping[idx] for idx in neighbour_idx], sim
        return _recs_mapper

    def predict(self, test: pd.DataFrame, N_recs: int = 10):
        """Recommend up to ``N_recs`` items for every user in ``test``.

        Users seen during ``fit`` get KNN-based recommendations. Users not
        seen during ``fit`` are passed to ``cold_start_recommender.predict``
        when one was provided, and are skipped otherwise.

        Args:
            test: frame with a ``user_id`` column.
            N_recs: maximum number of recommendations per user.

        Returns:
            DataFrame with columns ``user_id``, ``item_id``, ``score``, ``rank``.

        Raises:
            ValueError: if called before ``fit``.
        """
        if not self.is_fitted:
            raise ValueError("Please call fit before predict")

        output_cols = ['user_id', 'item_id', 'score', 'rank']

        # Split the test users by whether their id (not their internal index)
        # was seen in fit(). This must happen before the neighbour lookup,
        # which would otherwise raise KeyError on an unknown user.
        is_known = test['user_id'].isin(list(self.users_mapping))
        warm_user_ids = test.loc[is_known, 'user_id'].unique()
        cold_start_users = test[~is_known]

        parts = []

        if len(warm_user_ids) > 0:
            mapper = self._generate_recs_mapper(
                model=self.user_knn,
                user_mapping=self.users_mapping,
                user_inv_mapping=self.users_inv_mapping,
                N=self.N_users
            )

            # Flatten (user -> neighbours, similarities) into one row per
            # (user, neighbour) pair.
            owners, neighbours, sims = [], [], []
            for user in warm_user_ids:
                sim_users, sim_scores = mapper(user)
                owners.extend([user] * len(sim_users))
                neighbours.extend(sim_users)
                sims.extend(sim_scores)

            recs = pd.DataFrame({'user_id': owners, 'sim_user_id': neighbours, 'sim': sims})
            # Keep similarities numeric so the score column is float as well.
            recs['sim'] = recs['sim'].astype(np.float64)

            recs = (
                # A user is typically returned as their own nearest neighbour.
                recs[recs['user_id'] != recs['sim_user_id']]
                .merge(self.watched, on=['sim_user_id'], how='left')
                .explode('item_id')
                # For an item reachable via several neighbours keep the row
                # with the highest similarity.
                .sort_values(['user_id', 'sim'], ascending=False)
                .drop_duplicates(['user_id', 'item_id'], keep='first')
                .merge(self.item_idf, left_on='item_id', right_on='index', how='left')
            )

            recs['score'] = recs['sim'] * recs['idf']
            recs = recs.sort_values(['user_id', 'score'], ascending=False)
            recs['rank'] = recs.groupby('user_id').cumcount() + 1
            parts.append(recs[output_cols])

        # Cold-start users are delegated to the fallback recommender, if any.
        if not cold_start_users.empty and self.cold_start_recommender is not None:
            parts.append(self.cold_start_recommender.predict(cold_start_users, N_recs))

        if not parts:
            return pd.DataFrame(columns=output_cols)

        recs = parts[0] if len(parts) == 1 else pd.concat(parts, ignore_index=True)
        return recs[recs['rank'] <= N_recs][output_cols]





In [None]:
# Wrap a default-configured ItemItemRecommender in UserKnn (50 neighbours per
# user at predict time) and fit it on the training interactions.
user_knn_model = UserKnn(model=ItemItemRecommender(), N_users=50)
user_knn_model.fit(train)



  0%|          | 0/896791 [00:00<?, ?it/s]

In [None]:
import pickle
# Serialize the fitted UserKnn object to user_knn_model.pkl in the current
# directory. NOTE: pickle files can execute code when loaded, so only load
# this file back from a source you trust.
with open('user_knn_model.pkl', 'wb') as file:
    pickle.dump(user_knn_model, file)

In [None]:
# Show the current working directory, i.e. where the relative paths used
# above (kion.zip, user_knn_model.pkl) were written.
%pwd

'/content'

In [None]:
# Smoke-test the fitted model on a few hand-picked user ids, requesting at
# most 10 recommendations per user.
test_users = pd.DataFrame({'user_id': [123, 456, 789]})  # Example test users
recommendations = user_knn_model.predict(test_users, N_recs=10)


In [None]:
# Print the recommendation table (user_id, item_id, score, rank) as plain text.
print(recommendations)


     user_id item_id      score  rank
1        789    9031  11.509985     1
5        789   10665   8.180912     2
0        789   12837   7.774632     3
3        789    9103   7.335318     4
4        789    4880   4.556208     5
2        789   13865   3.801718     6
6        456   10077   6.746927     1
433      123   12496  28.071409     1
627      123   11401  28.071409     2
629      123    3830  28.071409     3
711      123    6043  28.071409     4
327      123   14985  27.625122     5
340      123    9837  27.625122     6
657      123   15144  27.625122     7
727      123   13335  27.625122     8
770      123      77  27.625122     9
641      123    7445   27.26048    10
