In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender
import warnings
warnings.filterwarnings("ignore")


from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# üé¨ Get KION dataset 

<a href="https://ods.ai/competitions/competition-recsys-21/data"> Dataset description [ru] </a>


In [33]:
# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

HBox(children=(HTML(value='kion dataset download'), FloatProgress(value=0.0, max=78795385.0), HTML(value='')))

In [34]:
!unzip kion_train.zip

"unzip" –Ω–µ —è–≤–ª—è–µ—Ç—Å—è –≤–Ω—É—Ç—Ä–µ–Ω–Ω–µ–π –∏–ª–∏ –≤–Ω–µ—à–Ω–µ–π
–∫–æ–º–∞–Ω–¥–æ–π, –∏—Å–ø–æ–ª–Ω—è–µ–º–æ–π –ø—Ä–æ–≥—Ä–∞–º–º–æ–π –∏–ª–∏ –ø–∞–∫–µ—Ç–Ω—ã–º —Ñ–∞–π–ª–æ–º.


# EDA

In [2]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [3]:
# rename columns, convert timestamp
interactions.rename(columns={'last_watch_dt': Columns.Datetime,
                            'total_dur': Columns.Weight}, 
                    inplace=True) 

interactions['datetime'] = pd.to_datetime(interactions['datetime'])

## interactions

In [4]:
pd.concat([interactions.head(), interactions.tail()])

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [5]:
print(f"Interactions dataframe shape: {interactions.shape}")
print(f"Unique users in interactions: {interactions['user_id'].nunique():_}")
print(f"Unique items in interactions: {interactions['item_id'].nunique():_}")

Interactions dataframe shape: (5476251, 5)
Unique users in interactions: 962_179
Unique items in interactions: 15_706


In [6]:
max_date = interactions['datetime'].max()
min_date = interactions['datetime'].min()

print(f"min date in interactions: {min_date}")
print(f"max date in interactions: {max_date}")

min date in interactions: 2021-03-13 00:00:00
max date in interactions: 2021-08-22 00:00:00


In [7]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


## users

In [8]:
pd.concat([users.head(), users.tail()])

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,–ú,1
1,962099,age_18_24,income_20_40,–ú,0
2,1047345,age_45_54,income_40_60,–ñ,0
3,721985,age_45_54,income_20_40,–ñ,0
4,704055,age_35_44,income_60_90,–ñ,0
840192,339025,age_65_inf,income_0_20,–ñ,0
840193,983617,age_18_24,income_20_40,–ñ,1
840194,251008,,,,0
840195,590706,,,–ñ,0
840196,166555,age_65_inf,income_20_40,–ñ,0


In [9]:
print(f"Users dataframe shape {users.shape}")
print(f"Unique users: {users['user_id'].nunique():_}")

Users dataframe shape (840197, 5)
Unique users: 840_197


## items

In [10]:
pd.concat([items.head(3), items.tail(3)])

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,–ü–æ–≥–æ–≤–æ—Ä–∏ —Å –Ω–µ–π,Hable con ella,2002.0,"–¥—Ä–∞–º—ã, –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã, –º–µ–ª–æ–¥—Ä–∞–º—ã",–ò—Å–ø–∞–Ω–∏—è,,16.0,,–ü–µ–¥—Ä–æ –ê–ª—å–º–æ–¥–æ–≤–∞—Ä,"–ê–¥–æ–ª—å—Ñ–æ –§–µ—Ä–Ω–∞–Ω–¥–µ—Å, –ê–Ω–∞ –§–µ—Ä–Ω–∞–Ω–¥–µ—Å, –î–∞—Ä–∏–æ –ì—Ä–∞–Ω–¥–∏–Ω–µ—Ç—Ç–∏, –î–∂–µ—Ä–∞–ª—å–¥–∏–Ω –ß–∞–ø–ª–∏–Ω, –ï–ª–µ–Ω–∞ –ê–Ω–∞–π—è, –ö–∞—ç—Ç–∞–Ω–æ –í–µ–ª–æ–∑–æ, –õ–µ–æ–Ω–æ—Ä –£–æ—Ç–ª–∏–Ω–≥, –õ–æ–ª–∞ –î—É—ç–Ω—å—è—Å, –õ–æ–ª–µ—Å –õ–µ–æ–Ω, –ú–∞–ª—É –ê–π—Ä–æ–¥–æ, –ú–∞—Ä–∏–æ–ª–∞ –§—É—ç–Ω—Ç–µ—Å, –ü–∞—Å –í–µ–≥–∞, –ü–∏–Ω–∞ –ë–∞—É—à, –†–æ...",–ú–µ–ª–æ–¥—Ä–∞–º–∞ –ª–µ–≥–µ–Ω–¥–∞—Ä–Ω–æ–≥–æ –ü–µ–¥—Ä–æ –ê–ª—å–º–æ–¥–æ–≤–∞—Ä–∞ ¬´–ü–æ–≥–æ–≤–æ—Ä–∏ —Å –Ω–µ–π¬ª –≤ 2003 –≥–æ–¥—É –ø–æ–ª—É—á–∏–ª–∞ –ø—Ä–µ–º–∏—é ¬´–û—Å–∫–∞—Ä¬ª –∑–∞ –ª—É—á—à–∏–π —Å—Ü–µ–Ω–∞—Ä–∏–π. –ñ—É—Ä–Ω–∞–ª–∏—Å—Ç –ú–∞—Ä–∫–æ –±–µ—Ä–µ—Ç –∏–Ω—Ç–µ—Ä–≤—å—é —É –∑–Ω–∞–º–µ–Ω–∏—Ç–æ–π –∂–µ–Ω—â–∏–Ω—ã-—Ç–æ—Ä–µ—Ä–æ –õ–∏–¥–∏–∏ –∏ –≤—Å–∫–æ—Ä–µ –≤–ª—é–±–ª—è–µ...,"–ü–æ–≥–æ–≤–æ—Ä–∏, –Ω–µ–π, 2002, –ò—Å–ø–∞–Ω–∏—è, –¥—Ä—É–∑—å—è, –ª—é–±–æ–≤—å, —Å–∏–ª—å–Ω—ã–µ, –∂–µ–Ω—â–∏–Ω—ã, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –¥—Ä—É–∂–±–∞, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –ø–∞—Ä–µ, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –º—É–∂—á–∏–Ω—ã, –∂–µ–Ω—â–∏–Ω—ã, —Ä–æ–º–∞–Ω—Ç–∏—á–µ—Å–∫–∏–µ, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –ø–æ—Ç–µ—Ä—è, –±–ª–∏–∑–∫–∏—Ö,..."
1,2508,film,–ì–æ–ª—ã–µ –ø–µ—Ä—Ü—ã,Search Party,2014.0,"–∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è, –∫–æ–º–µ–¥–∏–∏",–°–®–ê,,16.0,,–°–∫–æ—Ç –ê—Ä–º—Å—Ç—Ä–æ–Ω–≥,"–ê–¥–∞–º –ü–∞–ª–ª–∏, –ë—Ä–∞–π–∞–Ω –•–∞—Å–∫–∏, –î–∂.–ë. –°–º—É–≤, –î–∂–µ–π—Å–æ–Ω –ú–∞–Ω—Ü—É–∫–∞—Å, –î–∂–æ–Ω –ì–ª–µ–π—Å–µ—Ä, –ö–∞—Ä–ª –ì—Ä–∏–Ω, –ö—Ä–∏—Å—Ç–µ–Ω –†–∏—Ç—Ç–µ—Ä, –õ—ç–Ω—Å –†–µ–¥–¥–∏–∫, –ú–æ—Ä–∏—Å –ö–æ–º—Ç, –ü–∞—Ç—Ä–∏–∫ –ö–µ—Ä–Ω—Å, –†–µ–±–µ–∫–∫–∞ –ö–æ–ª–ª–∏–Ω–∑, –†–æ–∑–∞ –°–∞–ª–∞–∑–∞—Ä, –†–æ—Å—Å –ü. –ö—É–∫, –°—Ç–µ—Ñ—Ñ–∏ –ì—Ä–æ—É—Ç, –¢–∏...","–£–º–æ—Ä–∏—Ç–µ–ª—å–Ω–∞—è —Å–æ–≤—Ä–µ–º–µ–Ω–Ω–∞—è –∫–æ–º–µ–¥–∏—è –Ω–∞ –ø–æ–ø—É–ª—è—Ä–Ω—É—é —Ç–µ–º—É –æ —Ç–æ–º, –∫–∞–∫ –Ω–µ –Ω–∞–¥–æ –æ—Ç–º–µ—á–∞—Ç—å –º–∞–ª—å—á–∏—à–Ω–∏–∫. –ì–ª–∞–≤–Ω—ã–π –≥–µ—Ä–æ–π —É—Å–≤–æ–∏–ª, —á—Ç–æ –Ω–µ –Ω–∞–¥–æ –∑–≤–∞—Ç—å –Ω–∞ —Å–≤–∞–¥—å–±—É —Å–≤–æ–µ–≥–æ –¥—Ä—É–≥–∞ –î–∂–µ–π—Å–æ–Ω–∞, –∏–∑-–∑–∞ –∫–æ—Ç–æ—Ä–æ–≥–æ –æ–Ω –≤–º–µ—Å—Ç–æ —Å–≤–∞–¥–µ...","–ì–æ–ª—ã–µ, –ø–µ—Ä—Ü—ã, 2014, –°–®–ê, –¥—Ä—É–∑—å—è, —Å–≤–∞–¥—å–±—ã, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, —Ä–∞—Å—Å—Ç–∞–≤–∞–Ω–∏—è, –æ—Ç–Ω–æ—à–µ–Ω–∏—è, –¥—Ä—É–∂–±–∞, —Ä–∏—Å–∫, –Ω–µ–¥–æ—Ä–∞–∑—É–º–µ–Ω–∏–µ, –º—É–∂—Å–∫–∞—è, –¥—Ä—É–∂–±–∞, –º–∞–ª—å—á–∏—à–Ω–∏–∫–∏, –¥–µ–≤–∏—á–Ω–∏–∫–∏"
2,10716,film,–¢–∞–∫—Ç–∏—á–µ—Å–∫–∞—è —Å–∏–ª–∞,Tactical Force,2011.0,"–∫—Ä–∏–º–∏–Ω–∞–ª, –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, —Ç—Ä–∏–ª–ª–µ—Ä—ã, –±–æ–µ–≤–∏–∫–∏, –∫–æ–º–µ–¥–∏–∏",–ö–∞–Ω–∞–¥–∞,,16.0,,–ê–¥–∞–º –ü. –ö–∞–ª—Ç—Ä–∞—Ä–æ,"–ê–¥—Ä–∏–∞–Ω –•–æ–ª–º—Å, –î–∞—Ä—Ä–µ–Ω –®–∞–ª–∞–≤–∏, –î–∂–µ—Ä—Ä–∏ –í–∞—Å—Å–µ—Ä–º–∞–Ω, –î—ç–Ω –†–∏–∑–∑—É—Ç–æ, –ö–µ–Ω–¥–µ—Å –ò–ª—ç–π–Ω –ö–∞–ª—Ç—Ä–∞—Ä–æ, –ö–∏—Ç –î–∂–∞—Ä–¥–∏–Ω, –õ–µ–∫—Å–∞ –î–æ–π–≥, –ú–∞–π–∫–ª –î–∂–µ–π –£–∞–π—Ç, –ú–∞–π–∫–ª –®—ç–Ω–∫—Å, –ú–∞–π–∫–ª –≠–∫–ª—É–Ω–¥, –ü–∏—Ç–µ—Ä –ë—Ä–∞–π–∞–Ω—Ç, –ü–∏—Ç–µ—Ä –ö–µ–Ω—Ç, –°—Ç–∏–≤ –ë–∞—á–∏—á, –°—Ç–∏–≤ ...","–ü—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–π —Ä–µ—Å—Ç–ª–µ—Ä –°—Ç–∏–≤ –û—Å—Ç–∏–Ω (¬´–í—Å–µ –∏–ª–∏ –Ω–∏—á–µ–≥–æ¬ª) –∏ —Ç–µ–º–Ω–æ–∫–æ–∂–∏–π –º–∞—á–æ –ú–∞–π–∫–ª –î–∂–µ–π –£–∞–π—Ç (¬´–¢–µ–º–Ω—ã–π —Ä—ã—Ü–∞—Ä—å¬ª) –≤ –∏–Ω—Ç—Ä–∏–≥—É—é—â–µ–º –∫—Ä–∏–º–∏–Ω–∞–ª—å–Ω–æ–º –±–æ–µ–≤–∏–∫–µ. –í —Ü–µ–Ω—Ç—Ä–µ —Å—é–∂–µ—Ç–∞ ‚Äì –∫–æ–º–∞–Ω–¥–∞ —Å–ø–µ—Ü–Ω–∞–∑–æ–≤—Ü–µ–≤, –∫–æ—Ç–æ—Ä–∞—è –æ–∫–∞–∑–∞...","–¢–∞–∫—Ç–∏—á–µ—Å–∫–∞—è, —Å–∏–ª–∞, 2011, –ö–∞–Ω–∞–¥–∞, –±–∞–Ω–¥–∏—Ç—ã, –≥–∞–Ω–≥—Å—Ç–µ—Ä—ã, –ø—Ä–µ—Å—Ç—É–ø–ª–µ–Ω–∏—è, –ø—Ä–µ–æ–¥–æ–ª–µ–Ω–∏–µ, —Ç—Ä—É–¥–Ω–æ—Å—Ç–µ–π, —É–±–∏–π—Å—Ç–≤–∞, —É–±–∏–π—Ü—ã, –Ω–∞—Å—Ç–æ—è—â–∏–µ, –º—É–∂—á–∏–Ω—ã, —Ä–∏—Å–∫, –Ω–µ–¥–æ—Ä–∞–∑—É–º–µ–Ω–∏–µ, —Å–∏–ª—ã, –ø—Ä–∞–≤–æ–ø–æ—Ä—è–¥–∫–∞, –±–æ—Ä—å–±–∞, –∑–∞, –≤—ã–∂–∏–≤–∞–Ω–∏–µ, —Å–ø..."
15960,10632,series,–°–≥–æ–≤–æ—Ä,Hassel,2017.0,"–¥—Ä–∞–º—ã, —Ç—Ä–∏–ª–ª–µ—Ä—ã, –∫—Ä–∏–º–∏–Ω–∞–ª",–†–æ—Å—Å–∏—è,0.0,18.0,,"–≠—à—Ä–µ—Ñ –†–µ–π–±—Ä—É–∫, –ê–º–∏—Ä –ö–∞–º–¥–∏–Ω, –≠—Ä–∏–∫ –≠–≥–µ—Ä","–û–ª–∞ –†–∞–ø–∞—Å, –ê–ª–∏–µ—Ç—Ç –û—Ñ–µ–π–º, –£–∏–ª—å–º–∞ –õ–∏–¥–µ–Ω, –®–∞–Ω—Ç–∏ –†–æ–Ω–∏, –¢–æ–º–∞ –•–æ–ª–º–∏–Ω","–ö—Ä–∏–º–∏–Ω–∞–ª—å–Ω–∞—è –¥—Ä–∞–º–∞ –ø–æ –º–æ—Ç–∏–≤–∞–º —Ä–æ–º–∞–Ω–æ–≤ –æ —à–≤–µ–¥—Å–∫–æ–º –¥–µ—Ç–µ–∫—Ç–∏–≤–µ –†–æ–ª–∞–Ω–¥–µ –•–∞—Å—Å–µ–ª–µ. –°—Ä–µ–¥—å –±–µ–ª–∞ –¥–Ω—è —É–±–∏—Ç –ø–æ–ª–∏—Ü–µ–π—Å–∫–∏–π, –∏ –Ω–∏—Ç–∏ –≤ —ç—Ç–æ–º –¥–µ–ª–µ –≤–µ–¥—É—Ç –ø—Ä—è–º–æ –≤ –∫–æ—Ä–∏–¥–æ—Ä—ã –≤–ª–∞—Å—Ç–∏. –†–∞—Å—Å–ª–µ–¥–æ–≤–∞–Ω–∏–µ–º –∑–∞–Ω–∏–º–∞–µ—Ç—Å—è –¥–µ—Ç–µ–∫—Ç–∏–≤ –•–∞...","–°–≥–æ–≤–æ—Ä, 2017, –†–æ—Å—Å–∏—è"
15961,4538,series,–°—Ä–µ–¥–∏ –∫–∞–º–Ω–µ–π,Darklands,2019.0,"–¥—Ä–∞–º—ã, —Å–ø–æ—Ä—Ç, –∫—Ä–∏–º–∏–Ω–∞–ª",–†–æ—Å—Å–∏—è,0.0,18.0,,"–ú–∞—Ä–∫ –û‚Äô–ö–æ–Ω–Ω–æ—Ä, –ö–æ–Ω–æ—Ä –ú–∞–∫–ú–∞—Ö–æ–Ω","–î—ç–π–Ω –£–∞–π—Ç –û‚Äô–•–∞—Ä–∞, –¢–æ–º–∞—Å –ö—ç–π–Ω-–ë–∏—Ä–Ω, –î–∂—É–¥–∏—Ç –†–æ–¥–¥–∏, –ú–∞—Ä–∫ –û‚Äô–•–∞–ª–ª–æ—Ä–∞–Ω, –î–∂–∏–º–º–∏ –°–º–æ–ª–ª—Ö–æ—Ä–Ω","–°–µ–º–Ω–∞–¥—Ü–∞—Ç–∏–ª–µ—Ç–Ω–∏–π –î—ç–º–∏–µ–Ω –º–µ—á—Ç–∞–µ—Ç –≤—ã—Ä–≤–∞—Ç—å—Å—è –∑–∞ –ø—Ä–µ–¥–µ–ª—ã —Å–≤–æ–µ–≥–æ —Ä–∞–π–æ–Ω–∞ –∏ —Å—Ç–∞—Ç—å –ø—Ä–æ—Ñ–µ—Å—Å–∏–æ–Ω–∞–ª—å–Ω—ã–º –±–æ–π—Ü–æ–º. –ö–æ–≥–¥–∞ –µ–≥–æ –∫—É–º–∏—Ä –∏ —Å—Ç–∞—Ä—à–∏–π –±—Ä–∞—Ç –∏—Å—á–µ–∑–∞–µ—Ç, –ø–∞—Ä–µ–Ω—å –ø–æ–ø–∞–¥–∞–µ—Ç –≤ —á—É–∂–¥—ã–π –µ–º—É –º–∏—Ä –Ω–∞—Å–∏–ª–∏—è, –Ω–∞—Ä–∫–æ—Ç–∏–∫–æ–≤ –∏ ...","–°—Ä–µ–¥–∏, –∫–∞–º–Ω–µ–π, 2019, –†–æ—Å—Å–∏—è"
15962,3206,series,–ì–æ—à–∞,,2019.0,–∫–æ–º–µ–¥–∏–∏,–†–æ—Å—Å–∏—è,0.0,16.0,,–ú–∏—Ö–∞–∏–ª –ú–∏—Ä–æ–Ω–æ–≤,"–ú–∫—Ä—Ç—ã—á –ê—Ä–∑—É–º–∞–Ω—è–Ω, –í–∏–∫—Ç–æ—Ä–∏—è –†—É–Ω—Ü–æ–≤–∞","–î–æ–±—Ä–æ–¥—É—à–Ω—ã–π –ì–æ—à–∞ –Ω–µ –º–æ–∂–µ—Ç –≤—ã–π—Ç–∏ –∏–∑ –¥–æ–º–∞, —á—Ç–æ–±—ã –Ω–µ –ø–æ–ø–∞—Å—Ç—å –≤ –Ω–µ–ª–µ–ø—É—é –∏ –∫—É—Ä—å—ë–∑–Ω—É—é –∏—Å—Ç–æ—Ä–∏—é. –ù–æ –¥–∞–∂–µ –Ω–µ—É–¥–∞—á–Ω–∏–∫–∏ –º–µ—á—Ç–∞—é—Ç –æ –ª—é–±–≤–∏, –∏ –Ω–∞—à –≥–µ—Ä–æ–π ‚Äî –Ω–µ –∏—Å–∫–ª—é—á–µ–Ω–∏–µ, –≤–µ–¥—å –æ–ø—Ç–∏–º–∏–∑–º–∞ –µ–º—É –Ω–µ –∑–∞–Ω–∏–º–∞—Ç—å.","–ì–æ—à–∞, 2019, –†–æ—Å—Å–∏—è"


In [11]:
print(f"Items dataframe shape {items.shape}")
print(f"Unique item_id: {items['item_id'].nunique():_}")

Items dataframe shape (15963, 14)
Unique item_id: 15_963


# üß© New model: from `itemkNN` to `userkNN `

- we'll use `implicit.nearest_neighbours` itemKNN model and convert it to userkNN model 


## 0. train test split 

In [12]:
from rectools.dataset import Interactions

In [13]:
interactions

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


In [14]:
# train test split 
# test = last 1 week 
from rectools.model_selection import TimeRangeSplitter
from rectools.dataset import IdMap


n_folds = 1
unit = "W"
n_units = 1
periods = n_folds + 1
freq = f"{n_units}{unit}"

last_date = interactions[Columns.Datetime].max().normalize()
start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)  
print(f"Start date and last date of the test fold: {start_date, last_date}")
    
date_range = pd.date_range(start=start_date, periods=periods, freq=freq, tz=last_date.tz)
print(f"Test fold borders: {date_range.values.astype('datetime64[D]')}")

Start date and last date of the test fold: (Timestamp('2021-08-08 00:00:00'), Timestamp('2021-08-22 00:00:00'))
Test fold borders: ['2021-08-08' '2021-08-15']


In [15]:
user_id_map = IdMap.from_values(interactions.user_id)
item_id_map = IdMap.from_values(interactions.item_id)


interactions = Interactions.from_raw(interactions, user_id_map, item_id_map)
# generator of folds
cv = TimeRangeSplitter(
    date_range=date_range,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
print(f"Real number of folds: {cv.get_n_splits(interactions)}")

Real number of folds: 1


In [16]:
# we have just 1 test fold - no need to iterate over fold
(train_ids, test_ids, fold_info) = cv.split(interactions, collect_fold_stats=True).__next__()

In [17]:
train_ids

array([      0,       1,       2, ..., 5476245, 5476247, 5476249],
      dtype=int64)

In [18]:
test_ids

array([      6,      33,      56, ..., 5476229, 5476230, 5476240],
      dtype=int64)

# 1. Prepare train matrix 

### 1.1 Create `user` and `item` mappings (essential part for recsys models) 

users_mapping = `{user0: 0, user1: 1, ... , userN: N}`

items_mapping = `{item0: 0, item1: 1, ... , itemK: K}`

In [19]:
interactions = interactions.df

In [20]:
train = interactions.loc[train_ids]
test = interactions.loc[test_ids]

In [21]:
users_inv_mapping = dict(enumerate(train['user_id'].unique()))
users_mapping = {v: k for k, v in users_inv_mapping.items()}

In [22]:
items_inv_mapping = dict(enumerate(train['item_id'].unique()))
items_mapping = {v: k for k, v in items_inv_mapping.items()}

In [23]:
print(f"users_mapping amount: {len(users_mapping)}")
print(f"items_mapping amount: {len(items_mapping)}")

users_mapping amount: 842129
items_mapping amount: 15404


### 2.2 Get sparse matrix 

In [24]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=None, 
                   items_mapping=None):
    if weight_col:
        weights = df[weight_col].astype(np.float32)
    else:
        weights = np.ones(len(df), dtype=np.float32)

    interaction_matrix = sp.sparse.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

In [25]:
interaction_matrix = get_coo_matrix(train, weight_col='weight',
                                    users_mapping=users_mapping, 
                                    items_mapping=items_mapping)

In [26]:
interaction_matrix

<842129x15404 sparse matrix of type '<class 'numpy.float32'>'
	with 4587708 stored elements in COOrdinate format>

## 2. Fit simple ItemKNN model 

### `userknn disclaimer`:

implicit ItemItemRecommender requires `item-user matrix` (not `user-item` !) => ususally you call fit with transposed weights matrix `model.fit(matrix.T)`. That's how you get item's nearest neighbours. 

but for `the userknn model` we want to have **user** neighbours (not item neighbours). that's why we need to call fit **without transpose**: `model.fit(matrix)`

https://github.com/benfred/implicit/blob/main/implicit/nearest_neighbours.py

In [27]:
userknn = CosineRecommender(K=35)
userknn.fit(interaction_matrix)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=842129.0), HTML(value='')))




In [29]:
# save model
import dill

with open('userknn.dill', 'wb') as f:
    dill.dump(userknn, f)

In [30]:
with open('userknn.dill', 'rb') as f:
    userknn = dill.load(f)

In [28]:
userknn.similar_items(1)

[(1, 1.0000000098018893),
 (773911, 0.36543191174204726),
 (115615, 0.3610291122122819),
 (422766, 0.3590583883643988),
 (46469, 0.35822138476497434),
 (338851, 0.3544897034750306),
 (733896, 0.35448970121073087),
 (575, 0.35448970086392395),
 (38208, 0.35448970086392395),
 (289064, 0.35448970086392395)]