In [140]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import  MAP, calc_metrics
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from implicit.lmf import LogisticMatrixFactorization
import joblib

# Load Data

In [5]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)
!unzip kion_train.zip

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [80]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [81]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        int64  
 1   item_id        int64  
 2   last_watch_dt  object 
 3   total_dur      int64  
 4   watched_pct    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 208.9+ MB


In [82]:
interactions.rename(columns={'user_id': Columns.User,
                             'item_id': Columns.Item,
                             'last_watch_dt': Columns.Datetime}, inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [83]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   total_dur    int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


# Transform data

## Users

In [84]:
users[:3]

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [85]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   840197 non-null  int64 
 1   age       826102 non-null  object
 2   income    825421 non-null  object
 3   sex       826366 non-null  object
 4   kids_flg  840197 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 32.1+ MB


In [86]:
# replace sex to 0 or 1
users['sex'] = users['sex'].map({'Ж': 1, 'М': 0})

In [87]:
age_category = pd.CategoricalDtype(categories=['age_18_24',
                                               'age_25_34',
                                               'age_35_44',
                                               'age_45_54',
                                               'age_55_64',
                                               'age_65_inf'], ordered=True)
users['age'] = users['age'].astype(age_category)

In [88]:
income_category = pd.CategoricalDtype(categories=['income_0_20',
                                                  'income_20_40',
                                                  'income_40_60',
                                                  'income_60_90',
                                                  'income_90_150',
                                                  'income_150_inf'], ordered=True)
users['income'] = users['income'].astype(income_category)

## Items

In [89]:
items[:3]

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [90]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   item_id       15963 non-null  int64  
 1   content_type  15963 non-null  object 
 2   title         15963 non-null  object 
 3   title_orig    11218 non-null  object 
 4   release_year  15865 non-null  float64
 5   genres        15963 non-null  object 
 6   countries     15926 non-null  object 
 7   for_kids      566 non-null    float64
 8   age_rating    15961 non-null  float64
 9   studios       1065 non-null   object 
 10  directors     14454 non-null  object 
 11  actors        13344 non-null  object 
 12  description   15961 non-null  object 
 13  keywords      15540 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.7+ MB


In [91]:
YEAR_FROM = 1990
STEP_SIZE = 5
bins = [year for year in range(YEAR_FROM, int(
    items['release_year'].max()) + STEP_SIZE, STEP_SIZE)]
bins = [int(items['release_year'].min())] + bins
items['year_bin'] = pd.cut(items['release_year'],
                           bins=bins,
                           include_lowest=True)

# Train/test split

In [92]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [93]:
max_date = interactions[Columns.Datetime].max()

In [94]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

In [96]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [98]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [99]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Baseline models

In [106]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 2
N_FACTORS = (4, 16, 32)

In [107]:
dataset = Dataset.construct(interactions_df=train)

In [108]:
models = {}

In [109]:
implicit_models = {'ALS': AlternatingLeastSquares}

for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        models[f"{implicit_name}_{n_factors}"] = ImplicitALSWrapperModel(
            model=implicit_model(
                factors=n_factors,
                random_state=RANDOM_STATE,
                num_threads=NUM_THREADS))

In [111]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(no_components=n_factors,
                    loss=loss,
                    random_state=RANDOM_STATE),
            epochs=10,
            num_threads=NUM_THREADS)

In [115]:
metrics_name = {'MAP': MAP}

metrics = {}
for metric_name, metric in metrics_name.items():
    metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

In [116]:
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

In [117]:
results = []
for model_name, model in tqdm(models.items()):
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

  0%|          | 0/12 [00:00<?, ?it/s]

In [122]:
df_quality = pd.DataFrame(results).T
df_quality.columns = df_quality.iloc[0]
df_quality.drop('model', inplace=True)

In [125]:
df_quality.T.style.highlight_max(color='lightgreen', axis=0)

Unnamed: 0_level_0,MAP@10
model,Unnamed: 1_level_1
ALS_4,0.05536
ALS_16,0.031337
ALS_32,0.02804
LightFM_logistic_4,0.074705
LightFM_logistic_16,0.074717
LightFM_logistic_32,0.074803
LightFM_bpr_4,0.036435
LightFM_bpr_16,0.027962
LightFM_bpr_32,0.022902
LightFM_warp_4,0.077356


# Training best model

In [127]:
model = models['LightFM_warp_16']

In [128]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f90ce60c6a0>

In [142]:
# предиктим на тест
%time    
recos = list(model.recommend(
        users=[10010],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True)['item_id'])

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 35.3 µs


In [143]:
recos

[9728, 13865, 15297, 10772, 657, 12356, 7829, 4457, 6455, 13723]

In [141]:
joblib.dump(model, '/Users/dmitry/Library/CloudStorage/GoogleDrive-ceo@gangai.pro/Мой диск/Проекты/recsys/models/LightFM_warp_16.joblib')

['/Users/dmitry/Library/CloudStorage/GoogleDrive-ceo@gangai.pro/Мой диск/Проекты/recsys/models/LightFM_warp_16.joblib']