In [1]:
import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import  MAP, calc_metrics
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from implicit.lmf import LogisticMatrixFactorization
import joblib

# Load Data

In [None]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)
!unzip kion_train.zip

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

Archive:  kion_train.zip
replace kion_train/interactions.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [2]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [3]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        int64  
 1   item_id        int64  
 2   last_watch_dt  object 
 3   total_dur      int64  
 4   watched_pct    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 208.9+ MB


In [4]:
interactions.rename(columns={'user_id': Columns.User,
                             'item_id': Columns.Item,
                             'last_watch_dt': Columns.Datetime}, inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [5]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   total_dur    int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


# Transform data

## Users

In [6]:
users[:3]

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [7]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   840197 non-null  int64 
 1   age       826102 non-null  object
 2   income    825421 non-null  object
 3   sex       826366 non-null  object
 4   kids_flg  840197 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 32.1+ MB


In [8]:
# replace sex to 0 or 1
users['sex'] = users['sex'].map({'Ж': 1, 'М': 0})

In [9]:
age_category = pd.CategoricalDtype(categories=['age_18_24',
                                               'age_25_34',
                                               'age_35_44',
                                               'age_45_54',
                                               'age_55_64',
                                               'age_65_inf'], ordered=True)
users['age'] = users['age'].astype(age_category)

In [10]:
income_category = pd.CategoricalDtype(categories=['income_0_20',
                                                  'income_20_40',
                                                  'income_40_60',
                                                  'income_60_90',
                                                  'income_90_150',
                                                  'income_150_inf'], ordered=True)
users['income'] = users['income'].astype(income_category)

## Items

In [11]:
items[:3]

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [12]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   item_id       15963 non-null  int64  
 1   content_type  15963 non-null  object 
 2   title         15963 non-null  object 
 3   title_orig    11218 non-null  object 
 4   release_year  15865 non-null  float64
 5   genres        15963 non-null  object 
 6   countries     15926 non-null  object 
 7   for_kids      566 non-null    float64
 8   age_rating    15961 non-null  float64
 9   studios       1065 non-null   object 
 10  directors     14454 non-null  object 
 11  actors        13344 non-null  object 
 12  description   15961 non-null  object 
 13  keywords      15540 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.7+ MB


In [13]:
YEAR_FROM = 1990
STEP_SIZE = 5
bins = [year for year in range(YEAR_FROM, int(
    items['release_year'].max()) + STEP_SIZE, STEP_SIZE)]
bins = [int(items['release_year'].min())] + bins
items['year_bin'] = pd.cut(items['release_year'],
                           bins=bins,
                           include_lowest=True)

# Train/test split

In [14]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [15]:
max_date = interactions[Columns.Datetime].max()

In [16]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

In [17]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [18]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [19]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Baseline models

In [31]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 1
N_FACTORS = (4, 16, 32)

In [32]:
dataset = Dataset.construct(interactions_df=train)

In [33]:
models = {}

In [34]:
implicit_models = {'ALS': AlternatingLeastSquares}

for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        models[f"{implicit_name}_{n_factors}"] = ImplicitALSWrapperModel(
            model=implicit_model(
                factors=n_factors,
                random_state=RANDOM_STATE,
                num_threads=NUM_THREADS))

In [35]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(no_components=n_factors,
                    loss=loss,
                    random_state=RANDOM_STATE),
            epochs=10,
            num_threads=NUM_THREADS)

In [36]:
metrics_name = {'MAP': MAP}

metrics = {}
for metric_name, metric in metrics_name.items():
    metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

In [37]:
metrics

{'MAP@10': MAP(k=10, divide_by_k=False)}

In [117]:
results = []
for model_name, model in tqdm(models.items()):
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

  0%|          | 0/12 [00:00<?, ?it/s]

In [146]:
df_quality = pd.DataFrame(results).set_index('model')

In [148]:
df_quality.style.highlight_max(color='lightgreen', axis=0)

Unnamed: 0_level_0,MAP@10
model,Unnamed: 1_level_1
ALS_4,0.05536
ALS_16,0.031337
ALS_32,0.02804
LightFM_logistic_4,0.074705
LightFM_logistic_16,0.074717
LightFM_logistic_32,0.074803
LightFM_bpr_4,0.036435
LightFM_bpr_16,0.027962
LightFM_bpr_32,0.022902
LightFM_warp_4,0.077356


# Training best model

In [40]:
model = models['LightFM_warp_16']

In [128]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f90ce60c6a0>

In [142]:
# предиктим на тест
%time    
recos = list(model.recommend(
        users=[10010],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True)['item_id'])

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 35.3 µs


In [143]:
recos

[9728, 13865, 15297, 10772, 657, 12356, 7829, 4457, 6455, 13723]

In [141]:
joblib.dump(model, '/Users/dmitry/Library/CloudStorage/GoogleDrive-ceo@gangai.pro/Мой диск/Проекты/recsys/models/LightFM_warp_16.joblib')

['/Users/dmitry/Library/CloudStorage/GoogleDrive-ceo@gangai.pro/Мой диск/Проекты/recsys/models/LightFM_warp_16.joblib']

# Prepearing features

## User features

In [202]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [203]:
users['age'].fillna(users['age'].mode()[0], inplace=True)
users['income'].fillna(users['income'].mode()[0], inplace=True)
users['sex'].fillna(users['age'].mode()[0], inplace=True)

In [204]:
users.isnull().sum()

user_id     0
age         0
income      0
sex         0
kids_flg    0
dtype: int64

In [205]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [206]:
user_features_frames = []
for feature in ["sex", "age", "income", 'kids_flg']:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,0.0,sex
1,962099,0.0,sex
3,721985,1.0,sex
4,704055,1.0,sex
5,1037719,0.0,sex


In [208]:
user_features[user_features['id'] == 973171]

Unnamed: 0,id,value,feature
0,973171,0.0,sex
0,973171,age_25_34,age
0,973171,income_60_90,income
0,973171,1,kids_flg


## Item features

In [209]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [210]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,year_bin
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ...","(2000.0, 2005.0]"
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео...","(2010.0, 2015.0]"
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг...","(2010.0, 2015.0]"
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю...","(2010.0, 2015.0]"
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж...","(1896.999, 1990.0]"


### Genre

In [211]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [221]:
def make_features(data: pd.DataFrame, user_id_col: str, feature_col=str) -> pd.DataFrame:
    feature = data.reindex(columns=[user_id_col, feature_col])
    feature.columns = ["id", "value"]
    feature["feature"] = feature_col
    return feature

In [222]:
content_feature = make_features(items, Columns.Item, 'content_type')
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


### Title

In [223]:
title_feature = make_features(items, Columns.Item, 'title')
title_feature.head()

Unnamed: 0,id,value,feature
0,10711,Поговори с ней,title
1,2508,Голые перцы,title
2,10716,Тактическая сила,title
3,7868,45 лет,title
4,16268,Все решает мгновение,title


### Countries

In [215]:
items['countries'].fillna('unknown', inplace=True)

In [224]:
countries_feature = make_features(items, Columns.Item, 'countries')
countries_feature.head()

Unnamed: 0,id,value,feature
0,10711,Испания,countries
1,2508,США,countries
2,10716,Канада,countries
3,7868,Великобритания,countries
4,16268,СССР,countries


### Studios

In [217]:
items['studios'].fillna('unknown', inplace=True)

In [227]:
studios_feature = make_features(items, Columns.Item, 'studios')
studios_feature.head()

Unnamed: 0,id,value,feature
0,10711,unknown,studios
1,2508,unknown,studios
2,10716,unknown,studios
3,7868,unknown,studios
4,16268,Ленфильм,studios


In [228]:
item_features = pd.concat((genre_feature, 
                           content_feature, 
                           title_feature, 
                           countries_feature, 
                           studios_feature))

In [229]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,unknown,studios
15959,2367,unknown,studios
15960,10632,unknown,studios
15961,4538,unknown,studios


# Check models with features

In [27]:
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [28]:
models = {}

In [29]:
implicit_models = {'ALS': AlternatingLeastSquares}

for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            models[f"{implicit_name}_{n_factors}_{is_fitting_features}"] = (
                ImplicitALSWrapperModel(model=implicit_model(factors=n_factors,
                                                             random_state=RANDOM_STATE,
                                                             num_threads=NUM_THREADS),
                                        fit_features_together=is_fitting_features))



In [234]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(no_components=n_factors,
                    loss=loss,
                    random_state=RANDOM_STATE,
                    learning_rate=LEARNING_RATE,
                    user_alpha=USER_ALPHA,
                    item_alpha=ITEM_ALPHA),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS)

In [235]:
models

{'ALS_4_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b46310>,
 'ALS_16_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b461c0>,
 'ALS_32_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b46340>,
 'ALS_4_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b46190>,
 'ALS_16_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b46370>,
 'ALS_32_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f8f77b46a60>,
 'LightFM_logistic_4': <rectools.models.lightfm.LightFMWrapperModel at 0x7f8f64dbbf70>,
 'LightFM_logistic_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f8f72434f70>,
 'LightFM_logistic_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f8f72434160>,
 'LightFM_bpr_4': <rectools.models.lightfm.LightFMWrapperModel at 0x7f8f72434dc0>,
 'LightFM_bpr_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f8f72434b80>,
 'LightFM_bpr_32': <rectoo

In [238]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=user_features['feature'].unique(),
    item_features_df=item_features,
    cat_item_features=item_features['feature'].unique(),
)

In [239]:
TEST_USERS = test[Columns.User].unique()

In [None]:
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model ALS_4_True...


In [None]:
model