In [1]:
!pip install rectools==0.2.0 >> None

In [2]:
import gc

import pandas as pd
import numpy as np
import requests
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")

from rectools import Columns
from rectools.dataset import Dataset
from rectools.metrics import  MAP, calc_metrics
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.model_selection import TimeRangeSplit
import pyarrow.feather as feather
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from implicit.lmf import LogisticMatrixFactorization
import joblib
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load Data

In [3]:
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)
!unzip kion_train.zip

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [4]:
interactions = pd.read_csv('kion_train/interactions.csv')
users = pd.read_csv('kion_train/users.csv')
items = pd.read_csv('kion_train/items.csv')

In [5]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Dtype  
---  ------         -----  
 0   user_id        int64  
 1   item_id        int64  
 2   last_watch_dt  object 
 3   total_dur      int64  
 4   watched_pct    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 208.9+ MB


In [6]:
interactions.rename(columns={'user_id': Columns.User,
                             'item_id': Columns.Item,
                             'last_watch_dt': Columns.Datetime}, inplace=True)
interactions['datetime'] = pd.to_datetime(interactions['datetime'])

In [7]:
interactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   total_dur    int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


# Transform data

## Users

In [8]:
users[:3]

Unnamed: 0,user_id,age,income,sex,kids_flg
0,973171,age_25_34,income_60_90,М,1
1,962099,age_18_24,income_20_40,М,0
2,1047345,age_45_54,income_40_60,Ж,0


In [9]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840197 entries, 0 to 840196
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   user_id   840197 non-null  int64 
 1   age       826102 non-null  object
 2   income    825421 non-null  object
 3   sex       826366 non-null  object
 4   kids_flg  840197 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 32.1+ MB


In [10]:
# replace sex to 0 or 1
users['sex'] = users['sex'].map({'Ж': 1, 'М': 0})

In [11]:
age_category = pd.CategoricalDtype(categories=['age_18_24',
                                               'age_25_34',
                                               'age_35_44',
                                               'age_45_54',
                                               'age_55_64',
                                               'age_65_inf'], ordered=True)
users['age'] = users['age'].astype(age_category)

In [12]:
income_category = pd.CategoricalDtype(categories=['income_0_20',
                                                  'income_20_40',
                                                  'income_40_60',
                                                  'income_60_90',
                                                  'income_90_150',
                                                  'income_150_inf'], ordered=True)
users['income'] = users['income'].astype(income_category)

## Items

In [13]:
items[:3]

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ..."
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео..."
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг..."


In [14]:
items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15963 entries, 0 to 15962
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   item_id       15963 non-null  int64  
 1   content_type  15963 non-null  object 
 2   title         15963 non-null  object 
 3   title_orig    11218 non-null  object 
 4   release_year  15865 non-null  float64
 5   genres        15963 non-null  object 
 6   countries     15926 non-null  object 
 7   for_kids      566 non-null    float64
 8   age_rating    15961 non-null  float64
 9   studios       1065 non-null   object 
 10  directors     14454 non-null  object 
 11  actors        13344 non-null  object 
 12  description   15961 non-null  object 
 13  keywords      15540 non-null  object 
dtypes: float64(3), int64(1), object(10)
memory usage: 1.7+ MB


In [15]:
YEAR_FROM = 1990
STEP_SIZE = 5
bins = [year for year in range(YEAR_FROM, int(
    items['release_year'].max()) + STEP_SIZE, STEP_SIZE)]
bins = [int(items['release_year'].min())] + bins
items['year_bin'] = pd.cut(items['release_year'],
                           bins=bins,
                           include_lowest=True)

# Train/test split

In [16]:
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

In [17]:
max_date = interactions[Columns.Datetime].max()

In [18]:
train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=7)].copy()
test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=7)].copy()

In [19]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [20]:
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [21]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

# Baseline models

In [22]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 4
N_FACTORS = (4, 16, 32)

In [None]:
dataset = Dataset.construct(interactions_df=train)

In [None]:
models = {}

In [None]:
implicit_models = {'ALS': AlternatingLeastSquares}

for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        models[f"{implicit_name}_{n_factors}"] = ImplicitALSWrapperModel(
            model=implicit_model(
                factors=n_factors,
                random_state=RANDOM_STATE,
                num_threads=NUM_THREADS))



In [None]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(no_components=n_factors,
                    loss=loss,
                    random_state=RANDOM_STATE),
            epochs=10,
            num_threads=NUM_THREADS)

In [23]:
metrics_name = {'MAP': MAP}

metrics = {}
for metric_name, metric in metrics_name.items():
    metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

In [None]:
results = []
for model_name, model in tqdm(models.items()):
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=test[Columns.User].unique(),
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

In [None]:
df_quality = pd.DataFrame(results).set_index('model')

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=0)

Unnamed: 0_level_0,MAP@10
model,Unnamed: 1_level_1
ALS_4,0.05536
ALS_16,0.031337
ALS_32,0.02804
LightFM_logistic_4,0.074705
LightFM_logistic_16,0.074717
LightFM_logistic_32,0.074803
LightFM_bpr_4,0.036435
LightFM_bpr_16,0.027962
LightFM_bpr_32,0.022902
LightFM_warp_4,0.077356


# Training best model

In [None]:
model = models['LightFM_warp_16']

In [None]:
dataset = Dataset.construct(interactions_df=interactions)

In [None]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f2128cffdc0>

In [None]:
# предиктим на тест
%time    
recos = list(model.recommend(
        users=[176549],
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True)['item_id'])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.48 µs


In [None]:
recos

[9728, 15297, 4151, 7571, 4880, 6809, 11237, 8636, 13018, 142]

In [None]:
joblib.dump(model, '/content/drive/MyDrive/Проекты/recsys/models/LightFM_warp_16.joblib')

['/content/drive/MyDrive/Проекты/recsys/models/LightFM_warp_16.joblib']

# Predict for all users

In [32]:
model = joblib.load('/content/drive/MyDrive/Проекты/recsys/models/LightFM_warp_16.joblib')

In [33]:
dataset = Dataset.construct(interactions_df=interactions)

In [36]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fe32f183f70>

In [39]:
recos = model.recommend(users=interactions['user_id'].unique(), 
                        dataset=dataset, 
                        k=K_RECOS, 
                        filter_viewed=True)

In [40]:
feather.write_feather(
    recos, '/content/drive/MyDrive/Проекты/recsys/data/processed_data/recos_lightfm.feather')

In [41]:
recos

Unnamed: 0,user_id,item_id,score,rank
0,176549,7571,1.622674,1
1,176549,10440,1.592388,2
2,176549,13865,1.563450,3
3,176549,3734,1.501265,4
4,176549,15266,1.476143,5
...,...,...,...,...
9621785,697262,2657,3.700612,6
9621786,697262,4880,3.602230,7
9621787,697262,142,3.546226,8
9621788,697262,6809,3.492388,9


# Prepearing features

## User features

In [24]:
users.isnull().sum()

user_id         0
age         14095
income      14776
sex         13831
kids_flg        0
dtype: int64

In [25]:
users['age'].fillna(users['age'].mode()[0], inplace=True)
users['income'].fillna(users['income'].mode()[0], inplace=True)
users['sex'].fillna(users['age'].mode()[0], inplace=True)

In [26]:
users.isnull().sum()

user_id     0
age         0
income      0
sex         0
kids_flg    0
dtype: int64

In [27]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [28]:
user_features_frames = []
for feature in ["sex", "age", "income", 'kids_flg']:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,0.0,sex
1,962099,0.0,sex
3,721985,1.0,sex
4,704055,1.0,sex
5,1037719,0.0,sex


In [29]:
user_features[user_features['id'] == 973171]

Unnamed: 0,id,value,feature
0,973171,0.0,sex
0,973171,age_25_34,age
0,973171,income_60_90,income
0,973171,1,kids_flg


## Item features

In [30]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [31]:
items.head()

Unnamed: 0,item_id,content_type,title,title_orig,release_year,genres,countries,for_kids,age_rating,studios,directors,actors,description,keywords,year_bin
0,10711,film,Поговори с ней,Hable con ella,2002.0,"драмы, зарубежные, детективы, мелодрамы",Испания,,16.0,,Педро Альмодовар,"Адольфо Фернандес, Ана Фернандес, Дарио Гранди...",Мелодрама легендарного Педро Альмодовара «Пого...,"Поговори, ней, 2002, Испания, друзья, любовь, ...","(2000.0, 2005.0]"
1,2508,film,Голые перцы,Search Party,2014.0,"зарубежные, приключения, комедии",США,,16.0,,Скот Армстронг,"Адам Палли, Брайан Хаски, Дж.Б. Смув, Джейсон ...",Уморительная современная комедия на популярную...,"Голые, перцы, 2014, США, друзья, свадьбы, прео...","(2010.0, 2015.0]"
2,10716,film,Тактическая сила,Tactical Force,2011.0,"криминал, зарубежные, триллеры, боевики, комедии",Канада,,16.0,,Адам П. Калтраро,"Адриан Холмс, Даррен Шалави, Джерри Вассерман,...",Профессиональный рестлер Стив Остин («Все или ...,"Тактическая, сила, 2011, Канада, бандиты, ганг...","(2010.0, 2015.0]"
3,7868,film,45 лет,45 Years,2015.0,"драмы, зарубежные, мелодрамы",Великобритания,,16.0,,Эндрю Хэй,"Александра Риддлстон-Барретт, Джеральдин Джейм...","Шарлотта Рэмплинг, Том Кортни, Джеральдин Джей...","45, лет, 2015, Великобритания, брак, жизнь, лю...","(2010.0, 2015.0]"
4,16268,film,Все решает мгновение,,1978.0,"драмы, спорт, советские, мелодрамы",СССР,,12.0,Ленфильм,Виктор Садовский,"Александр Абдулов, Александр Демьяненко, Алекс...",Расчетливая чаровница из советского кинохита «...,"Все, решает, мгновение, 1978, СССР, сильные, ж...","(1896.999, 1990.0]"


### Genre

In [32]:
items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"
genre_feature.head()

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre


### Content

In [33]:
def make_features(data: pd.DataFrame, user_id_col: str, feature_col: str) -> pd.DataFrame:
    feature = data.reindex(columns=[user_id_col, feature_col])
    feature.columns = ["id", "value"]
    feature["feature"] = feature_col
    return feature

In [34]:
content_feature = make_features(items, Columns.Item, 'content_type')
content_feature.head()

Unnamed: 0,id,value,feature
0,10711,film,content_type
1,2508,film,content_type
2,10716,film,content_type
3,7868,film,content_type
4,16268,film,content_type


### Title

In [35]:
title_feature = make_features(items, Columns.Item, 'title')
title_feature.head()

Unnamed: 0,id,value,feature
0,10711,Поговори с ней,title
1,2508,Голые перцы,title
2,10716,Тактическая сила,title
3,7868,45 лет,title
4,16268,Все решает мгновение,title


### Countries

In [36]:
items['countries'].fillna('unknown', inplace=True)

In [37]:
countries_feature = make_features(items, Columns.Item, 'countries')
countries_feature.head()

Unnamed: 0,id,value,feature
0,10711,Испания,countries
1,2508,США,countries
2,10716,Канада,countries
3,7868,Великобритания,countries
4,16268,СССР,countries


### Studios

In [38]:
items['studios'].fillna('unknown', inplace=True)

In [39]:
studios_feature = make_features(items, Columns.Item, 'studios')
studios_feature.head()

Unnamed: 0,id,value,feature
0,10711,unknown,studios
1,2508,unknown,studios
2,10716,unknown,studios
3,7868,unknown,studios
4,16268,Ленфильм,studios


In [40]:
item_features = pd.concat((genre_feature, 
                           content_feature,
                          #  title_feature,
                           countries_feature, 
                           studios_feature))

In [41]:
item_features

Unnamed: 0,id,value,feature
0,10711,драмы,genre
0,10711,зарубежные,genre
0,10711,детективы,genre
0,10711,мелодрамы,genre
1,2508,зарубежные,genre
...,...,...,...
15958,6443,unknown,studios
15959,2367,unknown,studios
15960,10632,unknown,studios
15961,4538,unknown,studios


# Check models with features

In [42]:
N_EPOCHS = 10 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [None]:
models = {}

In [None]:
implicit_models = {'ALS': AlternatingLeastSquares}

for implicit_name, implicit_model in implicit_models.items():
    for is_fitting_features in (True, False):
        for n_factors in N_FACTORS:
            models[f"{implicit_name}_{n_factors}_{is_fitting_features}"] = (
                ImplicitALSWrapperModel(model=implicit_model(factors=n_factors,
                                                             random_state=RANDOM_STATE,
                                                             num_threads=NUM_THREADS),
                                        fit_features_together=is_fitting_features))



In [None]:
lightfm_losses = ('logistic', 'bpr', 'warp')

for loss in lightfm_losses:
    for n_factors in N_FACTORS:
        models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
            LightFM(no_components=n_factors,
                    loss=loss,
                    random_state=RANDOM_STATE,
                    learning_rate=LEARNING_RATE,
                    user_alpha=USER_ALPHA,
                    item_alpha=ITEM_ALPHA),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS)

In [None]:
models

{'ALS_4_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f911c4ee0>,
 'ALS_16_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f8ee35d60>,
 'ALS_32_True': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f8ee35a00>,
 'ALS_4_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f90bf2d30>,
 'ALS_16_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f90bf2ee0>,
 'ALS_32_False': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f4f8ee35e50>,
 'LightFM_logistic_4': <rectools.models.lightfm.LightFMWrapperModel at 0x7f4f8ee480d0>,
 'LightFM_logistic_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f4f8ee481c0>,
 'LightFM_logistic_32': <rectools.models.lightfm.LightFMWrapperModel at 0x7f4f8ee482b0>,
 'LightFM_bpr_4': <rectools.models.lightfm.LightFMWrapperModel at 0x7f4f8ee483a0>,
 'LightFM_bpr_16': <rectools.models.lightfm.LightFMWrapperModel at 0x7f4f8ee48490>,
 'LightFM_bpr_32': <rectoo

In [None]:
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=user_features['feature'].unique(),
    item_features_df=item_features,
    cat_item_features=item_features['feature'].unique(),
)

In [None]:
TEST_USERS = test[Columns.User].unique()

In [None]:
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)
    gc.collect()

Fitting model ALS_4_True...
Fitting model ALS_16_True...
Fitting model ALS_32_True...
Fitting model ALS_4_False...
Fitting model ALS_16_False...
Fitting model ALS_32_False...
Fitting model LightFM_logistic_4...
Fitting model LightFM_logistic_16...
Fitting model LightFM_logistic_32...
Fitting model LightFM_bpr_4...
Fitting model LightFM_bpr_16...
Fitting model LightFM_bpr_32...
Fitting model LightFM_warp_4...
Fitting model LightFM_warp_16...
Fitting model LightFM_warp_32...


In [None]:
df_quality = pd.DataFrame(results).set_index('model')

In [None]:
df_quality.style.highlight_max(color='lightgreen', axis=0)

Unnamed: 0_level_0,MAP@10
model,Unnamed: 1_level_1
ALS_4_True,0.074784
ALS_16_True,0.074784
ALS_32_True,0.074784
ALS_4_False,0.063915
ALS_16_False,0.063915
ALS_32_False,0.063915
LightFM_logistic_4,0.000271
LightFM_logistic_16,0.000241
LightFM_logistic_32,0.000243
LightFM_bpr_4,0.00504


# Tuning best model

In [None]:
!pip install optuna >> None

In [None]:
import optuna

In [None]:
n_folds = 3
unit = "W"
n_units = 1

In [None]:
def objective(trial, 
              interactions, 
              user_features, 
              item_features, 
              test, 
              users: pd.DataFrame, 
              items: pd.DataFrame):

    last_date = interactions[Columns.Datetime].max().normalize()
    start_date = last_date - pd.Timedelta(n_folds * n_units + 1, unit=unit)
    periods = n_folds + 1
    freq = f"{n_units}{unit}"

    date_range = pd.date_range(start=start_date, 
                               periods=periods, 
                               freq=freq, 
                               tz=last_date.tz)
    
    cv = TimeRangeSplit(date_range=date_range)

    print('making model')

    model = LightFMWrapperModel(LightFM(no_components=trial.suggest_categorical('no_components', [8, 16, 32]),
                                        loss=trial.suggest_categorical('loss', ['warp']),
                                        random_state=RANDOM_STATE,
                                        learning_rate=trial.suggest_float(
                                            'learning_rate', 0.01, 0.3, log=True),
                                        user_alpha=trial.suggest_float(
                                            'user_alpha', 0.001, 0.3, log=True),
                                        item_alpha=trial.suggest_float(
                                            'item_alpha', 0.001, 0.3, log=True),
                                        # learning_schedule = trial.suggest_categorical(
                                        #     'learning_schedule', ['adagrad', 'adadelta']),
                                        max_sampled = trial.suggest_int('max_sampled', 2, 5)),
                                epochs=1,
                                num_threads=NUM_THREADS,
                                verbose=1)
    
    fold_iterator = cv.split(interactions, collect_fold_stats=True)
    results = []

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")
        print(fold_info)
        df_train = interactions.iloc[train_ids]
        df_test = interactions.iloc[test_ids][Columns.UserItem]
        catalog = df_train[Columns.Item].unique()

        users = users.loc[users[Columns.User].isin(df_train[Columns.User])].copy()  
        user_features_frames = []
        for feature in ["sex", "age", "income", 'kids_flg']:
            feature_frame = users.reindex(columns=[Columns.User, feature])
            feature_frame.columns = ["id", "value"]
            feature_frame["feature"] = feature
            user_features_frames.append(feature_frame)
        user_features = pd.concat(user_features_frames)

        items = items.loc[items[Columns.Item].isin(df_train[Columns.Item])].copy()    
        items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
        genre_feature = items[["item_id", "genre"]].explode("genre")
        genre_feature.columns = ["id", "value"]
        genre_feature["feature"] = "genre"
        content_feature = make_features(items, Columns.Item, 'content_type')
        countries_feature = make_features(items, Columns.Item, 'countries')
        studios_feature = make_features(items, Columns.Item, 'studios')
        item_features = pd.concat((genre_feature, 
                           content_feature,
                           countries_feature, 
                           studios_feature))



        dataset = Dataset.construct(
            interactions_df=df_train,
            user_features_df=user_features,
            cat_user_features=user_features['feature'].unique(),
            item_features_df=item_features,
            cat_item_features=item_features['feature'].unique(),
          )
        TEST_USERS = df_test[Columns.User].unique()

        print('fit the model')
        model.fit(dataset)

        print('prepering recomendation')
        recos = model.recommend(users=TEST_USERS,
                                dataset=dataset,
                                k=K_RECOS,
                                filter_viewed=True)
        
        metrics_name = {'MAP': MAP}

        metrics = {}

        for metric_name, metric in metrics_name.items():
            metrics[f'{metric_name}@{K_RECOS}'] = metric(k=K_RECOS)

        print('calculate metrics')
        metric_values = calc_metrics(metrics, recos, test, train)
        results.append(metric_values['MAP@10'])

    return np.mean(results)

In [None]:
study = optuna.create_study(direction='maximize', study_name='Light_FM')
func = lambda trial: objective(trial, interactions, user_features, item_features, test, users, items)
study.optimize(func, n_trials=10, show_progress_bar=True)

[32m[I 2022-12-10 11:57:49,703][0m A new study created in memory with name: Light_FM[0m


  0%|          | 0/10 [00:00<?, ?it/s]

making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.09s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.32s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:02<00:00,  2.52s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:01:32,887][0m Trial 0 finished with value: 0.009231850760465295 and parameters: {'no_components': 8, 'loss': 'warp', 'learning_rate': 0.03004480870205248, 'user_alpha': 0.001184002280380923, 'item_alpha': 0.008257908225558115, 'max_sampled': 2}. Best is trial 0 with value: 0.009231850760465295.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:17<00:00, 17.27s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:17<00:00, 17.38s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:18<00:00, 18.85s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:06:21,999][0m Trial 1 finished with value: 0.0017647665547755296 and parameters: {'no_components': 32, 'loss': 'warp', 'learning_rate': 0.13550951665304628, 'user_alpha': 0.0015220834266536443, 'item_alpha': 0.2071067776848237, 'max_sampled': 5}. Best is trial 0 with value: 0.009231850760465295.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.24s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.53s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.80s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:10:08,863][0m Trial 2 finished with value: 0.009134678313023092 and parameters: {'no_components': 8, 'loss': 'warp', 'learning_rate': 0.04730875172092568, 'user_alpha': 0.013975598006592191, 'item_alpha': 0.18945143664886638, 'max_sampled': 4}. Best is trial 0 with value: 0.009231850760465295.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:13<00:00, 13.78s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:14<00:00, 14.11s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:15<00:00, 15.66s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:14:38,332][0m Trial 3 finished with value: 1.961764284788742e-05 and parameters: {'no_components': 32, 'loss': 'warp', 'learning_rate': 0.05637676677243153, 'user_alpha': 0.0015005979518155097, 'item_alpha': 0.03458885719980059, 'max_sampled': 2}. Best is trial 0 with value: 0.009231850760465295.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:07<00:00,  7.53s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:07<00:00,  7.59s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:07<00:00,  7.90s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:18:39,833][0m Trial 4 finished with value: 0.008858336378480143 and parameters: {'no_components': 16, 'loss': 'warp', 'learning_rate': 0.05913339542391164, 'user_alpha': 0.0032083961427253616, 'item_alpha': 0.008522022166431947, 'max_sampled': 3}. Best is trial 0 with value: 0.009231850760465295.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:15<00:00, 15.78s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:17<00:00, 17.14s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:18<00:00, 18.42s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:23:41,270][0m Trial 5 finished with value: 0.015911553956382173 and parameters: {'no_components': 32, 'loss': 'warp', 'learning_rate': 0.2181636329286954, 'user_alpha': 0.07403424459068454, 'item_alpha': 0.02791630509662843, 'max_sampled': 4}. Best is trial 5 with value: 0.015911553956382173.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.79s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.99s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.80s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:27:33,657][0m Trial 6 finished with value: 0.007823630951283923 and parameters: {'no_components': 8, 'loss': 'warp', 'learning_rate': 0.2419752894667719, 'user_alpha': 0.005288012553597096, 'item_alpha': 0.0017814579817029526, 'max_sampled': 2}. Best is trial 5 with value: 0.015911553956382173.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.28s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:03<00:00,  3.68s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.39s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:31:29,485][0m Trial 7 finished with value: 0.01633402315631378 and parameters: {'no_components': 8, 'loss': 'warp', 'learning_rate': 0.1930957035463117, 'user_alpha': 0.02976836811657212, 'item_alpha': 0.01658933260303973, 'max_sampled': 3}. Best is trial 7 with value: 0.01633402315631378.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:10<00:00, 10.60s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:11<00:00, 11.12s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:13<00:00, 13.06s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:36:13,953][0m Trial 8 finished with value: 0.006147796205400757 and parameters: {'no_components': 32, 'loss': 'warp', 'learning_rate': 0.29144503551461715, 'user_alpha': 0.0042060687438637495, 'item_alpha': 0.0012632766474229545, 'max_sampled': 2}. Best is trial 7 with value: 0.01633402315631378.[0m
making model

{'Start date': Timestamp('2021-07-25 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'Train': 3838180, 'Train users': 734701, 'Train items': 15061, 'Test': 249396, 'Test users': 93092, 'Test items': 6611}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.13s/it]


prepering recomendation


  0%|          | 0/93092 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-01 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'Train': 4203885, 'Train users': 788721, 'Train items': 15212, 'Test': 264039, 'Test users': 98161, 'Test items': 6609}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]


prepering recomendation


  0%|          | 0/98161 [00:00<?, ?it/s]

calculate metrics

{'Start date': Timestamp('2021-08-08 00:00:00', freq='W-SUN'), 'End date': Timestamp('2021-08-15 00:00:00', freq='W-SUN'), 'Train': 4587708, 'Train users': 842129, 'Train items': 15404, 'Test': 276699, 'Test users': 101983, 'Test items': 6715}
fit the model




Epoch:   0%|          | 0/1 [00:00<?, ?it/s][A[A

Epoch: 100%|██████████| 1/1 [00:04<00:00,  4.75s/it]


prepering recomendation


  0%|          | 0/101983 [00:00<?, ?it/s]

calculate metrics
[32m[I 2022-12-10 12:40:15,088][0m Trial 9 finished with value: 0.015156317595037707 and parameters: {'no_components': 8, 'loss': 'warp', 'learning_rate': 0.04211716441580764, 'user_alpha': 0.02508106940417908, 'item_alpha': 0.10966532770493737, 'max_sampled': 4}. Best is trial 7 with value: 0.01633402315631378.[0m


In [None]:
study.best_value

0.01633402315631378

In [None]:
study.best_params

{'no_components': 8,
 'loss': 'warp',
 'learning_rate': 0.1930957035463117,
 'user_alpha': 0.02976836811657212,
 'item_alpha': 0.01658933260303973,
 'max_sampled': 3}

# ANN

In [46]:
import nmslib

In [43]:
model = LightFMWrapperModel(LightFM(no_components=16,
                                    loss='warp',
                                    random_state=RANDOM_STATE,
                                    learning_rate=LEARNING_RATE,
                                    user_alpha=USER_ALPHA,
                                    item_alpha=ITEM_ALPHA),
                            epochs=N_EPOCHS,
                            num_threads=NUM_THREADS)

In [44]:
dataset = Dataset.construct(interactions_df=interactions,
                            user_features_df=user_features,
                            cat_user_features=user_features['feature'].unique(),
                            item_features_df=item_features,
                            cat_item_features=item_features['feature'].unique())

In [45]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f395f2ff760>

In [47]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [48]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [49]:
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)

In [50]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)

In [51]:
print(augmented_item_embeddings.shape)
print(augmented_user_embeddings.shape)

(15706, 19)
(962179, 19)


In [53]:
M = 48
efC = 100
K=10
space_name='negdotprod'

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [54]:
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

15706

In [55]:
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 

In [56]:
efS = 100
query_time_params = {'efSearch': efS}

In [57]:
query_matrix = augmented_user_embeddings

In [58]:
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)

In [68]:
nbrs[0]

(array([14534,  9906,  3941,  9242,  5874,  5701, 13171,  2525, 11574,
         3545], dtype=int32),
 array([909.3835 , 909.49567, 909.8572 , 910.09937, 910.1322 , 910.1344 ,
        910.373  , 910.46094, 910.4621 , 910.50323], dtype=float32))