In [2]:
import os
import warnings
warnings.filterwarnings('ignore')

In [2]:
%pip install implicit
%pip install rectools
%pip install lightfm
%pip install optuna



In [3]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset,Interactions
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm
import pandas as pd
from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [5]:
%%capture
!wget https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip
!unzip -o data_original.zip

In [4]:
interactions = pd.read_csv('data_original/interactions.csv')
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

In [5]:
interactions.rename(
        columns={
                'last_watch_dt': 'datetime',
                'total_dur': 'weight'
        },
        inplace=True)

In [6]:
interactions['datetime'] = pd.to_datetime(interactions['datetime'])
max_date = interactions['datetime'].max()
train = interactions[(interactions['datetime'] < max_date - pd.Timedelta(days=7))]
test = interactions[(interactions['datetime'] >= max_date - pd.Timedelta(days=7))]
train.drop(train.query("weight < 300").index, inplace=True)
print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (3832711, 5)
test: (490982, 5)


In [7]:
cold_users = set(test['user_id']) - set(train['user_id'])
test.drop(test[test['user_id'].isin(cold_users)].index, inplace=True)

In [8]:
dataset = Dataset.construct(
    interactions_df=train
)

In [9]:
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f'{metric_name}@{k}'] = metric(k=k)

In [10]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 64
N_FACTORS = 4

In [11]:
train

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5,1032142,6686,2021-05-13,11286,100.0
...,...,...,...,...,...
5476242,268216,3071,2021-04-21,5752,98.0
5476244,438585,7829,2021-08-02,6804,100.0
5476245,786732,4880,2021-05-12,753,0.0
5476247,546862,9673,2021-04-13,2308,49.0


#Baseline

In [12]:
model = LightFMWrapperModel(
            LightFM(
                no_components=10,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=0.05
            ),
            epochs=10,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

recs = model.recommend(
    users=test['user_id'].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True
)


calc_metrics(metrics, recs, test, train)

{'Precision@1': 0.083158518990591,
 'Recall@1': 0.041903265571052846,
 'Precision@2': 0.07009833859989201,
 'Recall@2': 0.06889460657467425,
 'Precision@3': 0.06274728484558803,
 'Recall@3': 0.0909325424998226,
 'Precision@4': 0.05661771865069881,
 'Recall@4': 0.10770747996504039,
 'Precision@5': 0.05144586951326375,
 'Recall@5': 0.12158278399921735,
 'Precision@6': 0.04677943298193438,
 'Recall@6': 0.13127959947830875,
 'Precision@7': 0.042932885201665556,
 'Recall@7': 0.1391165443964717,
 'Precision@8': 0.03993180027793267,
 'Recall@8': 0.1468865581592649,
 'Precision@9': 0.03742551335424221,
 'Recall@9': 0.15404583499303923,
 'Precision@10': 0.03532488913672695,
 'Recall@10': 0.16049615096371342,
 'MAP@1': 0.041903265571052846,
 'MAP@2': 0.056108593499169565,
 'MAP@3': 0.06410838873920938,
 'MAP@4': 0.06879153744216499,
 'MAP@5': 0.07200293631355272,
 'MAP@6': 0.07390842037778568,
 'MAP@7': 0.07524375587137153,
 'MAP@8': 0.0764146132949252,
 'MAP@9': 0.07739016888691584,
 'MAP@10': 

#Перебор параметров с помощью Optuna

In [None]:
import optuna
def objective(trial):
    n_factors = trial.suggest_categorical('n_factors', [16, 32, 64])
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1, log=True)
    user_alpha = trial.suggest_float('user_alpha', 1e-10, 1e-2, log=True)
    item_alpha = trial.suggest_float('item_alpha', 1e-10, 1e-2, log=True)
    loss = trial.suggest_categorical('loss', ['logistic', 'bpr', 'warp'])
    epochs = trial.suggest_int('epochs', 1, 5)

    model = LightFMWrapperModel(
        LightFM(
            no_components=n_factors,
            loss=loss,
            learning_rate=learning_rate,
            user_alpha=user_alpha,
            item_alpha=item_alpha,
            random_state=RANDOM_STATE
        ),
        epochs=epochs,
        num_threads=8
    )

    dataset = Dataset.construct(
        interactions_df=train
    )
    model.fit(dataset)

    TEST_USERS = test['user_id'].unique()
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )

    metric_values = calc_metrics(metrics, recos, test, train)
    print(metric_values)
    return metric_values['MAP@10']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"Value: {trial.value}")
print("Params: ")
for key, value in trial.params.items():
   print(f"{key}: {value}")

In [13]:
model = LightFMWrapperModel(
            LightFM(
                no_components=64,
                learning_rate=0.018752552061051517,
                user_alpha=1.7865068939394515e-06,
                item_alpha=1.930834633424772e-07,
                loss='warp',
                random_state=RANDOM_STATE,
            ),
            epochs=2,
            num_threads=NUM_THREADS,
        )

model.fit(dataset)

recs = model.recommend(
    users=test['user_id'].unique(),
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True
)


In [14]:
metric_values = calc_metrics(metrics, recs, test, train)
metric_values

{'Precision@1': 0.08376926277029838,
 'Recall@1': 0.04237956846421405,
 'Precision@2': 0.06979296671003833,
 'Recall@2': 0.06870873494728935,
 'Precision@3': 0.06268827578474674,
 'Recall@3': 0.09105295456889458,
 'Precision@4': 0.05773298990059924,
 'Recall@4': 0.11091180777068202,
 'Precision@5': 0.0519503969834568,
 'Recall@5': 0.1232852843701501,
 'Precision@6': 0.047401978573810004,
 'Recall@6': 0.1337477651091506,
 'Precision@7': 0.043388097956726955,
 'Recall@7': 0.14164521776516464,
 'Precision@8': 0.04011878523947352,
 'Recall@8': 0.1487114884739727,
 'Precision@9': 0.03767826883151241,
 'Recall@9': 0.15642200767199363,
 'Precision@10': 0.03560370694920205,
 'Recall@10': 0.16360219229183995,
 'MAP@1': 0.04237956846421405,
 'MAP@2': 0.05622549817610193,
 'MAP@3': 0.06431177017261644,
 'MAP@4': 0.06981100908712562,
 'MAP@5': 0.07269175664934822,
 'MAP@6': 0.07475395564056701,
 'MAP@7': 0.07610896378576285,
 'MAP@8': 0.07716481446187483,
 'MAP@9': 0.07821258165355395,
 'MAP@10': 

 'angular' метрики Annoy

In [27]:
from annoy import AnnoyIndex

def annoy_recs(user_ids: np.array, dataset: Dataset, model: LightFMWrapperModel):
    internal_ids = dataset.user_id_map.to_internal.loc[user_ids].values
    user_vecs, product_vecs = model.get_vectors(dataset)
    normalized_user_vecs = user_vecs / np.linalg.norm(user_vecs, axis=1)[:, np.newaxis]
    target_user_vecs = normalized_user_vecs[internal_ids]

    # Построение индекса с помощью Annoy для поиска ближайших продуктов
    product_annoy_index = AnnoyIndex(product_vecs.shape[1], 'angular')
    for idx, vec in enumerate(product_vecs):
        product_annoy_index.add_item(idx, vec)
    product_annoy_index.build(10)
    recommendations = []
    for idx, vec in tqdm(enumerate(target_user_vecs), total=len(target_user_vecs)):
        nearest_items = product_annoy_index.get_nns_by_vector(vec, k)
        recommendations.append({
            'internal_user_id': idx,
            'recommended_items': nearest_items
        })

    # Подготовка итоговой таблицы
    user_mapping = dataset.user_id_map.to_external\
        .reset_index()\
        .rename(columns={'index': 'internal_user_id', 0: 'user_id'})

    recs_df = pd.DataFrame(recommendations)
    recs_df = recs_df.explode('recommended_items').reset_index(drop=True)
    final_df = recs_df.merge(user_mapping, on='internal_user_id')[['user_id', 'recommended_items']]

    final_df['position'] = final_df.groupby('user_id').cumcount() + 1
    return final_df
ann_recs = annoy_recs(train['user_id'].unique(), dataset, model)

calc_metrics(metrics, ann_recs, test, train)

100%|██████████| 756562/756562 [00:22<00:00, 34066.92it/s]


{'Precision@1': 5.3790336028229166e-05,
 'Recall@1': 1.2020328058702782e-05,
 'Precision@2': 0.00010758067205645833,
 'Recall@2': 7.144207697374152e-05,
 'Precision@3': 0.00016316401928562844,
 'Recall@3': 0.00016674534691950578,
 'Precision@4': 0.00019095569290021356,
 'Recall@4': 0.00027865289374707434,
 'Precision@5': 0.0002366774785242083,
 'Recall@5': 0.000449224409444676,
 'Precision@6': 0.00026715866894020486,
 'Recall@6': 0.0006276300538897916,
 'Precision@7': 0.00030122588175808323,
 'Recall@7': 0.0008102078589800416,
 'Precision@8': 0.0003550162177863125,
 'Recall@8': 0.0011575373696815607,
 'Precision@9': 0.00040940422421485527,
 'Recall@9': 0.001482901749137433,
 'Precision@10': 0.0003684638017933699,
 'Recall@10': 0.001482901749137433,
 'MAP@1': 1.2020328058702782e-05,
 'MAP@2': 4.1731202516222146e-05,
 'MAP@3': 7.349895916481025e-05,
 'MAP@4': 0.00010155494930703799,
 'MAP@5': 0.000135702871406576,
 'MAP@6': 0.0001654371454807619,
 'MAP@7': 0.00019157879932445507,
 'MAP@8

Метрика значительно ухудшилась

Изменяем построение индекса на dot

In [28]:
from annoy import AnnoyIndex

def annoy_recs(user_ids: np.array, dataset: Dataset, model: LightFMWrapperModel):
    internal_ids = dataset.user_id_map.to_internal.loc[user_ids].values
    user_vecs, product_vecs = model.get_vectors(dataset)
    normalized_user_vecs = user_vecs / np.linalg.norm(user_vecs, axis=1)[:, np.newaxis]
    target_user_vecs = normalized_user_vecs[internal_ids]

    # Построение индекса с помощью Annoy для поиска ближайших продуктов
    product_annoy_index = AnnoyIndex(product_vecs.shape[1], 'dot')
    for idx, vec in enumerate(product_vecs):
        product_annoy_index.add_item(idx, vec)
    product_annoy_index.build(10)
    recommendations = []
    for idx, vec in tqdm(enumerate(target_user_vecs), total=len(target_user_vecs)):
        nearest_items = product_annoy_index.get_nns_by_vector(vec, k)
        recommendations.append({
            'internal_user_id': idx,
            'recommended_items': nearest_items
        })

    # Подготовка итоговой таблицы
    user_mapping = dataset.user_id_map.to_external\
        .reset_index()\
        .rename(columns={'index': 'internal_user_id', 0: 'user_id'})

    recs_df = pd.DataFrame(recommendations)
    recs_df = recs_df.explode('recommended_items').reset_index(drop=True)
    final_df = recs_df.merge(user_mapping, on='internal_user_id')[['user_id', 'recommended_items']]

    final_df['position'] = final_df.groupby('user_id').cumcount() + 1
    return final_df
ann_recs = annoy_recs(train['user_id'].unique(), dataset, model)

calc_metrics(metrics, ann_recs, test, train)

100%|██████████| 756562/756562 [00:25<00:00, 29778.32it/s]


{'Precision@1': 5.3790336028229166e-05,
 'Recall@1': 1.2020328058702782e-05,
 'Precision@2': 0.00010758067205645833,
 'Recall@2': 7.144207697374152e-05,
 'Precision@3': 0.00016316401928562844,
 'Recall@3': 0.00016674534691950578,
 'Precision@4': 0.00019095569290021356,
 'Recall@4': 0.00027865289374707434,
 'Precision@5': 0.0002366774785242083,
 'Recall@5': 0.000449224409444676,
 'Precision@6': 0.00026715866894020486,
 'Recall@6': 0.0006276300538897916,
 'Precision@7': 0.00030122588175808323,
 'Recall@7': 0.0008102078589800416,
 'Precision@8': 0.0003550162177863125,
 'Recall@8': 0.0011575373696815607,
 'Precision@9': 0.00040940422421485527,
 'Recall@9': 0.001482901749137433,
 'Precision@10': 0.0003684638017933699,
 'Recall@10': 0.001482901749137433,
 'MAP@1': 1.2020328058702782e-05,
 'MAP@2': 4.1731202516222146e-05,
 'MAP@3': 7.349895916481025e-05,
 'MAP@4': 0.00010155494930703799,
 'MAP@5': 0.000135702871406576,
 'MAP@6': 0.0001654371454807619,
 'MAP@7': 0.00019157879932445507,
 'MAP@8

Если изменять тип построение индекса результат не меняется

Делаем предсказания по всем пользователям

In [None]:
model = LightFMWrapperModel(
            LightFM(
                no_components=64,
                learning_rate=0.018752552061051517,
                user_alpha=1.7865068939394515e-06,
                item_alpha=1.930834633424772e-07,
                loss='warp',
                random_state=RANDOM_STATE,
            ),
            epochs=2,
            num_threads=NUM_THREADS,
        )

model.fit(Dataset.construct(interactions))

final_recs = model.recommend(
    users=interactions['user_id'].unique(),
    dataset=Dataset.construct(interactions),
    k=K_RECOS,
    filter_viewed=True
)

In [20]:
users_recs = final_recs.groupby('user_id').agg({'item_id': list})
users_recs.to_json('lightfm_recs.json')