Данный ноутбук является частью проекта "Рекомендательные системы с элементами NLP" команды МОВС ВШЭ

In [None]:
!pip install lightfm



In [None]:
from datetime import datetime
from matplotlib import pyplot as plt
from scipy import stats

import pandas as pd
import seaborn as sb
import numpy as np
import requests
import pickle

# Загружаем данные

In [None]:
items_df = pd.read_csv('items.csv')
users_df = pd.read_csv('users.csv')
interactions_df = pd.read_csv('interactions.csv')
interactions_df['last_watch_dt'] = pd.to_datetime(interactions_df['last_watch_dt'])

In [None]:
def split_df(df, n_days):
  test_df = df[df['last_watch_dt'] >= df['last_watch_dt'].max() - pd.DateOffset(days=n_days)].copy()
  train_df = df[df['last_watch_dt'] < df['last_watch_dt'].max() - pd.DateOffset(days=n_days)].copy()

  return train_df.loc[:, ['user_id', 'item_id', 'watched_pct']], test_df.loc[:, ['user_id', 'item_id', 'watched_pct']]

In [None]:
train_df, test_df = split_df(interactions_df, 7)

In [None]:
from lightfm.data import Dataset
dataset = Dataset()
item_features = tuple(map(lambda x: tuple(x), items_df.loc[:, ['directors']].values.tolist()))
dataset.fit(train_df['user_id'], train_df['item_id'], item_features=items_df['directors'].values)

# Добавим пользователей и фильмы из test без взаимодействий чтобы потом по ним можно было делать предикты
dataset.fit_partial(users=test_df['user_id'], items=test_df['item_id'])

In [None]:
interactions, weights = dataset.build_interactions(train_df.loc[:, ['user_id', 'item_id']].values.tolist())

In [None]:
items_with_interations_df = items_df[items_df['item_id'].isin(train_df['item_id'])]
features = [[v[0], v[1:].values.tolist()] for i, v in items_with_interations_df.loc[:, ['item_id', 'directors']].iterrows()]
item_features = dataset.build_item_features(features)

In [None]:
from lightfm import LightFM

model = LightFM(loss='bpr')
model.fit(interactions, item_features=item_features)
model.fit(interactions)

<lightfm.lightfm.LightFM at 0x7962da68ef80>

In [None]:
from tqdm.notebook import tqdm

batch = 10
preds_total = {}
users_to_predict = [dataset._user_id_mapping[x] for x in test_df['user_id'].unique()]
movies_to_predict = [dataset._item_id_mapping[x] for x in test_df['item_id'].unique()]

reversed_item_mapping = {v: k for k, v in dataset._item_id_mapping.items()}
reversed_user_mapping = {v: k for k, v in dataset._user_id_mapping.items()}

for i in tqdm(range(len(users_to_predict)//batch+1)):
  user_id = users_to_predict[i*batch : (i+1)*batch]
  user_preds = model.predict(user_id*len(movies_to_predict), movies_to_predict*len(user_id))
  user_preds = np.array(user_preds).reshape(len(user_id), len(movies_to_predict))


  preds_sorted = np.argpartition(user_preds, -len(user_id))[:, -len(user_id):][:, ::-1]
  for j in range(len(user_id)):
    preds_total[reversed_user_mapping[user_id[j]]] = preds_sorted[j]
preds_original = {k: [reversed_item_mapping[movies_to_predict[x]] for x in v.tolist()] for k, v in preds_total.items()}

  0%|          | 0/18591 [00:00<?, ?it/s]

In [None]:
test_users = pd.DataFrame({'user_id': test_df['user_id'].unique()})
test_users['predict'] = test_users.apply(lambda x: preds_original[x.values[0]], axis=1)

In [None]:
def map_at_k(interactions, predicts, k):
  map_list = []
  for user in predicts['user_id'].values:
    user_interactions = interactions[interactions['user_id'] == user]
    user_predicts = predicts[predicts['user_id'] == user]
    max_k = min(len(user_interactions), k)
    user_precision = []
    for i in range(1, max_k + 1):
      precision = len(set(user_predicts['predict'].values[0][:i])&set(user_interactions['item_id'].values[:i]))/i
      user_precision.append(precision)
    map_list.append(np.mean(user_precision))
  return np.mean(map_list)

In [None]:
map_at_10 = map_at_k(test_df, test_users, 10)

In [None]:
print(f'MAP@10: {map_at_10}')

MAP@10: 0.03202724657669472


In [34]:
with open('lightfm_model.pkl', 'wb') as f:
  pickle.dump(model, f)

with open('lightfm_dataset.pkl', 'wb') as f:
  pickle.dump(dataset, f)