In [11]:
import os
import pickle
import time
from collections import defaultdict
from functools import reduce
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import scipy as sp
from lightfm import LightFM
from scipy.sparse import csr_matrix
from tqdm import tqdm

from service.api.models_zoo import LightFMWrapper

In [2]:
# Notebook-wide configuration.

# Limit OpenBLAS to a single thread.
# NOTE(review): numpy/scipy are imported in the cell above; thread-count
# variables are generally read when the BLAS library is loaded, so confirm
# this setting actually takes effect at this point.
os.environ.update(OPENBLAS_NUM_THREADS="1")

# Name of the timestamp column that is parsed to datetime further down.
datetime_col = 'last_watch_dt'

# Directory the users / items / interactions CSV files are read from.
DATA_PATH = Path("../data/kion_train/")

In [3]:
%%time
# Read the three source tables from DATA_PATH, in the order users, items,
# interactions, and bind each one to its own DataFrame.
users, items, interactions = (
    pd.read_csv(DATA_PATH / csv_name)
    for csv_name in ('users.csv', 'items.csv', 'interactions.csv')
)

CPU times: user 2.01 s, sys: 224 ms, total: 2.23 s
Wall time: 2.24 s


In [4]:
# Parse the timestamp column as '%Y-%m-%d' dates, then discard every row that
# has a missing value in any column.
interactions = interactions.assign(
    **{
        datetime_col: pd.to_datetime(
            interactions[datetime_col], format='%Y-%m-%d'
        )
    }
).dropna()

In [5]:
# Turn the watched percentage into a 1-5 rating stored in a new 'watched' column.
#
# Explicit bin edges are used instead of `bins=5`: with an integer bin count
# pd.cut derives the edges from the observed min/max of the column, so the same
# percentage could receive a different rating on a different data sample.
# NOTE(review): assumes watched_pct lies in [0, 100]; values outside that range
# would become NaN - confirm against the dataset.
WATCHED_PCT_EDGES = [0, 20, 40, 60, 80, 100]

interactions['watched'] = pd.cut(
    x=interactions['watched_pct'],
    bins=WATCHED_PCT_EDGES,
    labels=[1, 2, 3, 4, 5],
    include_lowest=True,  # keep watched_pct == 0 inside the first bin
)

In [6]:
# Drop interactions whose user or item is missing from the main users / items
# tables: inner-join against the de-duplicated id column of each table.
known_users = users[['user_id']].drop_duplicates()
known_items = items[['item_id']].drop_duplicates()

interactions = (
    interactions
    .merge(known_users, on='user_id')
    .merge(known_items, on='item_id')
)

In [7]:
# Project wrapper imported from service.api.models_zoo, configured for 5 epochs.
model = LightFMWrapper(epochs=5)

In [8]:
# Train on the filtered interactions; the raw items and users tables are passed
# as the item / user feature sources.
# NOTE(review): which columns the wrapper actually reads (e.g. 'watched') is
# defined in service.api.models_zoo and is not visible from this notebook.
model.fit(train=interactions, item_features=items, user_features=users)

Epoch: 100%|██████████| 5/5 [00:27<00:00,  5.43s/it]


In [9]:
# Smoke check: query the fitted model for one user id; the value returned by
# predict is displayed as the cell output.
model.predict(user_id=176549)

[3734, 2657, 11237, 13594, 12248, 11754, 1020, 10242, 142, 1451]

In [12]:
# Serialize the fitted wrapper to disk with pickle.
# NOTE(review): pickle stores classes by reference, so loading this file
# presumably requires service.api.models_zoo to be importable - confirm in the
# consuming service.
with Path('../data/lightfm.pickle').open('wb') as model_file:
    pickle.dump(model, model_file)