In [2]:
import json
import warnings
from pprint import pprint

import pandas as pd
import requests
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
from rectools import Columns
from rectools.dataset import Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from tqdm.auto import tqdm

from userknn import UserKnn

# Silence every warning for the rest of the session.
warnings.simplefilter("ignore")

# DataFrame display: show all columns and up to 200 characters per cell.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 200

# Датасет KION

In [3]:
# Optional one-off step (kept commented out): stream the KION dataset archive from GitHub
# into 'kion_train.zip' in 1 MiB chunks, showing a tqdm progress bar sized from Content-Length.
# Presumably only needed when the 'data_original/' folder read below is not present yet.
# url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
# req = requests.get(url, stream=True)

# with open('kion_train.zip', "wb") as fd:
#     total_size_in_bytes = int(req.headers.get('Content-Length', 0))
#     progress_bar = tqdm(desc='Downloading the kion dataset...',
#                         total=total_size_in_bytes,
#                         unit='iB', unit_scale=True)
#     for chunk in req.iter_content(chunk_size=2 ** 20):
#         progress_bar.update(len(chunk))
#         fd.write(chunk)

In [4]:
# Optional one-off step (kept commented out): unpack the downloaded archive,
# excluding the '__MACOSX/' metadata folder.
# !unzip kion_train.zip -x '__MACOSX/*'

In [5]:
# Read the raw interactions and rename the watch date / total duration columns
# to the RecTools column constants for datetime and weight.
interactions_df = pd.read_csv('data_original/interactions.csv').rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

print(interactions_df.shape)

# Interactions casts the column types and stores its own DataFrame in `Interactions.df`.
interactions = Interactions(interactions_df)

# ! For a quick run of this notebook, uncomment the next line: it shrinks the data to a 1% sample.
# interactions = Interactions(interactions_df.sample(frac=0.01))

interactions.df.head()

(5476251, 5)


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


## Задаем фолды для кросс-валидации

In [6]:
# Cross-validation layout, consumed by the splitter defined in the next cell.
TEST_SIZE = "7D"  # length of each test window (offset string: seven days)
N_SPLITS = 3      # number of test folds

In [7]:
# Init generator of folds
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [8]:
# Sanity check: display the (start, end) timestamp pair of every test fold before running CV.
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-08-02 00:00:00', freq='7D'),
  Timestamp('2021-08-09 00:00:00', freq='7D')),
 (Timestamp('2021-08-09 00:00:00', freq='7D'),
  Timestamp('2021-08-16 00:00:00', freq='7D')),
 (Timestamp('2021-08-16 00:00:00', freq='7D'),
  Timestamp('2021-08-23 00:00:00', freq='7D'))]

In [9]:
# Metrics computed for every (fold, model) pair:
#   * 'map@10'  - MAP at k=10, the ranking-quality metric;
#   * 'novelty' - MeanInvUserFreq at k=10, the "beyond accuracy" metric.
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10),
}

# Nearest-neighbour recommenders from `implicit` to compare; each one is wrapped in UserKnn
# inside the CV loop. The numeric suffix of a key is the neighbourhood size K. K is passed
# explicitly for the "_20" variants too (20 is also implicit's default) so that the names
# stay truthful regardless of the library default.
models = {
    'cosine_userknn_20': CosineRecommender(K=20),
    'tfidf_userknn_20': TFIDFRecommender(K=20),
    'bm25_userknn_20': BM25Recommender(K=20),
    'cosine_userknn_10': CosineRecommender(K=10),
    'tfidf_userknn_10': TFIDFRecommender(K=10),
    'bm25_userknn_10': BM25Recommender(K=10),
}

# Cross-validation (CV)

In [10]:
%%time

# Accumulates one dict per (fold, model) pair; turned into a DataFrame in a later cell.
results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    # Positional row ids -> train frame (all columns) and test frame (user/item pairs only).
    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    # Items seen in this fold's train part; passed to calc_metrics as the catalog.
    catalog = df_train[Columns.Item].unique()

    for model_name, model in models.items():
        # Wrap the implicit recommender, fit on train, recommend for the test users.
        userknn_model = UserKnn(model=model, N_users=50)
        userknn_model.fit(df_train)
        recos = userknn_model.predict(df_test)

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        # Same key order as before: fold, model, then the metric values.
        results.append({"fold": i_fold, "model": model_name, **metric_values})


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 263681,
 'test_items': 6602,
 'test_users': 98184,
 'train': 4266013,
 'train_items': 15237,
 'train_users': 797423}


  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]

  0%|          | 0/797423 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 279422,
 'test_items': 6698,
 'test_users': 103511,
 'train': 4649162,
 'train_items': 15415,
 'train_users': 850489}


  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]

  0%|          | 0/850489 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

  0%|          | 0/906071 [00:00<?, ?it/s]

CPU times: user 1d 23h 15min 21s, sys: 6min 48s, total: 1d 23h 22min 9s
Wall time: 2h 20min 24s


In [11]:
# One row per (fold, model) pair, with the metric values collected in `results` during CV.
df_metrics = pd.DataFrame(results)
df_metrics

Unnamed: 0,fold,model,map@10,novelty
0,0,cosine_userknn_20,0.004246,7.476357
1,0,tfidf_userknn_20,0.006773,7.573736
2,0,bm25_userknn_20,0.002784,9.199335
3,0,cosine_userknn_10,0.0042,6.937947
4,0,tfidf_userknn_10,0.005957,7.378542
5,0,bm25_userknn_10,0.003256,8.699078
6,1,cosine_userknn_20,0.003785,7.531878
7,1,tfidf_userknn_20,0.006279,7.631293
8,1,bm25_userknn_20,0.002679,9.28558
9,1,cosine_userknn_10,0.0039,6.983614


In [12]:
# Average every metric over the folds, per model.
# The metric columns are selected before aggregating, using an explicit list rather than a
# dict view: the `fold` column is not averaged needlessly, and a non-numeric column added
# to `results` later cannot make mean() fail.
df_metrics.groupby('model')[list(metrics)].mean()

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25_userknn_10,0.003164,8.776933
bm25_userknn_20,0.002699,9.283169
cosine_userknn_10,0.003958,6.992581
cosine_userknn_20,0.003905,7.540823
tfidf_userknn_10,0.005659,7.433605
tfidf_userknn_20,0.006334,7.638598


# Train the full model for the service

In [13]:
# Re-read the raw interactions and rename the watch date / total duration columns
# to the RecTools column constants for datetime and weight.
interactions_df = pd.read_csv('data_original/interactions.csv').rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)
users = pd.read_csv('data_original/users.csv')
items = pd.read_csv('data_original/items.csv')

print(interactions_df.shape)

# Interactions casts the column types and stores its own DataFrame in `Interactions.df`.
interactions = Interactions(interactions_df)

# ! For a quick run of this notebook, uncomment the next line: it shrinks the data to a 1% sample.
# interactions = Interactions(interactions_df.sample(frac=0.01))

interactions.df.head()

(5476251, 5)


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
2,656683,7107,2021-05-09,10.0,0.0
3,864613,7638,2021-07-05,14483.0,100.0
4,964868,9506,2021-04-30,6725.0,100.0


In [14]:
# Unique item ids over the full interactions table.
# NOTE(review): not referenced by the remaining cells shown here — confirm it is still needed.
catalog = interactions.df[Columns.Item].unique()

In [15]:
# Final model for the service: TF-IDF with K=20, the configuration with the highest mean
# map@10 in the cross-validation summary ('tfidf_userknn_20'). K is passed explicitly
# (20 is also implicit's default) so it matches the 'tfidf_20_model.json' export name.
model = UserKnn(model=TFIDFRecommender(K=20), N_users=50)
model.fit(interactions.df)

  0%|          | 0/962179 [00:00<?, ?it/s]

In [16]:
# Recommendations from the model fitted on all interactions.
# Presumably one ranked list per user found in the frame — confirm against UserKnn.predict.
recos = model.predict(interactions.df)

In [17]:
# Inspect the recommendation table (columns: user_id, item_id, score, rank).
recos

Unnamed: 0,user_id,item_id,score,rank
2,1097557,3182,6.137841,1
0,1097557,4151,4.111983,2
1,1097557,15297,3.379502,3
3,1097557,10440,3.029132,4
4,1097556,12812,8.013524,1
...,...,...,...,...
12083193,0,7829,3.915136,6
12083195,0,12192,3.466101,7
12083188,0,9728,2.683715,8
12083189,0,10440,2.390548,9


In [19]:
# Preview the user_id -> recommended item_ids mapping for the first 20 users only.
# Rendering the whole dictionary (one entry per user) floods the notebook output.
recos.groupby("user_id")["item_id"].agg(list).head(20).to_dict()

{0: [14359, 2852, 6006, 6192, 7102, 7829, 12192, 9728, 10440, 15297],
 1: [3669, 10440, 15297],
 2: [10267, 2553, 5819, 5438, 8482, 9635, 11287, 3183, 13096, 7921],
 3: [965, 9438, 10520, 10345, 8801, 8581, 10240, 15719, 2025, 8252],
 4: [4700, 7626, 8636, 142, 2657, 3734, 9728, 15297, 10440],
 5: [15890, 7043, 11437, 2856, 12789, 3145, 6933, 14112, 14397, 2276],
 7: [8710, 4141, 1554, 4436, 14741, 849, 9996, 12192, 10440],
 8: [6809],
 9: [3076, 341, 3784, 9996, 11863, 142, 6809, 2657, 3734, 4880],
 10: [1660, 14344, 3819, 10020, 10824, 3190, 3553, 10732, 15399, 10323],
 11: [10455, 10788, 14646, 10125, 7946, 16043, 4880, 12192, 4740, 6809],
 12: [846],
 13: [1358, 1136, 15580, 15021, 5791, 598, 2220, 15997, 10436, 10464],
 14: [1204, 10440],
 15: [16341, 7997, 14500, 11739, 14378, 7476, 7107, 13865, 15297, 10440],
 16: [2657],
 17: [10196, 3734],
 18: [14461],
 19: [14275, 8707, 7000, 4549, 12743, 9728, 10440],
 20: [15297],
 21: [12184, 12261, 308, 9419, 4345, 10283, 9159, 12138, 13

In [21]:
# Export the recommendations for the service as {user_id: [item_id, ...]}.
# JSON object keys are always strings, so the integer user ids are written as "0", "1", ...
# and a consumer has to look users up by the string form of the id.
recos_by_user = recos.groupby("user_id")["item_id"].agg(list).to_dict()

with open("tfidf_20_model.json", "w") as f:
    json.dump(recos_by_user, f)