In [168]:
import pickle
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

from rectools.models import PopularModel
from rectools.dataset import Interactions, Dataset
from rectools.model_selection import TimeRangeSplitter
from rectools import Columns
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from service.userknn import UserKnn


In [169]:
import zipfile
import os

url = 'https://github.com/irsafilo/KION_DATASET/raw/f69775be31fa5779907cf0a92ddedb70037fb5ae/data_original.zip'
req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='Downloading the kion dataset...', 
                        total=total_size_in_bytes, 
                        unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

zip_file_path = './kion_train.zip'
output_directory = '../artifacts'
os.makedirs(output_directory, exist_ok=True)
desired_files = ['first_reco_result.csv', 'interactions.csv', 'items.csv']

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    for file_info in zip_ref.infolist():
        if file_info.filename in desired_files:
            destination_path = os.path.join(output_directory, file_info.filename)
            zip_ref.extract(file_info, output_directory)


In [170]:
interactions_df = pd.read_csv('../artifacts/interactions.csv')
users = pd.read_csv('../artifacts/users.csv')
items = pd.read_csv('../artifacts/items.csv')

interactions_df.rename(columns={'last_watch_dt': Columns.Datetime,
                                'total_dur': Columns.Weight}, inplace=True) 

# Мы хотим учитывать юзеров с 1 или двумя просмотрами, так как этого недостаточно чтобы говорить о пользователях хоть что-нибудь
user_ids_all = interactions_df.groupby('user_id')['item_id'].nunique().reset_index(name='unique_items_count')
hot_users = user_ids_all[user_ids_all['unique_items_count'] > 20]['user_id']
interactions_df_hot_users = interactions_df[interactions_df['user_id'].isin(hot_users)]

interactions = Interactions(interactions_df_hot_users)   
interactions.df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[Columns.Weight] = df[Columns.Weight].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[Columns.Datetime] = df[Columns.Datetime].astype("datetime64[ns]")


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250.0,72.0
1,699317,1659,2021-05-29,8317.0,100.0
6,1016458,354,2021-08-14,1672.0,25.0
14,5324,8437,2021-04-18,6598.0,92.0
18,927973,9617,2021-06-19,8422.0,100.0
...,...,...,...,...,...
5476235,977542,13126,2021-07-04,1830.0,26.0
5476239,610017,7107,2021-05-10,1133.0,75.0
5476241,1073802,9927,2021-08-07,6425.0,97.0
5476242,268216,3071,2021-04-21,5752.0,98.0


In [171]:
N_SPLITS = 3
TEST_SIZE = '14D'

cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=False,
)
cv.get_test_fold_borders(interactions)

[(Timestamp('2021-07-12 00:00:00', freq='14D'),
  Timestamp('2021-07-26 00:00:00', freq='14D')),
 (Timestamp('2021-07-26 00:00:00', freq='14D'),
  Timestamp('2021-08-09 00:00:00', freq='14D')),
 (Timestamp('2021-08-09 00:00:00', freq='14D'),
  Timestamp('2021-08-23 00:00:00', freq='14D'))]

In [172]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    'map@10': MAP(k=10),
    'novelty': MeanInvUserFreq(k=10)
}

# few simple models to compare
models = {
    'cosine_userknn': CosineRecommender(),
    'tfidf_userknn': TFIDFRecommender(), 
}
TFIDFRecommender.recommend

<function implicit.nearest_neighbours.ItemItemRecommender.recommend(self, userid, user_items, N=10, filter_already_liked_items=True, filter_items=None, recalculate_user=False, items=None)>

In [173]:
%%time

results = []

fold_iterator = cv.split(interactions, collect_fold_stats=True)

models_wrps = {}
for model_name, model in models.items():
    models_wrps[model_name] = UserKnn(model=model, N_users=50)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    print(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    catalog = interactions.df[Columns.Item].unique()
    
    for model_name, userknn_model in models_wrps.items():
        userknn_model.fit(interactions.df)
    
        recos = userknn_model.predict(df_test)
    
        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=interactions.df,
            catalog=catalog,
        )
    
        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'i_split': 0, 'start': Timestamp('2021-07-12 00:00:00', freq='14D'), 'end': Timestamp('2021-07-26 00:00:00', freq='14D'), 'train': 1200549, 'train_users': 43213, 'train_items': 13689, 'test': 235690, 'test_users': 33311, 'test_items': 6868}




  0%|          | 0/49806 [00:00<?, ?it/s]



  0%|          | 0/49806 [00:00<?, ?it/s]


{'i_split': 1, 'start': Timestamp('2021-07-26 00:00:00', freq='14D'), 'end': Timestamp('2021-08-09 00:00:00', freq='14D'), 'train': 1442643, 'train_users': 46893, 'train_items': 14066, 'test': 250963, 'test_users': 33689, 'test_items': 7172}




  0%|          | 0/49806 [00:00<?, ?it/s]



  0%|          | 0/49806 [00:00<?, ?it/s]


{'i_split': 2, 'start': Timestamp('2021-08-09 00:00:00', freq='14D'), 'end': Timestamp('2021-08-23 00:00:00', freq='14D'), 'train': 1705079, 'train_users': 49230, 'train_items': 14416, 'test': 239096, 'test_users': 31833, 'test_items': 7063}




  0%|          | 0/49806 [00:00<?, ?it/s]



  0%|          | 0/49806 [00:00<?, ?it/s]

CPU times: user 14min 20s, sys: 31 s, total: 14min 51s
Wall time: 7min 46s


In [174]:
df_metrics = pd.DataFrame(results)
df_metrics.groupby('model').mean()[metrics.keys()]

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
cosine_userknn,0.002763,9.84241
tfidf_userknn,0.01938,9.250365


In [175]:
for model_name, model_wrp in models_wrps.items():
    with open(f"../artifacts/task3_cropped20_experiment_{model_name}.pkl", "wb") as file:
        pickle.dump(model_wrp, file)

In [176]:
with open("../artifacts/task3_cropped20_experiment_tfidf_userknn.pkl", "rb") as file:
    model_1 = pickle.load(file)


user_id_kostyl = pd.DataFrame({'user_id': [774973]})
recos = model_1.predict(user_id_kostyl)

    
metric_values = calc_metrics(
    metrics,
    reco=recos,
    interactions=df_test,
    prev_interactions=df_train,
    catalog=catalog,
)

KeyError: 774973

In [None]:
df_metrics = pd.DataFrame(results)
df_metrics.groupby('model').mean()[metrics.keys()]

In [177]:
hot_users

2               2
3               3
20             21
56             60
93            106
           ...   
962097    1097470
962113    1097486
962133    1097508
962138    1097513
962141    1097516
Name: user_id, Length: 49806, dtype: int64