In [141]:
import pandas as pd
from tqdm.auto import tqdm
from pprint import pprint
from implicit.nearest_neighbours import CosineRecommender, TFIDFRecommender, BM25Recommender
import warnings

from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, MeanInvUserFreq, calc_metrics
from rectools.models.popular import PopularModel
from rectools.models import ImplicitItemKNNWrapperModel
from rectools.model_selection import TimeRangeSplitter

from src import UserKnn

warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", 200)

In [134]:
interactions_df = pd.read_csv("../data_original/interactions.csv")
users = pd.read_csv("../data_original/users.csv")
items = pd.read_csv("../data_original/items.csv")

interactions_df.rename(columns={"last_watch_dt": Columns.Datetime, "total_dur": Columns.Weight}, inplace=True)
# will cast types and save new pd.DataFrame inside in Interactions.df
interactions = Interactions(interactions_df)

# ! если хотите быстро прогнать этот ноутбук - раскомментируйте эту строку - она уменьшает данные
interactions = Interactions(interactions_df.sample(frac=0.5))

interactions.df.head()

Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
5376384,468564,4151,2021-08-22,5.0,0.0
4931199,767571,483,2021-05-02,8300.0,96.0
5154463,736985,5853,2021-05-17,3662.0,61.0
3848150,187996,626,2021-04-13,2781.0,58.0
4126685,45808,12192,2021-07-14,15.0,0.0


In [145]:
def validate(models: dict, metrics: dict, interactions, n_splits=3, test_size="7D", k=10):
    results = []

    # Init generator of folds
    cv = TimeRangeSplitter(
        test_size=test_size,
        n_splits=n_splits,
        filter_already_seen=True,
        filter_cold_items=True,
        filter_cold_users=False,
    )
    fold_iterator = cv.split(interactions, collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        print(f"\n==================== Fold {i_fold}")
        pprint(fold_info)

        df_train = interactions.df.iloc[train_ids].copy()
        df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

        dataset = Dataset.construct(interactions.df)

        cold_users = list(set(df_test.user_id) - set(df_train.user_id))
        test_users = list(df_test["user_id"])

        df_test = df_test.loc[~df_test["user_id"].isin(cold_users)]

        catalog = df_train[Columns.Item].unique()

        pop_model = PopularModel()
        pop_model.fit(dataset)
        pop_recos = pop_model.recommend(cold_users + test_users, dataset=dataset, k=k, filter_viewed=True)
        pop_recos["rank"] += k

        for model_name, model in models.items():
            userknn_model = UserKnn(model=model, N_users=50)
            userknn_model.fit(df_train)

            recos = userknn_model.predict(df_test)
            recos = pd.concat((recos, pop_recos), axis=0, ignore_index=True)
            recos.sort_values(by=["user_id", "rank"], ascending=True, inplace=True)
            recos.drop_duplicates(["user_id", "item_id"], keep="first", inplace=True)
            recos["rank"] = recos.groupby("user_id").cumcount() + 1
            recos = recos[recos["rank"] <= k]

            metric_values = calc_metrics(
                metrics,
                reco=recos,
                interactions=df_test,
                prev_interactions=df_train,
                catalog=catalog,
            )

            fold = {"fold": i_fold, "model": model_name}
            fold.update(metric_values)
            results.append(fold)

    return results

In [146]:
N_SPLITS = 3
TEST_SIZE = "7D"

In [137]:
# calculate several classic (precision@k and recall@k) and "beyond accuracy" metrics
metrics = {
    "map@10": MAP(k=10),
    "novelty": MeanInvUserFreq(k=10),
}

# few simple models to compare
models = {
    "cosine_userknn": CosineRecommender(),  # implicit
    "tfidf_userknn": TFIDFRecommender(),
    "bm25": BM25Recommender(),
}

In [138]:
%%time

results = validate(models, metrics, interactions, N_SPLITS, TEST_SIZE)
df_metrics = pd.DataFrame(results)
df_metrics.groupby("model").mean()[metrics.keys()]


{'end': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-02 00:00:00', freq='7D'),
 'test': 185509,
 'test_items': 5743,
 'test_users': 100960,
 'train': 2132439,
 'train_items': 13417,
 'train_users': 601016}


  0%|          | 0/601016 [00:00<?, ?it/s]

  0%|          | 0/601016 [00:00<?, ?it/s]

  0%|          | 0/601016 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'i_split': 1,
 'start': Timestamp('2021-08-09 00:00:00', freq='7D'),
 'test': 194025,
 'test_items': 5841,
 'test_users': 105090,
 'train': 2323923,
 'train_items': 13619,
 'train_users': 642433}


  0%|          | 0/642433 [00:00<?, ?it/s]

  0%|          | 0/642433 [00:00<?, ?it/s]

  0%|          | 0/642433 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 2,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 205989,
 'test_items': 5830,
 'test_users': 111242,
 'train': 2525897,
 'train_items': 13814,
 'train_users': 685628}


  0%|          | 0/685628 [00:00<?, ?it/s]

  0%|          | 0/685628 [00:00<?, ?it/s]

  0%|          | 0/685628 [00:00<?, ?it/s]

CPU times: user 1h 31min 27s, sys: 1min 15s, total: 1h 32min 43s
Wall time: 1h 34min 4s


Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25,0.001774,6.452797
cosine_userknn,0.002166,5.631892
tfidf_userknn,0.003976,5.753848


In [139]:
N_SPLITS = 3
TEST_SIZE = "14D"
interactions_cut = Interactions(interactions_df.sample(frac=0.1))

In [140]:
%%time

results = validate(models, metrics, interactions, N_SPLITS, TEST_SIZE)
df_metrics = pd.DataFrame(results)
df_metrics.groupby("model").mean()[metrics.keys()]


{'end': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'i_split': 0,
 'start': Timestamp('2021-07-12 00:00:00', freq='14D'),
 'test': 318418,
 'test_items': 6627,
 'test_users': 151061,
 'train': 1619375,
 'train_items': 12866,
 'train_users': 483519}


  0%|          | 0/483519 [00:00<?, ?it/s]

  0%|          | 0/483519 [00:00<?, ?it/s]

  0%|          | 0/483519 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'i_split': 1,
 'start': Timestamp('2021-07-26 00:00:00', freq='14D'),
 'test': 362595,
 'test_items': 6881,
 'test_users': 168438,
 'train': 1945871,
 'train_items': 13253,
 'train_users': 558256}


  0%|          | 0/558256 [00:00<?, ?it/s]

  0%|          | 0/558256 [00:00<?, ?it/s]

  0%|          | 0/558256 [00:00<?, ?it/s]


{'end': Timestamp('2021-08-23 00:00:00', freq='14D'),
 'i_split': 2,
 'start': Timestamp('2021-08-09 00:00:00', freq='14D'),
 'test': 393985,
 'test_items': 6834,
 'test_users': 180610,
 'train': 2323923,
 'train_items': 13619,
 'train_users': 642433}


  0%|          | 0/642433 [00:00<?, ?it/s]

  0%|          | 0/642433 [00:00<?, ?it/s]

  0%|          | 0/642433 [00:00<?, ?it/s]

CPU times: user 1h 11min 54s, sys: 1min 24s, total: 1h 13min 19s
Wall time: 1h 14min 27s


Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25,0.001746,6.04133
cosine_userknn,0.002335,5.376592
tfidf_userknn,0.00445,5.466262


In [160]:
results = []

interactions = Interactions(interactions_df)

k = 10

# Init generator of folds
cv = TimeRangeSplitter(
    test_size="7D",
    n_splits=1,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)
fold_iterator = cv.split(interactions, collect_fold_stats=True)

for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
    print(f"\n==================== Fold {i_fold}")
    pprint(fold_info)

    df_train = interactions.df.iloc[train_ids].copy()
    df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    dataset = Dataset.construct(df_train)

    test_users = list(df_test["user_id"])

    catalog = df_train[Columns.Item].unique()

    for model_name, model in tqdm(models.items()):
        userknn_model = ImplicitItemKNNWrapperModel(model)
        userknn_model.fit(dataset)

        recos = userknn_model.recommend(
            users=test_users,
            dataset=dataset,
            k=k,
            filter_viewed=True,
        )

        metric_values = calc_metrics(
            metrics,
            reco=recos,
            interactions=df_test,
            prev_interactions=df_train,
            catalog=catalog,
        )

        fold = {"fold": i_fold, "model": model_name}
        fold.update(metric_values)
        results.append(fold)


{'end': Timestamp('2021-08-23 00:00:00', freq='7D'),
 'i_split': 0,
 'start': Timestamp('2021-08-16 00:00:00', freq='7D'),
 'test': 298878,
 'test_items': 6679,
 'test_users': 110076,
 'train': 5051815,
 'train_items': 15577,
 'train_users': 906071}


  0%|          | 0/3 [00:00<?, ?it/s]

In [161]:
df_metrics = pd.DataFrame(results)
df_metrics.groupby("model").mean()[metrics.keys()]

Unnamed: 0_level_0,map@10,novelty
model,Unnamed: 1_level_1,Unnamed: 2_level_1
bm25,0.073486,4.017902
cosine_userknn,0.044468,9.444826
tfidf_userknn,0.059698,6.930374


In [162]:
userknn_model = ImplicitItemKNNWrapperModel(BM25Recommender())
userknn_model.fit(dataset)

<rectools.models.implicit_knn.ImplicitItemKNNWrapperModel at 0x12117c8b0>

In [163]:
import pickle

with open("knn_weights.pkl", "wb") as f:
    pickle.dump(userknn_model, f)
