In [1]:
from google.colab import drive

drive.mount("/content/drive/")

Mounted at /content/drive/


In [25]:
!pip install implicit rectools lightfm nmslib

Collecting nmslib
  Downloading nmslib-2.1.1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11<2.6.2 (from nmslib)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Building wheels for collected packages: nmslib
  Building wheel for nmslib (setup.py) ... [?25l[?25hdone
  Created wheel for nmslib: filename=nmslib-2.1.1-cp310-cp310-linux_x86_64.whl size=13578647 sha256=f68b3f275caafb6e7fb247df54688e3afbccb96acd94c56121566f933b98e060
  Stored in directory: /root/.cache/pip/wheels/21/1a/5d/4cc754a5b1a88405cad184b76f823897a63a8d19afcd4b9314
Successfully built nmslib
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [87]:
import os
import warnings

warnings.filterwarnings("ignore")
import itertools
import json
import math
import time
import typing as tp
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import nmslib
import numpy as np
import pandas as pd
import seaborn as sns
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from lightfm import LightFM
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.metrics import MAP, Precision, Recall, calc_metrics
from rectools.model_selection import TimeRangeSplitter
from rectools.models import (
    ImplicitALSWrapperModel,
    LightFMWrapperModel,
    PopularModel,
    RandomModel,
)
from tqdm import tqdm

# Load data

In [4]:
interactions = pd.read_csv(
    "/content/drive/MyDrive/itmo_recsys/kion_dataset/interactions.csv"
)
users = pd.read_csv("/content/drive/MyDrive/itmo_recsys/kion_dataset/users.csv")
items = pd.read_csv("/content/drive/MyDrive/itmo_recsys/kion_dataset/items.csv")

# Preprocess

In [5]:
Columns.Datetime = "last_watch_dt"

interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True
)
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format="%Y-%m-%d"
)
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

In [6]:
interactions = Interactions(interactions)

In [7]:
interactions.df.shape

(5476251, 6)

In [8]:
metrics_name = {
    "Precision": Precision,
    "Recall": Recall,
    "MAP": MAP,
}

metrics = {}
for metric_name, metric in metrics_name.items():
    for k in range(1, 11):
        metrics[f"{metric_name}@{k}"] = metric(k=k)

In [9]:
N_SPLITS = 3
TEST_SIZE = "14D"

cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

In [10]:
def cv_train(cv, interactions, users, items, model, metrics):
    results = []

    fold_iterator = cv.split(interactions, collect_fold_stats=True)

    for i_fold, (train_ids, test_ids, fold_info) in enumerate(fold_iterator):
        # print(f"\n==================== Fold {i_fold}")
        # pprint(fold_info)

        df_train = interactions.df.iloc[train_ids].copy()
        df_test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

        tmp_users = users.loc[users[Columns.User].isin(df_train[Columns.User])].copy()
        user_features_frames = []
        for feature in ["sex", "age", "income"]:
            feature_frame = tmp_users.reindex(columns=[Columns.User, feature])
            feature_frame.columns = ["id", "value"]
            feature_frame["feature"] = feature
            user_features_frames.append(feature_frame)
        user_features = pd.concat(user_features_frames)

        tmp_items = items.loc[items[Columns.Item].isin(df_train[Columns.Item])].copy()
        tmp_items["genre"] = (
            tmp_items["genres"]
            .str.lower()
            .str.replace(", ", ",", regex=False)
            .str.split(",")
        )
        genre_feature = tmp_items[["item_id", "genre"]].explode("genre")
        genre_feature.columns = ["id", "value"]
        genre_feature["feature"] = "genre"
        content_feature = tmp_items.reindex(columns=[Columns.Item, "content_type"])
        content_feature.columns = ["id", "value"]
        content_feature["feature"] = "content_type"
        item_features = pd.concat((genre_feature, content_feature))

        dataset = Dataset.construct(
            interactions_df=df_train,
            user_features_df=user_features,
            cat_user_features=["sex", "age", "income"],
            item_features_df=item_features,
            cat_item_features=["genre", "content_type"],
        )
        TEST_USERS = df_test[Columns.User].unique()
        model.fit(dataset)
        recos = model.recommend(
            users=TEST_USERS,
            dataset=dataset,
            k=10,
            filter_viewed=True,
        )
        metrics_values = calc_metrics(metrics, recos, df_test, df_train)
    return metrics_values

In [11]:
%%time


def sample_hyperparameters():
    while True:
        yield {
            "no_components": np.random.randint(32, 256),
            "loss": np.random.choice(["bpr", "warp", "logistic"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "num_epochs": np.random.randint(1, 4),
        }


results_metrics = []
results_hyperparams = []
cnt = 0

for hyperparams in itertools.islice(sample_hyperparameters(), 50):
    num_epochs = hyperparams.pop("num_epochs")
    model = LightFMWrapperModel(
        LightFM(**hyperparams, random_state=42),
        epochs=num_epochs,
        num_threads=2,
    )
    metrics_values = cv_train(
        cv=cv,
        interactions=interactions,
        users=users,
        items=items,
        model=model,
        metrics=metrics,
    )
    hyperparams["num_epochs"] = num_epochs
    results_metrics.append(metrics_values)
    results_hyperparams.append(hyperparams)
    cnt += 1
    print(f"CNT: {cnt}")
    print()

CNT: 1

CNT: 2

CNT: 3

CNT: 4

CNT: 5

CNT: 6

CNT: 7

CNT: 8

CNT: 9

CNT: 10

CNT: 11

CNT: 12

CNT: 13

CNT: 14

CNT: 15

CNT: 16

CNT: 17

CNT: 18

CNT: 19

CNT: 20

CNT: 21

CNT: 22

CNT: 23

CNT: 24

CNT: 25

CNT: 26

CNT: 27

CNT: 28

CNT: 29

CNT: 30

CNT: 31

CNT: 32

CNT: 33

CNT: 34

CNT: 35

CNT: 36

CNT: 37

CNT: 38

CNT: 39

CNT: 40

CNT: 41

CNT: 42

CNT: 43

CNT: 44

CNT: 45

CNT: 46

CNT: 47

CNT: 48

CNT: 49

CNT: 50

CPU times: user 10h 27min 12s, sys: 40min 52s, total: 11h 8min 5s
Wall time: 7h 13min 45s


In [12]:
results_metrics_df = pd.DataFrame(results_metrics)

In [13]:
results_hyperparams_df = pd.DataFrame(results_hyperparams)

In [14]:
results_metrics_df.to_csv("results_metrics_df.csv", sep=";", index=False)
results_hyperparams_df.to_csv("results_hyperparams_df.csv", sep=";", index=False)

In [15]:
!cp results_metrics_df.csv /content/drive/MyDrive/itmo_recsys/lesson_4
!cp results_hyperparams_df.csv /content/drive/MyDrive/itmo_recsys/lesson_4

In [8]:
results_metrics_df = pd.read_csv(
    "/content/drive/MyDrive/itmo_recsys/lesson_4/results_metrics_df.csv", sep=";"
)
results_hyperparams_df = pd.read_csv(
    "/content/drive/MyDrive/itmo_recsys/lesson_4/results_hyperparams_df.csv", sep=";"
)

In [13]:
results_metrics_df.style.highlight_max(color="lightgreen", axis=0)

Unnamed: 0,Precision@1,Recall@1,Precision@2,Recall@2,Precision@3,Recall@3,Precision@4,Recall@4,Precision@5,Recall@5,Precision@6,Recall@6,Precision@7,Recall@7,Precision@8,Recall@8,Precision@9,Recall@9,Precision@10,Recall@10,MAP@1,MAP@2,MAP@3,MAP@4,MAP@5,MAP@6,MAP@7,MAP@8,MAP@9,MAP@10
0,0.000429,8.7e-05,0.000666,0.000277,0.000446,0.000279,0.000338,0.00028,0.000273,0.000283,0.000244,0.0003,0.000267,0.000378,0.000262,0.00042,0.000254,0.000458,0.000242,0.000489,8.7e-05,0.000182,0.000183,0.000183,0.000184,0.000187,0.000198,0.000203,0.000207,0.00021
1,0.105831,0.049312,0.089808,0.081983,0.078949,0.10592,0.070692,0.125125,0.063778,0.139031,0.057834,0.149586,0.052848,0.157893,0.048866,0.165419,0.045608,0.172175,0.042912,0.178956,0.049312,0.066738,0.075635,0.081159,0.084484,0.086651,0.088132,0.08932,0.090307,0.091187
2,0.034822,0.017449,0.027389,0.026297,0.023116,0.032373,0.020398,0.037116,0.018415,0.041172,0.017032,0.044928,0.015914,0.048422,0.014978,0.051523,0.014198,0.054467,0.013542,0.057154,0.017449,0.022107,0.024302,0.025624,0.02653,0.027225,0.027787,0.028226,0.028601,0.028907
3,0.099394,0.045744,0.08409,0.075483,0.074038,0.098207,0.066605,0.116278,0.060734,0.130958,0.055563,0.142047,0.051312,0.151639,0.047943,0.160346,0.045014,0.168166,0.04255,0.175412,0.045744,0.061584,0.069946,0.075116,0.078598,0.080851,0.082514,0.083878,0.084982,0.085925
4,0.095292,0.043748,0.08223,0.074287,0.072693,0.09679,0.065139,0.114413,0.059109,0.128173,0.054296,0.139818,0.050383,0.15017,0.047098,0.159461,0.044418,0.168139,0.042121,0.17623,0.043748,0.059936,0.068243,0.07325,0.076489,0.078807,0.080609,0.082017,0.083212,0.084236
5,0.046587,0.023773,0.033002,0.032847,0.025633,0.037885,0.021185,0.041148,0.018246,0.043887,0.016122,0.046084,0.014483,0.047891,0.013227,0.049491,0.012251,0.051027,0.011456,0.052546,0.023773,0.028557,0.030344,0.031214,0.031799,0.032189,0.032464,0.03268,0.032864,0.033028
6,0.043679,0.022262,0.033526,0.033126,0.027829,0.040523,0.023892,0.045581,0.021216,0.049905,0.019221,0.053403,0.017652,0.056474,0.016448,0.05943,0.015454,0.062155,0.014572,0.064585,0.022262,0.027962,0.030631,0.032014,0.032963,0.033624,0.034118,0.034536,0.034886,0.035167
7,0.036827,0.017854,0.028939,0.027372,0.024707,0.034226,0.021546,0.038945,0.019304,0.042905,0.017721,0.046622,0.016486,0.049897,0.015428,0.052798,0.014577,0.055525,0.013879,0.058177,0.017854,0.022831,0.02529,0.026585,0.027472,0.028166,0.028695,0.029108,0.029449,0.029758
8,0.088611,0.04129,0.077103,0.070264,0.06929,0.093848,0.062798,0.111983,0.057155,0.125837,0.052449,0.136912,0.04845,0.145769,0.045067,0.153616,0.042257,0.160724,0.03998,0.167777,0.04129,0.0566,0.065242,0.070384,0.073646,0.075863,0.077413,0.078613,0.079601,0.080494
9,0.000429,8.7e-05,0.000666,0.000277,0.000446,0.000279,0.000359,0.000291,0.000342,0.000346,0.00029,0.000361,0.000269,0.000384,0.000301,0.000544,0.000271,0.000547,0.000265,0.000588,8.7e-05,0.000182,0.000183,0.000186,0.000197,0.000199,0.000203,0.000223,0.000223,0.000227


In [14]:
results_metrics_df[["MAP@10"]].style.highlight_max(color="lightgreen", axis=0)

Unnamed: 0,MAP@10
0,0.00021
1,0.091187
2,0.028907
3,0.085925
4,0.084236
5,0.033028
6,0.035167
7,0.029758
8,0.080494
9,0.000227


Для метрики MAP@10 самое большое значение получилось при 22 наборе гиперпараметров

In [23]:
results_hyperparams_df.iloc[22].to_dict()

{'no_components': 147,
 'loss': 'warp',
 'learning_rate': 0.0113568936785434,
 'item_alpha': 2.4930120203977194e-09,
 'user_alpha': 3.827995305671015e-09,
 'num_epochs': 1}

Обучим с такими гиперпараметрами

In [9]:
interactions = pd.read_csv(
    "/content/drive/MyDrive/itmo_recsys/kion_dataset/interactions.csv"
)
users = pd.read_csv("/content/drive/MyDrive/itmo_recsys/kion_dataset/users.csv")
items = pd.read_csv("/content/drive/MyDrive/itmo_recsys/kion_dataset/items.csv")

Columns.Datetime = "last_watch_dt"

interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True
)
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format="%Y-%m-%d"
)
max_date = interactions[Columns.Datetime].max()
interactions[Columns.Weight] = np.where(interactions["watched_pct"] > 10, 3, 1)

users.fillna("Unknown", inplace=True)

In [10]:
interactions = Interactions(interactions)

In [11]:
RANDOM_STATE = 42
FINAL_HYPERPARAMS = results_hyperparams_df.iloc[22].to_dict()
N_EPOCHS = FINAL_HYPERPARAMS.pop("num_epochs")
NUM_THREADS = 2

model = LightFMWrapperModel(
    LightFM(**FINAL_HYPERPARAMS, random_state=RANDOM_STATE),
    epochs=N_EPOCHS,
    num_threads=NUM_THREADS,
)

In [12]:
users = users.loc[users[Columns.User].isin(interactions.df[Columns.User])].copy()
user_features_frames = []

for feature in ["sex", "age", "income"]:
    feature_frame = users.reindex(columns=[Columns.User, feature])
    feature_frame.columns = ["id", "value"]
    feature_frame["feature"] = feature
    user_features_frames.append(feature_frame)
user_features = pd.concat(user_features_frames)


items = items.loc[items[Columns.Item].isin(interactions.df[Columns.Item])].copy()
items["genre"] = (
    items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
)
genre_feature = items[["item_id", "genre"]].explode("genre")
genre_feature.columns = ["id", "value"]
genre_feature["feature"] = "genre"

content_feature = items.reindex(columns=[Columns.Item, "content_type"])
content_feature.columns = ["id", "value"]
content_feature["feature"] = "content_type"
item_features = pd.concat((genre_feature, content_feature))

In [13]:
dataset = Dataset.construct(
    interactions_df=interactions.df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [14]:
%%time

model.fit(dataset)

CPU times: user 2min 27s, sys: 842 ms, total: 2min 27s
Wall time: 1min 46s


<rectools.models.lightfm.LightFMWrapperModel at 0x7e94e6a94c40>

In [15]:
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [16]:
user_embeddings.shape, item_embeddings.shape

((962179, 149), (15706, 149))

In [17]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()

    extra_dim = np.sqrt(max_norm**2 - normed_factors**2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [18]:
print("pre shape: ", item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15706, 149)


(15706, 150)

In [19]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(962179, 150)

In [22]:
M = 48
efC = 100

num_threads = 4
index_time_params = {
    "M": M,
    "indexThreadQty": num_threads,
    "efConstruction": efC,
    "post": 0,
}
print("Index-time parameters", index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [23]:
# Number of neighbors
K = 10

In [24]:
# Space name should correspond to the space name
# used for brute-force search
space_name = "negdotprod"

In [27]:
# Intitialize the library, specify the space, the type of the vector and add data points
index = nmslib.init(
    method="hnsw", space=space_name, data_type=nmslib.DataType.DENSE_VECTOR
)
index.addDataPointBatch(augmented_item_embeddings)

15706

In [29]:
# Create an index
start = time.time()
index_time_params = {"M": M, "indexThreadQty": num_threads, "efConstruction": efC}
index.createIndex(index_time_params)
end = time.time()
print("Index-time parameters", index_time_params)
print("Indexing time = %f" % (end - start))

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}
Indexing time = 1.899392


In [30]:
# Setting query-time parameters
efS = 100
query_time_params = {"efSearch": efS}
print("Setting query-time parameters", query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [31]:
query_matrix = augmented_user_embeddings[:1000, :]

In [32]:
# Querying
query_qty = query_matrix.shape[0]
start = time.time()
nbrs = index.knnQueryBatch(query_matrix, k=K, num_threads=num_threads)
end = time.time()
print(
    "kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)"
    % (
        end - start,
        float(end - start) / query_qty,
        num_threads * float(end - start) / query_qty,
    )
)

kNN time total=0.062700 (sec), per query=0.000063 (sec), per query adjusted for thread number=0.000251 (sec)


In [33]:
nbrs[0]

(array([ 32,  16, 235,  25,  10,  84,  18,  51, 174, 282], dtype=int32),
 array([51.393322, 51.646618, 51.978268, 51.989555, 52.163624, 52.202225,
        52.423367, 52.494106, 52.523647, 52.526386], dtype=float32))

Попробуем метод recommend

In [34]:
lightfm_recos = model.recommend(
    users=np.unique(interactions.df[Columns.User]),
    dataset=dataset,
    k=10,
    filter_viewed=True,
)

In [43]:
lightfm_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,0,10440,-48.290535,1
1,0,13865,-48.357216,2
2,0,3734,-48.773216,3
3,0,142,-48.874687,4
4,0,4151,-48.941368,5


In [44]:
final_recos = lightfm_recos.groupby("user_id").agg({"item_id": list})

In [45]:
final_recos.head()

Unnamed: 0_level_0,item_id
user_id,Unnamed: 1_level_1
0,"[10440, 13865, 3734, 142, 4151, 7829, 2657, 86..."
1,"[15297, 4151, 9728, 2657, 3734, 13865, 4880, 1..."
2,"[7829, 3182, 9728, 7626, 13865, 5411, 10942, 1..."
3,"[15297, 13865, 9996, 6809, 4740, 8636, 12995, ..."
4,"[15297, 10440, 9728, 13865, 3734, 4151, 2657, ..."


In [50]:
final_recos.shape

(962179, 1)

In [48]:
dataset.user_id_map.external_ids.shape

(962179,)

In [49]:
dataset.item_id_map.external_ids.shape

(15706,)

In [51]:
lightfm_recos.head()

Unnamed: 0,user_id,item_id,score,rank
0,0,10440,-48.290535,1
1,0,13865,-48.357216,2
2,0,3734,-48.773216,3
3,0,142,-48.874687,4
4,0,4151,-48.941368,5


In [59]:
user_mapping = {}
item_mapping = {}


for internal, external in enumerate(dataset.user_id_map.external_ids):
    user_mapping[internal] = external

for internal, external in enumerate(dataset.item_id_map.external_ids):
    item_mapping[internal] = external

In [68]:
lightfm_recos_final = lightfm_recos.copy()
lightfm_recos_final["user_id"] = lightfm_recos_final["user_id"].map(user_mapping)
lightfm_recos_final["item_id"] = lightfm_recos_final["item_id"].map(item_mapping)

In [114]:
final_recos = (
    lightfm_recos_final.groupby("user_id").agg({"item_id": list}).reset_index()
)

In [115]:
final_recos.head()

Unnamed: 0,user_id,item_id
0,0.0,"[10766.0, 802.0, 10753.0, 5781.0, 4225.0, 2758..."
1,1.0,"[10766.0, 802.0, 5781.0, 10753.0, 4880.0, 1263..."
2,3.0,"[12635.0, 10753.0, 8151.0, 10766.0, 9766.0, 42..."
3,4.0,"[5781.0, 802.0, 10766.0, 10753.0, 2758.0, 4225..."
4,7.0,"[5781.0, 802.0, 10766.0, 2758.0, 4225.0, 10753..."


In [116]:
def fill_na(items):
    POPULAR_RECOS = [
        202457,
        193123,
        132865,
        122119,
        91167,
        74803,
        68581,
        55043,
        45367,
        40372,
    ]
    final_items_list = []
    for item in items:
        if math.isnan(item):
            for popular in POPULAR_RECOS:
                if popular not in final_items_list:
                    final_items_list.append(int(popular))
                    break
        else:
            if item not in final_items_list:
                final_items_list.append(int(item))
    return final_items_list

In [117]:
final_recos["item_id"] = final_recos["item_id"].apply(fill_na)

In [118]:
final_recos["len"] = final_recos["item_id"].apply(lambda x: len(x))
final_recos["nunique"] = final_recos["item_id"].apply(lambda x: len(set(x)))
final_recos[final_recos["len"] != 10]

Unnamed: 0,user_id,item_id,len,nunique


In [120]:
final_recos[final_recos["nunique"] != 10]

Unnamed: 0,user_id,item_id,len,nunique


In [121]:
final_dict = dict(zip(final_recos["user_id"].astype(int), final_recos["item_id"]))

In [122]:
with open("lightfm_recos_final_2.json", "w") as file:
    json.dump(final_dict, file)

In [123]:
!cp lightfm_recos_final_2.json /content/drive/MyDrive/itmo_recsys/lesson_4