In [21]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import nmslib

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset, Interactions
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel
from rectools.model_selection import TimeRangeSplitter

import matplotlib.pyplot as plt
import seaborn as sns

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization

# Raise pandas' display limits so wide frames and long text cells are not truncated.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_colwidth', 200),
):
    pd.set_option(option_name, option_value)

# Load Data

In [2]:
# Read the three raw tables; missing user attributes are replaced by the literal 'Unknown'.
users = pd.read_csv('data_original/users.csv').fillna('Unknown')
items = pd.read_csv('data_original/items.csv')
interactions = pd.read_csv('data_original/interactions.csv')

# Preprocess

In [3]:
# Keep only rows whose date string is exactly 10 characters long ('YYYY-MM-DD');
# anything else (including missing values) is discarded.
has_valid_date = interactions['last_watch_dt'].str.len() == 10
# .copy() makes the filtered frame independent, so the column assignments below
# write to a real frame rather than to a slice of the raw one.
interactions = interactions.loc[has_valid_date].copy()
interactions[Columns.Datetime] = pd.to_datetime(interactions['last_watch_dt'], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()

# Rows with 0% watched are removed here, before weights are assigned.
interactions = interactions.loc[interactions['watched_pct'] > 0].copy()
# Weight 5 for more than 10% watched, weight 1 otherwise.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 5, 1)

# Wrap the frame in the rectools container; from here on `interactions.df` is the DataFrame.
interactions = Interactions(interactions)
interactions.df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,datetime,weight
0,176549,9506,2021-05-11,4250,72.0,2021-05-11,5.0
1,699317,1659,2021-05-29,8317,100.0,2021-05-29,5.0
3,864613,7638,2021-07-05,14483,100.0,2021-07-05,5.0
4,964868,9506,2021-04-30,6725,100.0,2021-04-30,5.0
5,1032142,6686,2021-05-13,11286,100.0,2021-05-13,5.0


In [4]:
# Metric classes to evaluate, keyed by the prefix used in the result column names.
metrics_name = {
    'Precision': Precision,
    'Recall': Recall,
    'MAP': MAP,
}

# One metric instance per (metric class, cutoff k) pair, named '<Metric>@<k>'.
metrics = {
    f'{metric_name}@{k}': metric_cls(k=k)
    for metric_name, metric_cls in metrics_name.items()
    for k in [1, 5, 10]
}
metrics

{'Precision@1': Precision(k=1),
 'Precision@5': Precision(k=5),
 'Precision@10': Precision(k=10),
 'Recall@1': Recall(k=1),
 'Recall@5': Recall(k=5),
 'Recall@10': Recall(k=10),
 'MAP@1': MAP(k=1, divide_by_k=False),
 'MAP@5': MAP(k=5, divide_by_k=False),
 'MAP@10': MAP(k=10, divide_by_k=False)}

In [5]:
# Number of items requested per user when generating recommendations.
K_RECOS = 10
# Seed passed to every ALS / LightFM constructor in the grid.
RANDOM_STATE = 42
# Thread count passed to the ALS and LightFM models (hard-coded; adjust to the host).
NUM_THREADS = 24
# Latent sizes swept for ALS (factors) and LightFM (no_components).
N_FACTORS = (64, 128, 256)

# Swept as LightFM epochs AND as ALS iterations in the model grid.
N_EPOCHS = [3, 5, 7]
USER_ALPHA = 0 # LightFM user_alpha
ITEM_ALPHA = 0 # LightFM item_alpha
LEARNING_RATE = [0.5, 0.1, 0.01] # LightFM learning rates to sweep

# Passed to the time-based splitter as n_splits / test_size.
N_SPLITS = 3
TEST_SIZE = '7D'

In [6]:
# Registry of every candidate model, keyed by a name that encodes its hyper-parameters.
models = {'popular': PopularModel()}
implicit_models = {'ALS': AlternatingLeastSquares}
lightfm_losses = ('logistic', 'bpr', 'warp')

# ALS grid: '<name>_<factors>_<iterations>'.
models.update({
    f"{implicit_name}_{n_factors}_{n_epochs}": ImplicitALSWrapperModel(
        model=implicit_model(
            factors=n_factors,
            random_state=RANDOM_STATE,
            num_threads=NUM_THREADS,
            use_gpu=True,
            iterations=n_epochs,
        ),
        fit_features_together=True,
    )
    for implicit_name, implicit_model in implicit_models.items()
    for n_factors in N_FACTORS
    for n_epochs in N_EPOCHS
})

# LightFM grid: 'LightFM_<loss>_<factors>_<epochs>_<learning rate>'.
models.update({
    f"LightFM_{loss}_{n_factors}_{n_epochs}_{lr}": LightFMWrapperModel(
        LightFM(
            no_components=n_factors,
            loss=loss,
            random_state=RANDOM_STATE,
            learning_rate=lr,
            user_alpha=USER_ALPHA,
            item_alpha=ITEM_ALPHA,
        ),
        epochs=n_epochs,
        num_threads=NUM_THREADS,
    )
    for loss in lightfm_losses
    for n_factors in N_FACTORS
    for n_epochs in N_EPOCHS
    for lr in LEARNING_RATE
})

models

{'popular': <rectools.models.popular.PopularModel at 0x7f32916e7ac0>,
 'ALS_64_3': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7af0>,
 'ALS_64_5': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7220>,
 'ALS_64_7': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e54b0>,
 'ALS_128_3': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e5510>,
 'ALS_128_5': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e6b00>,
 'ALS_128_7': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7cd0>,
 'ALS_256_3': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7d60>,
 'ALS_256_5': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7df0>,
 'ALS_256_7': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f32916e7e80>,
 'LightFM_logistic_64_3_0.5': <rectools.models.lightfm.LightFMWrapperModel at 0x7f32916e7fd0>,
 'LightFM_logistic_64_3_0.1': <rectools.models

# Cross-validation (CV)

In [7]:
# Time-based splitter built from the TEST_SIZE / N_SPLITS constants, with all
# three filter flags (already-seen, cold items, cold users) switched on.
cv = TimeRangeSplitter(
    test_size=TEST_SIZE,
    n_splits=N_SPLITS,
    filter_already_seen=True,
    filter_cold_items=True,
    filter_cold_users=True,
)

# NOTE(review): split() presumably returns a lazy, single-pass iterator. If so,
# this cell must be re-run before every re-run of the CV loop. Confirm.
fold_iterator = cv.split(interactions, collect_fold_stats=True)

In [8]:
%%time
# One dict per (model, fold) holding the model name, fold number and every metric value.
results = []

# Ask the splitter for a fresh fold iterator on every run of this cell. A shared,
# generator-style iterator is exhausted after one pass, so re-running the cell
# against it would silently evaluate zero folds and leave `results` empty.
for i_fold, (train_ids, test_ids, fold_info) in enumerate(
    cv.split(interactions, collect_fold_stats=True)
):
    # Train/test split: the test part keeps only the (user, item) columns.
    train = interactions.df.iloc[train_ids].copy()
    test = interactions.df.iloc[test_ids][Columns.UserItem].copy()

    # User features: restrict to users present in this fold's train part and
    # reshape each attribute into long (id, value, feature) form.
    users_fold = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
    user_features_frames = []
    for feature in ["sex", "age", "income"]:
        feature_frame = users_fold.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features = pd.concat(user_features_frames)

    # Item features: restrict to items present in this fold's train part.
    items_fold = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
    # Explode genres to flatten the table: one row per (item, genre).
    items_fold["genre"] = items_fold["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items_fold[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"

    content_feature = items_fold.reindex(columns=[Columns.Item, "content_type"])
    content_feature.columns = ["id", "value"]
    content_feature["feature"] = "content_type"
    item_features = pd.concat((genre_feature, content_feature))

    dataset = Dataset.construct(
        interactions_df=train,
        user_features_df=user_features,
        cat_user_features=["sex", "age", "income"],
        item_features_df=item_features,
        cat_item_features=["genre", "content_type"],
    )
    TEST_USERS = test[Columns.User].unique()

    # Fit every candidate on this fold and score its top-K_RECOS recommendations.
    for model_name, model in models.items():
        print(f"Fitting model {model_name}...")
        model_quality = {'model': model_name, "fold": i_fold}

        model.fit(dataset)
        recos = model.recommend(
            users=TEST_USERS,
            dataset=dataset,
            k=K_RECOS,
            filter_viewed=True,
        )
        metric_values = calc_metrics(metrics, recos, test, train)
        model_quality.update(metric_values)
        results.append(model_quality)

Fitting model popular...
Fitting model ALS_64_3...
Fitting model ALS_64_5...
Fitting model ALS_64_7...
Fitting model ALS_128_3...
Fitting model ALS_128_5...
Fitting model ALS_128_7...
Fitting model ALS_256_3...
Fitting model ALS_256_5...
Fitting model ALS_256_7...
Fitting model LightFM_logistic_64_3_0.5...
Fitting model LightFM_logistic_64_3_0.1...
Fitting model LightFM_logistic_64_3_0.01...
Fitting model LightFM_logistic_64_5_0.5...
Fitting model LightFM_logistic_64_5_0.1...
Fitting model LightFM_logistic_64_5_0.01...
Fitting model LightFM_logistic_64_7_0.5...
Fitting model LightFM_logistic_64_7_0.1...
Fitting model LightFM_logistic_64_7_0.01...
Fitting model LightFM_logistic_128_3_0.5...
Fitting model LightFM_logistic_128_3_0.1...
Fitting model LightFM_logistic_128_3_0.01...
Fitting model LightFM_logistic_128_5_0.5...
Fitting model LightFM_logistic_128_5_0.1...
Fitting model LightFM_logistic_128_5_0.01...
Fitting model LightFM_logistic_128_7_0.5...
Fitting model LightFM_logistic_128_

In [10]:
import gc

# Drop the feature tables left over from the last CV fold, then run a collection pass.
del items_fold
del user_features
del item_features

gc.collect()

0

In [13]:
# Average each metric over folds per model and highlight the best value in every column.
df_metrics = pd.DataFrame(results)
mean_by_model = df_metrics.groupby('model').mean()
mean_by_model[metrics.keys()].style.highlight_max(color='green', axis=0)

Unnamed: 0_level_0,Precision@1,Precision@5,Precision@10,Recall@1,Recall@5,Recall@10,MAP@1,MAP@5,MAP@10
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALS_128_3,0.059086,0.037951,0.024097,0.034001,0.103684,0.126365,0.034001,0.060056,0.063658
ALS_128_5,0.059549,0.037994,0.024179,0.034317,0.103877,0.126745,0.034317,0.060257,0.063886
ALS_128_7,0.064535,0.039931,0.02542,0.037801,0.110362,0.134816,0.037801,0.064981,0.068843
ALS_256_3,0.059073,0.037945,0.024103,0.033996,0.103675,0.126371,0.033996,0.060047,0.063651
ALS_256_5,0.059442,0.037976,0.024167,0.034216,0.103784,0.126637,0.034216,0.060174,0.063801
ALS_256_7,0.060714,0.038442,0.024524,0.034993,0.105367,0.128875,0.034993,0.061257,0.064974
ALS_64_3,0.05913,0.037957,0.024115,0.034031,0.103728,0.12646,0.034031,0.060088,0.0637
ALS_64_5,0.059875,0.038096,0.024252,0.034484,0.104225,0.127182,0.034484,0.060463,0.064105
ALS_64_7,0.069145,0.041787,0.026506,0.040807,0.116153,0.141487,0.040807,0.069094,0.0731
LightFM_bpr_128_3_0.01,0.025531,0.009422,0.00615,0.015885,0.027533,0.034263,0.015885,0.020301,0.021212


# Train full model

In [2]:
# Re-read the raw tables for the full-data fit; missing user attributes become 'Unknown'.
users = pd.read_csv('data_original/users.csv').fillna('Unknown')
items = pd.read_csv('data_original/items.csv')
interactions = pd.read_csv('data_original/interactions.csv')

In [3]:
# Keep only rows whose date string is exactly 10 characters long ('YYYY-MM-DD');
# anything else (including missing values) is discarded.
has_valid_date = interactions['last_watch_dt'].str.len() == 10
# .copy() makes the filtered frame independent, so the column assignments below
# write to a real frame rather than to a slice of the raw one.
interactions = interactions.loc[has_valid_date].copy()
interactions[Columns.Datetime] = pd.to_datetime(interactions['last_watch_dt'], format='%Y-%m-%d')
max_date = interactions[Columns.Datetime].max()

# Rows with 0% watched are removed here, before weights are assigned.
interactions = interactions.loc[interactions['watched_pct'] > 0].copy()
# Weight 5 for more than 10% watched, weight 1 otherwise.
interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 5, 1)

# Wrap the frame in the rectools container; from here on `interactions.df` is the DataFrame.
interactions = Interactions(interactions)
interactions.df.head()

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct,datetime,weight
0,176549,9506,2021-05-11,4250,72.0,2021-05-11,5.0
1,699317,1659,2021-05-29,8317,100.0,2021-05-29,5.0
3,864613,7638,2021-07-05,14483,100.0,2021-07-05,5.0
4,964868,9506,2021-04-30,6725,100.0,2021-04-30,5.0
5,1032142,6686,2021-05-13,11286,100.0,2021-05-13,5.0


In [4]:
# Settings for the single model trained on all data.
# NOTE: N_FACTORS, N_EPOCHS and LEARNING_RATE are rebound from the search grids
# (tuple / lists) used in the cross-validation section to scalars here, so the
# grid-building cell cannot be re-run after this one without first re-running
# the earlier constants cell.
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 24
N_FACTORS = 64

N_EPOCHS = 3 # LightFM epochs
USER_ALPHA = 0 # LightFM user_alpha
ITEM_ALPHA = 0 # LightFM item_alpha
LEARNING_RATE = 0.01 # LightFM learning rate

In [5]:
# LightFM_warp_64_3_0.01
# Build the underlying LightFM estimator first, then hand it to the rectools wrapper.
lightfm_estimator = LightFM(
    no_components=N_FACTORS,
    loss='warp',
    random_state=RANDOM_STATE,
    learning_rate=LEARNING_RATE,
    user_alpha=USER_ALPHA,
    item_alpha=ITEM_ALPHA,
)
model = LightFMWrapperModel(
    lightfm_estimator,
    epochs=N_EPOCHS,
    num_threads=NUM_THREADS,
)

In [7]:
# Keep only users that appear in the interactions, then build one long-format
# (id, value, feature) frame per attribute and stack them.
users = users.loc[users[Columns.User].isin(interactions.df[Columns.User])].copy()
user_features_frames = [
    users.reindex(columns=[Columns.User, feature])
    .rename(columns={Columns.User: "id", feature: "value"})
    .assign(feature=feature)
    for feature in ["sex", "age", "income"]
]
user_features = pd.concat(user_features_frames)

In [8]:
# Keep only items that appear in the interactions.
items = items.loc[items[Columns.Item].isin(interactions.df[Columns.Item])].copy()

# Genres: lower-case the comma-separated string, split it into a list, then
# explode so that each (item, genre) pair gets its own row.
genre_lists = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
items["genre"] = genre_lists
genre_feature = (
    items[["item_id", "genre"]]
    .explode("genre")
    .rename(columns={"item_id": "id", "genre": "value"})
    .assign(feature="genre")
)

# Content type: one row per item.
content_feature = (
    items.reindex(columns=[Columns.Item, "content_type"])
    .rename(columns={Columns.Item: "id", "content_type": "value"})
    .assign(feature="content_type")
)
item_features = pd.concat((genre_feature, content_feature))

In [9]:
# Build the rectools dataset from ALL interactions (no hold-out), with the
# long-format user/item feature frames; every listed feature is declared categorical.
dataset = Dataset.construct(
    interactions_df=interactions.df,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [10]:
# Train the chosen LightFM configuration on the full dataset.
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fac5ccf9f60>

In [11]:
# Pull the learned user and item vectors out of the fitted wrapper.
user_embeddings, item_embeddings = model.get_vectors(dataset)

In [12]:
# Sanity check on the matrix sizes.
# NOTE(review): the width shown is 66 with N_FACTORS = 64; presumably two extra bias-related columns. Confirm.
user_embeddings.shape, item_embeddings.shape

((862388, 66), (14701, 66))

In [13]:
def augment_inner_product(factors):
    """Append one column so that every row of ``factors`` has the same L2 norm.

    The extra coordinate of a row is ``sqrt(max_norm**2 - ||row||**2)``, where
    ``max_norm`` is the largest row norm in ``factors``; the row with the largest
    norm therefore gets an extra coordinate of 0.

    Parameters
    ----------
    factors : 2-D array with one vector per row (norms are taken along axis 1).

    Returns
    -------
    tuple
        ``(max_norm, augmented_factors)`` where ``augmented_factors`` is
        ``factors`` with the extra column appended on the right.
    """
    row_norms = np.linalg.norm(factors, axis=1)
    max_norm = row_norms.max()

    # Whatever each row lacks, in squared norm, relative to the longest row.
    slack = np.sqrt(max_norm ** 2 - row_norms ** 2)
    augmented_factors = np.append(factors, slack[:, np.newaxis], axis=1)
    return max_norm, augmented_factors

# Add the norm-equalising column to the item vectors (one more column than item_embeddings).
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

(14701, 67)

In [14]:
# Users get a zero in the extra dimension, so the added item coordinate
# contributes nothing to a user-item dot product.
n_users = user_embeddings.shape[0]
extra_zero = np.zeros((n_users, 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(862388, 67)

In [17]:
# Index-time parameters for the HNSW index.
# M and efConstruction are passed to createIndex under the keys 'M' and 'efConstruction'.
M = 48
efC = 100

# Used both as 'indexThreadQty' when building and as num_threads when batch-querying.
num_threads = 4
# NOTE: in the cells shown, index_time_params is rebound (without 'post') before
# createIndex is called, so this particular dict is only printed, never used to build.
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [18]:
# Number of neighbours requested per query (same value as K_RECOS).
K=10

In [19]:
# Similarity space used by the index: negative dot product.
# The same space name must be given again when the saved index is re-loaded.
space_name='negdotprod'

In [22]:
# Initialize the library: choose the HNSW method, the space and the dense-vector
# data type, then add the augmented item vectors as the data points to index.
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 

14701

In [23]:
# Create the index.
# index_time_params is rebound here without the 'post' key; this is the dict
# actually handed to createIndex.
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params)
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}


In [24]:
# Query-time parameters: efSearch is set on the live index object.
# NOTE(review): whether this setting survives saveIndex/loadIndex is not shown here. Confirm
# and re-apply it after loading if needed.
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [25]:
# Use the first 1000 augmented user vectors as the query batch for the timing test.
query_matrix = augmented_user_embeddings[:1000, :]

In [26]:
import time

# Querying: time one batched kNN search over the whole query matrix.
query_qty = query_matrix.shape[0]
# perf_counter() is a monotonic, high-resolution clock meant for measuring short
# intervals; time.time() is wall-clock and can jump or be too coarse for a
# search that takes only a few milliseconds.
start = time.perf_counter()
nbrs = index.knnQueryBatch(query_matrix, k = K, num_threads = num_threads)
end = time.perf_counter()
elapsed = end - start
print('kNN time total=%f (sec), per query=%f (sec), per query adjusted for thread number=%f (sec)' %
      (elapsed, elapsed / query_qty, num_threads * elapsed / query_qty))

kNN time total=0.005046 (sec), per query=0.000005 (sec), per query adjusted for thread number=0.000020 (sec)


In [32]:
# Single-vector lookup for the first query row; element 0 of the result is shown as a plain list.
first_query_ids = index.knnQuery(query_matrix[0], k=K)[0]
list(first_query_ids)

[9, 210, 29, 15, 325, 44, 74, 150, 541, 640]

In [33]:
# Persist the index under the name "light_fm"; save_data=True asks nmslib to store the data points as well.
index.saveIndex("light_fm", save_data=True)

In [40]:
import pickle

# Store the augmented user matrix; rows are in the dataset's internal user order,
# so row i belongs to internal user id i of the mapping saved alongside it.
with open("int_u_id2vec.pkl", "wb") as f:
    pickle.dump(augmented_user_embeddings, f)

In [42]:
# External user id -> position of that id in the dataset's user id map.
ext_u_id2int_u_id = {
    ext_u_id: int_u_id
    for int_u_id, ext_u_id in enumerate(dataset.user_id_map.external_ids)
}

In [44]:
# Persist the external -> internal user id mapping next to the user vectors.
with open("ext_u_id2int_u_id.pkl", "wb") as f:
    pickle.dump(ext_u_id2int_u_id, f)

In [47]:
# Position in the dataset's item id map -> external item id.
int_i_id2ext_i_id = dict(enumerate(dataset.item_id_map.external_ids))

In [48]:
# Persist the internal -> external item id mapping, used to translate index results back to item ids.
with open("int_i_id2ext_i_id.pkl", "wb") as f:
    pickle.dump(int_i_id2ext_i_id, f)

In [50]:
# Round-trip check: build an empty index with the same method / space / data type
# as the saved one, then load the saved files into it.
test = nmslib.init(
    method='hnsw',
    space='negdotprod',
    data_type=nmslib.DataType.DENSE_VECTOR,
)

# The index was written with save_data=True, so load the stored data points back
# explicitly. The index method takes load_data; the module-level
# nmslib.loadIndex(index, name) helper has no way to request it.
test.loadIndex("light_fm", load_data=True)

In [51]:
# Same single-vector lookup against the re-loaded index; should match the in-memory result.
reloaded_query_ids = test.knnQuery(query_matrix[0], k=K)[0]
list(reloaded_query_ids)

[9, 210, 29, 15, 325, 44, 74, 150, 541, 640]