In [52]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset

from pathlib import Path
import typing as tp
from tqdm import tqdm

import optuna
import nmslib

In [2]:
DATA_PATH = Path("../data_original")

In [3]:
%%time
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

CPU times: user 2.79 s, sys: 1.24 s, total: 4.03 s
Wall time: 4.06 s


In [4]:
def prepare_interactions(interactions: pd.DataFrame, test_size=7):
    Columns.Datetime = 'last_watch_dt'
    interactions = interactions.drop(interactions[interactions[Columns.Datetime].str.len() != 10].index)
    interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
    interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)
    max_date = interactions[Columns.Datetime].max()
    train = interactions[interactions[Columns.Datetime] < max_date - pd.Timedelta(days=test_size)].copy()
    test = interactions[interactions[Columns.Datetime] >= max_date - pd.Timedelta(days=test_size)].copy()
    return train, test

In [5]:
train, test = prepare_interactions(interactions)

print(f"train: {train.shape}")
print(f"test: {test.shape}")

train: (4985269, 6)
test: (490982, 6)


In [6]:
train.drop(train.query("total_dur < 300").index, inplace=True)

In [7]:
# отфильтруем холодных пользователей из теста
cold_users = set(test[Columns.User]) - set(train[Columns.User])

In [8]:
test.drop(test[test[Columns.User].isin(cold_users)].index, inplace=True)

In [9]:
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [10]:
def prepare_users(users: pd.DataFrame):
    users.fillna('Unknown', inplace=True)
    user_features_frames = []
    for feature in ["sex", "age", "income"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    user_features = pd.concat(user_features_frames)
    return user_features

In [11]:
user_features = prepare_users(users)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [12]:
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [13]:
def prepare_items(items: pd.DataFrame):
    # Explode genres to flatten table
    items["genre"] = items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"

    # # Explode directors to flatten table
    # items["directors"] = items["directors"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    # director_feature = items[["item_id", "directors"]].explode("directors")
    # director_feature.columns = ["id", "value"]
    # director_feature["feature"] = "directors"
    # director_feature.head()

    item_features_frames = []
    for feature in ["content_type"
                    # , "release_year"
                    ]:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    item_feat = pd.concat(item_features_frames)
    
    item_features = pd.concat((genre_feature, item_feat))

    return item_features

In [14]:
item_features = prepare_items(items)
# item_features = pd.concat((item_features, item_feat))
item_features.shape

(50224, 3)

In [15]:
metrics = {
    'Precision@10': Precision(k=10),
    'Recall@10': Recall(k=10),
    'MAP@10': MAP(k=10),
}

In [16]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = (32, 64)

In [17]:
models = {
    'popular': PopularModel(),
}

In [18]:
implicit_models = {
    'ALS': AlternatingLeastSquares,
}
for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        models[f"{implicit_name}_{n_factors}"] = (
            ImplicitALSWrapperModel(
                model=implicit_model(
                    factors=n_factors, 
                    random_state=RANDOM_STATE, 
                    num_threads=NUM_THREADS,
                ),
                fit_features_together=True,
            )
        )

  check_blas_config()


In [19]:
models

{'popular': <rectools.models.popular.PopularModel at 0x7fceb0379a80>,
 'ALS_32': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fceb037abc0>,
 'ALS_64': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fcee4782d70>}

In [20]:
%%time
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.25 s, sys: 139 ms, total: 1.39 s
Wall time: 1.39 s


In [21]:
TEST_USERS = test[Columns.User].unique()

In [65]:
%%time
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")
    model_quality = {'model': model_name}

    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=K_RECOS,
        filter_viewed=True,
    )
    
    metric_values = calc_metrics(metrics, recos, test, train)
    model_quality.update(metric_values)
    results.append(model_quality)

Fitting model popular...
Fitting model ALS_32...




Fitting model ALS_64...




CPU times: user 35min 27s, sys: 43min 3s, total: 1h 18min 31s
Wall time: 20min 23s


In [66]:
df_quality = pd.DataFrame(results).T

df_quality.columns = df_quality.iloc[0]

df_quality.drop('model', inplace=True)

In [45]:
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_32,ALS_64
Precision@10,0.032803,0.031024,0.030943
Recall@10,0.15607,0.144221,0.14331
MAP@10,0.073836,0.074917,0.073967


### Optuna

In [22]:
def objective(trial):
    param = {
        "factors": trial.suggest_int("factors", 16, 64),
        "regularization": trial.suggest_float("regularization", 0.01, 0.05),
        "alpha": trial.suggest_float("alpha", 0.05, 1),
        "random_state": RANDOM_STATE,
    }
    model = ImplicitALSWrapperModel(
                model=implicit_model(
                    **param
                ),
                fit_features_together=True,
            )
    model.fit(dataset)
    recos = model.recommend(
        users=TEST_USERS,
        dataset=dataset,
        k=10,
        filter_viewed=True,
    )

    metric_value = calc_metrics({"MAP@10": MAP(k=10)}, recos, test, train)["MAP@10"]
    return metric_value

In [23]:
study = optuna.create_study(direction="maximize")
study.enqueue_trial(
    {
        'factors': 30, 
        'regularization': 0.03, 
        'random_state': 42, 
        'alpha': 0.5, 
    }
)

study.optimize(objective, n_trials=10, show_progress_bar=True, n_jobs=-1)

trial = study.best_trial

print(f"Best trial value: {trial.value}")

print("Best params: ")
for key, value in trial.params.items():
    print(f"{key}: {value}")

[I 2023-12-24 11:45:23,163] A new study created in memory with name: no-name-df93be79-a8da-4924-a036-b810bd18a7c3


  0%|          | 0/10 [00:00<?, ?it/s]



[I 2023-12-24 12:22:27,709] Trial 3 finished with value: 0.07503704775299193 and parameters: {'factors': 48, 'regularization': 0.021885984185420697, 'alpha': 0.45581029766356507}. Best is trial 3 with value: 0.07503704775299193.




[I 2023-12-24 12:22:56,367] Trial 1 finished with value: 0.07614787515934723 and parameters: {'factors': 35, 'regularization': 0.015020934354370779, 'alpha': 0.15927943685861523}. Best is trial 1 with value: 0.07614787515934723.




[I 2023-12-24 12:23:48,358] Trial 2 finished with value: 0.07535327615970797 and parameters: {'factors': 42, 'regularization': 0.021465114146510308, 'alpha': 0.5636219256375214}. Best is trial 1 with value: 0.07614787515934723.
[I 2023-12-24 12:23:48,543] Trial 0 finished with value: 0.07564838897280246 and parameters: {'factors': 30, 'regularization': 0.03, 'alpha': 0.5}. Best is trial 1 with value: 0.07614787515934723.




[I 2023-12-24 13:03:00,950] Trial 4 finished with value: 0.07446528969254806 and parameters: {'factors': 16, 'regularization': 0.042432299318720494, 'alpha': 0.9637762169156807}. Best is trial 1 with value: 0.07614787515934723.




[I 2023-12-24 13:06:43,368] Trial 5 finished with value: 0.07442299790127284 and parameters: {'factors': 58, 'regularization': 0.0261793229592841, 'alpha': 0.9291759324175843}. Best is trial 1 with value: 0.07614787515934723.




[I 2023-12-24 13:07:34,454] Trial 7 finished with value: 0.07655059926977245 and parameters: {'factors': 52, 'regularization': 0.03458539892561771, 'alpha': 0.1964018837464644}. Best is trial 7 with value: 0.07655059926977245.
[I 2023-12-24 13:08:15,482] Trial 6 finished with value: 0.07652474221094695 and parameters: {'factors': 41, 'regularization': 0.03105693383299247, 'alpha': 0.11325359890372698}. Best is trial 7 with value: 0.07655059926977245.
[I 2023-12-24 13:16:39,608] Trial 8 finished with value: 0.07483031360255969 and parameters: {'factors': 24, 'regularization': 0.029687236566962608, 'alpha': 0.9020231876385665}. Best is trial 7 with value: 0.07655059926977245.
[I 2023-12-24 13:17:18,706] Trial 9 finished with value: 0.07617435993391572 and parameters: {'factors': 54, 'regularization': 0.04587249802731701, 'alpha': 0.4052353107829702}. Best is trial 7 with value: 0.07655059926977245.
Best trial value: 0.07655059926977245
Best params: 
factors: 52
regularization: 0.03458539

In [25]:
trial.params

{'factors': 52,
 'regularization': 0.03458539892561771,
 'alpha': 0.1964018837464644}

In [26]:
als_model = ImplicitALSWrapperModel(
                model=implicit_model(
                    **trial.params,
                    random_state=RANDOM_STATE
                ),
                fit_features_together=True,
            )

users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

train, _ = prepare_interactions(interactions, test_size=-1)

users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()
user_features = prepare_users(users)

items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()
item_features = prepare_items(items)

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [27]:
als_model.fit(dataset)



<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7fceac58cd90>

In [28]:
TEST_USERS = list(train.user_id.unique())

In [29]:
recos = als_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [30]:
recos.sort_values(by=["user_id", "rank"], ascending=True, inplace=True)

In [31]:
empty_recs = np.zeros(recos.shape[0] * 10, dtype=int) - 1
empty_recs

array([-1, -1, -1, ..., -1, -1, -1])

In [32]:
recos["rrank"] = recos["rank"] - 1
recos["uuid"] = recos["user_id"] * 10 + recos["rrank"]

In [33]:
empty_recs[recos["uuid"].values] = recos["item_id"].values

In [34]:
user_id = 23
empty_recs[user_id * 10: user_id * 10 + K_RECOS] == recos.loc[recos["user_id"] == user_id, "item_id"].values

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [35]:
np.save("../files/als_recos.npy", empty_recs)

### nmslib

In [37]:
user_embeddings, item_embeddings = als_model.get_vectors()

In [38]:
item_embeddings.shape

(15706, 166)

In [42]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [43]:
print('pre shape: ', item_embeddings.shape)
max_norm, augmented_item_embeddings = augment_inner_product(item_embeddings)
augmented_item_embeddings.shape

pre shape:  (15706, 166)


(15706, 167)

In [44]:
extra_zero = np.zeros((user_embeddings.shape[0], 1))
augmented_user_embeddings = np.append(user_embeddings, extra_zero, axis=1)
augmented_user_embeddings.shape

(962179, 167)

In [45]:
user_id = 30

In [46]:
user_embeddings[user_id]

array([ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  9.27601568e-03,  5.30983601e-03,  8.38958286e-03,
        8.32445826e-03,  7.07935449e-03,  3.90429678e-03,  1.01826765e-04,
       -4.87195008e-04,  4.25705966e-03,  4.33627330e-03,  3.40488087e-03,
        5.13034966e-03,  4.77235671e-03,  7.65083404e-03,  4.12185123e-04,
        1.59322843e-03,  7.18094362e-03,  7.77044427e-03,  9.36419982e-03,
        7.87048973e-03,  8.22234992e-03,  2.52533262e-03,  3.48106609e-03,
        1.93481974e-03,  5.35128638e-03,  8.68252479e-03,  2.87740398e-03,
        4.82007582e-03,  2.32109614e-03,  9.62804130e-04,  1.33584556e-03,
        1.42143954e-05,  8.95245560e-03,  3.32466862e-03,  7.66110653e-03,
        6.17265655e-03,  

In [47]:
augmented_user_embeddings[user_id]

array([ 0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  1.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  9.27601568e-03,  5.30983601e-03,  8.38958286e-03,
        8.32445826e-03,  7.07935449e-03,  3.90429678e-03,  1.01826765e-04,
       -4.87195008e-04,  4.25705966e-03,  4.33627330e-03,  3.40488087e-03,
        5.13034966e-03,  4.77235671e-03,  7.65083404e-03,  4.12185123e-04,
        1.59322843e-03,  7.18094362e-03,  7.77044427e-03,  9.36419982e-03,
        7.87048973e-03,  8.22234992e-03,  2.52533262e-03,  3.48106609e-03,
        1.93481974e-03,  5.35128638e-03,  8.68252479e-03,  2.87740398e-03,
        4.82007582e-03,  2.32109614e-03,  9.62804130e-04,  1.33584556e-03,
        1.42143954e-05,  8.95245560e-03,  3.32466862e-03,  7.66110653e-03,
        6.17265655e-03,  

In [48]:
# Set index parameters
# These are the most important ones
M = 48
efC = 100

num_threads = 4
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC, 'post' : 0}
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100, 'post': 0}


In [53]:
# Number of neighbors 
K=10
# Space name should correspond to the space name 
# used for brute-force search
space_name='negdotprod'
index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) 
index.addDataPointBatch(augmented_item_embeddings) 
index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
index.createIndex(index_time_params) 
print('Index-time parameters', index_time_params)

Index-time parameters {'M': 48, 'indexThreadQty': 4, 'efConstruction': 100}


In [54]:
# Setting query-time parameters
efS = 100
query_time_params = {'efSearch': efS}
print('Setting query-time parameters', query_time_params)
index.setQueryTimeParams(query_time_params)

Setting query-time parameters {'efSearch': 100}


In [55]:
augmented_user_embeddings.shape

(962179, 167)

In [56]:
query_matrix = augmented_user_embeddings[:1000, :]

In [57]:
query_matrix.shape

(1000, 167)

In [58]:
user_id = 10973
one_user_matrix = augmented_user_embeddings[user_id, :]

In [59]:
%%time
# Querying
nbrs = index.knnQueryBatch([one_user_matrix], k = K, num_threads = num_threads)

CPU times: user 980 µs, sys: 1.08 ms, total: 2.06 ms
Wall time: 1.46 ms


In [60]:
augmented_user_embeddings.shape[1]

167

In [61]:
nbrs

[(array([ 32,  16,  25,  84, 174,  21,  93, 122, 370,  68], dtype=int32),
  array([-0.10328997, -0.08343284, -0.05778901, -0.05060219, -0.04356086,
         -0.04163088, -0.02579274, -0.02402637, -0.02157612, -0.02016769],
        dtype=float32))]