In [1]:
import pandas as pd
import numpy as np

from implicit.als import AlternatingLeastSquares

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset

from pathlib import Path
import typing as tp
from tqdm import tqdm

# from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

In [2]:
# Directory that holds the raw users.csv, items.csv and interactions.csv files.
DATA_PATH = Path("../data_original")

In [77]:
%%time
# Resolve the three raw table locations first, then read each of them.
users_csv, items_csv, interactions_csv = (
    DATA_PATH / 'users.csv',
    DATA_PATH / 'items.csv',
    DATA_PATH / 'interactions.csv',
)
users = pd.read_csv(users_csv)
items = pd.read_csv(items_csv)
interactions = pd.read_csv(interactions_csv)

CPU times: user 2.99 s, sys: 782 ms, total: 3.77 s
Wall time: 3.79 s


In [78]:
def prepare_interactions(
    interactions: pd.DataFrame, test_size: int = 7
) -> tp.Tuple[pd.DataFrame, pd.DataFrame]:
    """Clean raw interactions and split them into train and test by date.

    Rows whose ``last_watch_dt`` value is not a 10-character string are
    discarded, the remaining values are parsed with the ``%Y-%m-%d`` format,
    and a ``Columns.Weight`` column is added: 3 where ``watched_pct`` is
    greater than 10, otherwise 1.

    Side effect: ``Columns.Datetime`` is rebound to ``'last_watch_dt'`` for the
    rest of the session, so every later use of ``Columns.Datetime`` refers to
    that column.

    Parameters
    ----------
    interactions : pd.DataFrame
        Raw interactions with at least ``last_watch_dt`` and ``watched_pct``
        columns. The passed frame is not modified.
    test_size : int, default 7
        The split date is ``max(last_watch_dt) - test_size`` days. Rows dated
        on or after the split date go to test, earlier rows go to train. A
        negative value places the split after the latest date, so test is
        empty and train holds every row.

    Returns
    -------
    (train, test) : tuple of pd.DataFrame
        Independent copies of the two partitions.
    """
    Columns.Datetime = 'last_watch_dt'

    # Select with a boolean mask instead of dropping by index label: a
    # label-based drop removes every row sharing a label when the index is
    # not unique. Missing dates compare unequal to 10 and are discarded too.
    has_valid_date = interactions[Columns.Datetime].str.len() == 10
    interactions = interactions.loc[has_valid_date].copy()

    interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime], format='%Y-%m-%d')
    interactions[Columns.Weight] = np.where(interactions['watched_pct'] > 10, 3, 1)

    # Compute the boundary once so both partitions use the same value.
    split_date = interactions[Columns.Datetime].max() - pd.Timedelta(days=test_size)
    train = interactions.loc[interactions[Columns.Datetime] < split_date].copy()
    test = interactions.loc[interactions[Columns.Datetime] >= split_date].copy()
    return train, test

In [6]:
# Split with the function's default test_size and report both partition shapes.
train, test = prepare_interactions(interactions)

print(f"train: {train.shape}", f"test: {test.shape}", sep="\n")

train: (4985269, 6)
test: (490982, 6)


In [81]:
# Discard train interactions whose total_dur is below 300. The comparison is
# negated (rather than written as >= 300) so rows with a missing total_dur are
# kept, exactly as the previous query-based drop did. Rebinding to a filtered
# copy avoids in-place, label-based mutation of the frame.
train = train.loc[~(train["total_dur"] < 300)].copy()

In [82]:
# Cold users: ids that occur in test but have no interactions in train.
# They are filtered out of test below.
cold_users = set(test[Columns.User]).difference(train[Columns.User])

In [83]:
# Keep only test interactions of users that also occur in train. A boolean
# mask plus rebinding replaces the in-place, label-based drop.
test = test.loc[~test[Columns.User].isin(cold_users)].copy()

In [None]:
# Restrict the user table to users that have at least one interaction in train.
users = users.loc[users[Columns.User].isin(train[Columns.User])].copy()

In [96]:
def prepare_users(
    users: pd.DataFrame,
    features: tp.Sequence[str] = ("sex", "age", "income"),
) -> pd.DataFrame:
    """Flatten user attributes into a long ``(id, value, feature)`` table.

    Missing values are replaced with the string ``'Unknown'`` on a copy, so
    the caller's frame is left untouched. For every name in ``features`` one
    frame is produced with the user id in ``id``, the attribute value in
    ``value`` and the attribute name in ``feature``; the frames are stacked in
    the order of ``features`` and keep the original row index.

    Parameters
    ----------
    users : pd.DataFrame
        User table containing the ``Columns.User`` column and the feature
        columns. A feature column that is absent yields missing values.
    features : sequence of str, default ("sex", "age", "income")
        Columns of ``users`` to flatten.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``value``, ``feature``; empty when ``features`` is
        empty.
    """
    # Non-inplace fill: the previous inplace version rewrote the caller's frame.
    filled = users.fillna('Unknown')

    user_features_frames = []
    for feature in features:
        feature_frame = filled.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    # pd.concat raises on an empty list, so return an empty table explicitly.
    if not user_features_frames:
        return pd.DataFrame(columns=["id", "value", "feature"])
    return pd.concat(user_features_frames)

In [89]:
# Build the flattened (id, value, feature) user feature table and preview it.
user_features = prepare_users(users)
user_features.head()

Unnamed: 0,id,value,feature
0,973171,М,sex
1,962099,М,sex
3,721985,Ж,sex
4,704055,Ж,sex
5,1037719,М,sex


In [91]:
# Restrict the item table to items that occur in train interactions.
items = items.loc[items[Columns.Item].isin(train[Columns.Item])].copy()

In [93]:
def prepare_items(
    items: pd.DataFrame,
    features: tp.Sequence[str] = ("content_type",),
) -> pd.DataFrame:
    """Flatten item attributes into a long ``(id, value, feature)`` table.

    The comma-separated ``genres`` string of each item is lower-cased, split,
    and exploded into one row per genre with ``feature == "genre"``. Every
    column named in ``features`` is then appended as one row per item with
    the column name in ``feature``. The original row index is kept.

    The input frame is not modified (the previous version added a ``genre``
    column to it). The id column is addressed by its literal name
    ``item_id`` throughout, as the genre branch already did.

    Parameters
    ----------
    items : pd.DataFrame
        Item table with ``item_id``, ``genres`` and the ``features`` columns.
        A feature column that is absent yields missing values.
    features : sequence of str, default ("content_type",)
        Additional item columns to flatten after the genres.

    Returns
    -------
    pd.DataFrame
        Columns ``id``, ``value``, ``feature``: genre rows first, then the
        rows of each entry of ``features`` in order.
    """
    # "Drama, Comedy" -> ["drama", "comedy"]; computed locally so the caller's
    # frame does not gain a helper column.
    genre_lists = (
        items["genres"]
        .str.lower()
        .str.replace(", ", ",", regex=False)
        .str.split(",")
    )
    genre_feature = pd.DataFrame({"id": items["item_id"], "value": genre_lists}).explode("value")
    genre_feature["feature"] = "genre"

    item_features_frames = [genre_feature]
    for feature in features:
        feature_frame = items.reindex(columns=["item_id", feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)

    # Single concat; also works when ``features`` is empty.
    return pd.concat(item_features_frames)

In [52]:
# Build the flattened (id, value, feature) item feature table and check its size.
item_features = prepare_items(items)
item_features.shape

(50224, 3)

In [53]:
# Ranking metrics to compute, keyed by the label used in the results table.
# Each one is evaluated with k=10.
metrics = dict([
    ('Precision@10', Precision(k=10)),
    ('Recall@10', Recall(k=10)),
    ('MAP@10', MAP(k=10)),
])

In [54]:
# Number of recommendations requested per user.
K_RECOS = 10
# Seed passed to the implicit models.
RANDOM_STATE = 42
# Thread count passed to the implicit models.
NUM_THREADS = 16
# Latent factor sizes to try; one model is registered per value.
N_FACTORS = (32, 64)

In [55]:
# Registry of candidate models keyed by display name; it starts with
# PopularModel, and further models are added to it below.
models = dict(popular=PopularModel())

In [56]:
def _wrap_implicit(model_cls, n_factors):
    """Instantiate ``model_cls`` with ``n_factors`` factors and wrap it in ImplicitALSWrapperModel."""
    base_model = model_cls(
        factors=n_factors,
        random_state=RANDOM_STATE,
        num_threads=NUM_THREADS,
    )
    return ImplicitALSWrapperModel(model=base_model, fit_features_together=True)


# implicit model classes to register, keyed by the prefix used in the model name
implicit_models = {
    'ALS': AlternatingLeastSquares,
}

# One registry entry per (model class, factor size), named "<prefix>_<factors>".
for implicit_name, implicit_model in implicit_models.items():
    for n_factors in N_FACTORS:
        models[f"{implicit_name}_{n_factors}"] = _wrap_implicit(implicit_model, n_factors)

In [57]:
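# NOTE: disabled LightFM experiment. LightFMWrapperModel, LEARNING_RATE,
# USER_ALPHA, ITEM_ALPHA and N_EPOCHS are not defined anywhere in the visible
# notebook and must be provided before this cell is re-enabled.
# lightfm_losses = ('warp',)  # trailing comma needed: a bare ('warp') is a str and would be iterated per character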

# for loss in lightfm_losses:
#     for n_factors in N_FACTORS:
#         models[f"LightFM_{loss}_{n_factors}"] = LightFMWrapperModel(
#             LightFM(
#                 no_components=n_factors, 
#                 loss=loss, 
#                 random_state=RANDOM_STATE,
#                 learning_rate=LEARNING_RATE,
#                 user_alpha=USER_ALPHA,
#                 item_alpha=ITEM_ALPHA,
#             ),
#             epochs=N_EPOCHS,
#             num_threads=NUM_THREADS,
#         )

In [58]:
# Show which models have been registered.
models

{'popular': <rectools.models.popular.PopularModel at 0x7f37bb927f10>,
 'ALS_32': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f37b91d7940>,
 'ALS_64': <rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f37b91d5c00>}

In [62]:
%%time
# Build the evaluation dataset from train interactions plus the flattened
# user and item feature tables. The categorical feature names match what
# prepare_users / prepare_items emit; "kids_flg" was listed before although
# prepare_users never produces it, and the later construct call in this
# notebook already omits it.
dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

CPU times: user 1.27 s, sys: 112 ms, total: 1.39 s
Wall time: 1.39 s


In [63]:
# Users to generate recommendations for during evaluation: every distinct user in test.
TEST_USERS = test[Columns.User].unique()

In [64]:
# pop_recos = pop_model.recommend(
#         users=cold_users,
#         dataset=dataset,
#         k=K_RECOS,
#         filter_viewed=True,
# )
# pop_recos

In [65]:
%%time
# Fit every registered model on the evaluation dataset, recommend K_RECOS
# items to each test user and collect one row of metric values per model.
results = []
for model_name, model in models.items():
    print(f"Fitting model {model_name}...")

    model.fit(dataset)
    recos = model.recommend(users=TEST_USERS, dataset=dataset, k=K_RECOS, filter_viewed=True)

    metric_values = calc_metrics(metrics, recos, test, train)
    # The model name goes first so it becomes the first column of the results frame.
    model_quality = {'model': model_name, **metric_values}
    results.append(model_quality)

Fitting model popular...
Fitting model ALS_32...




Fitting model ALS_64...




CPU times: user 35min 27s, sys: 43min 3s, total: 1h 18min 31s
Wall time: 20min 23s


In [66]:
# One column per model, one row per metric. Setting 'model' as the index
# before transposing keeps the metric values numeric; transposing first and
# then promoting the 'model' row to the header left the frame with object
# dtype and needed an in-place drop.
df_quality = pd.DataFrame(results).set_index('model').T

In [45]:
# axis=1 highlights the maximum within each row, i.e. the best model for each metric.
df_quality.style.highlight_max(color='lightgreen', axis=1)

model,popular,ALS_32,ALS_64
Precision@10,0.032803,0.031024,0.030943
Recall@10,0.15607,0.144221,0.14331
MAP@10,0.073836,0.074917,0.073967


In [97]:
# Model selected for the final fit.
als_model = models["ALS_32"]

# Start again from the raw files, because the frames above were filtered for
# the evaluation split.
users = pd.read_csv(DATA_PATH / 'users.csv')
items = pd.read_csv(DATA_PATH / 'items.csv')
interactions = pd.read_csv(DATA_PATH / 'interactions.csv')

# test_size=-1 moves the split date one day past the latest date, so every
# interaction ends up in train and the test part is empty.
# NOTE(review): unlike the evaluation run, the total_dur < 300 filter is not
# applied to train here - confirm this is intended.
train, _ = prepare_interactions(interactions, test_size=-1)

# Keep only users and items that occur in the interactions.
seen_users = users[Columns.User].isin(train[Columns.User])
users = users.loc[seen_users].copy()
user_features = prepare_users(users)

seen_items = items[Columns.Item].isin(train[Columns.Item])
items = items.loc[seen_items].copy()
item_features = prepare_items(items)

dataset = Dataset.construct(
    interactions_df=train,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type"],
)

In [98]:
# Refit the selected model on the dataset built from all interactions.
als_model.fit(dataset)



<rectools.models.implicit_als.ImplicitALSWrapperModel at 0x7f37b91d7940>

In [100]:
# Despite the name, TEST_USERS now holds every distinct user id in train.
TEST_USERS = list(train.user_id.unique())

In [129]:
# Request K_RECOS recommendations for every user in TEST_USERS from the refitted model.
recos = als_model.recommend(
    users=TEST_USERS,
    dataset=dataset,
    k=K_RECOS,
    filter_viewed=True,
)

In [130]:
# Order rows by user and, within each user, by rank (modifies recos in place).
recos.sort_values(by=["user_id", "rank"], ascending=True, inplace=True)

In [176]:
# Flat lookup table pre-filled with -1. Later cells write each user's item ids
# into the slots starting at user_id * 10, so the table has to reach the last
# slot of the largest user id. The previous length (row count * 10) is
# unrelated to the user id range and could be too short; it is kept as a lower
# bound so the saved array never becomes shorter than it used to be.
n_reco_rows = recos.shape[0]
max_user_id = int(recos["user_id"].max()) if n_reco_rows else -1
n_slots = max(n_reco_rows * 10, (max_user_id + 1) * K_RECOS)
empty_recs = np.full(n_slots, -1, dtype=int)
empty_recs

array([-1, -1, -1, ..., -1, -1, -1])

In [183]:
# Position of each recommendation inside its user's block: rank shifted down by one.
recos["rrank"] = recos["rank"] - 1
# Flat slot index into empty_recs: K_RECOS consecutive slots per user id.
# K_RECOS replaces the literal 10 so the stride follows the configured
# number of recommendations per user.
recos["uuid"] = recos["user_id"] * K_RECOS + recos["rrank"]

In [184]:
# Scatter every recommended item id into its precomputed slot of the flat table.
slot_index = recos["uuid"].to_numpy()
empty_recs[slot_index] = recos["item_id"].to_numpy()

In [192]:
# Spot check for one user: the slots stored in the flat table should equal the
# user's recommended item ids from recos, element by element.
user_id = 23
block_start = user_id * 10
stored_items = empty_recs[block_start: block_start + K_RECOS]
expected_items = recos.loc[recos["user_id"] == user_id, "item_id"].values
stored_items == expected_items

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [194]:
# Persist the flat recommendation table as a .npy file.
np.save("../files/fm_recos.npy", empty_recs)