In [None]:
pip install lightfm

In [None]:
pip install lightgbm

In [None]:
import gc
import pandas as pd
from lightfm.data import Dataset
from lightfm import LightFM
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import lightgbm as lgbm

In [None]:
path = "/content/drive/MyDrive/WB School/data.csv.gzip"
interactions = pd.read_csv(path, compression="gzip")
interactions["order_ts"] = pd.to_datetime(interactions["order_ts"])

# Обучение

## Train/test

Делим на train и test. В test положим 3-ий месяц.

In [None]:
train = interactions[interactions.order_ts <= "2023-02-28 23:59:59.947831"]
test = interactions[interactions.order_ts > "2023-02-28 23:59:59.947831"]

del interactions
gc.collect()

20

Для обучения и валидации моделей первого уровня разделим train на 2 части в соотношении 70/30. На lmf_train обучаем модели первого этапа. На lmf_pred будем обучать бустинг.

In [None]:
lfm_threshold = train["order_ts"].quantile(q=0.7, interpolation="nearest")

lfm_train = train[(train["order_ts"] <= lfm_threshold)]
lfm_pred = train[(train["order_ts"] > lfm_threshold)]

del train
gc.collect()

0

Отбор кандидатов будет проводиться 2 вариантами матричной факторизации, поэтому в lmf_pred оставим только тех пользователей, которые есть в lmf_train.

In [None]:
lfm_pred = lfm_pred[lfm_pred["user_id"].isin(lfm_train["user_id"].unique())]

Сгруппируем датафреймы для дальнейшего преобразования в csr-матрицы.

In [None]:
lfm_train = lfm_train.groupby(["user_id", "item_id"], as_index=False).count().rename(columns={"order_ts": "amount"})
lfm_pred = lfm_pred.groupby(["user_id", "item_id"], as_index=False).count().rename(columns={"order_ts": "amount"})

## Отбор кандидатов

Соберём csr-матрицу по lmf_train с помощью класса lightfm Dataset.

In [None]:
dataset = Dataset()
dataset.fit(lfm_train["user_id"].unique(), lfm_train["item_id"].unique())

interactions_matrix, weights_matrix = dataset.build_interactions(zip(*lfm_train[["user_id", "item_id", "amount"]].values.T))
weights_matrix_csr = weights_matrix.tocsr()

del weights_matrix

lightfm_mapping = dataset.mapping()
lightfm_mapping = {"users_mapping": lightfm_mapping[0],
                   "items_mapping": lightfm_mapping[2]}

lightfm_mapping["users_inv_mapping"] = {v: k for k, v in lightfm_mapping["users_mapping"].items()}
lightfm_mapping["items_inv_mapping"] = {v: k for k, v in lightfm_mapping["items_mapping"].items()}

del dataset, interactions_matrix

gc.collect()

0

Обучим 2 модели - k-OS WARP и LMF.

In [None]:
model_warp_kos = LightFM(
    no_components=30,
    k=3,
    n=11,
    learning_schedule="adagrad",
    loss="warp-kos",
    learning_rate=0.027,
    item_alpha=0.00001,
    user_alpha=0.00014,
    max_sampled=42
)

num_epochs = 20
for _ in tqdm(range(num_epochs)):
    model_warp_kos.fit_partial(weights_matrix_csr)

100%|██████████| 20/20 [13:52<00:00, 41.63s/it]


In [None]:
model_lmf = LightFM(
    no_components=30,
    learning_schedule="adagrad",
    loss="logistic",
    learning_rate=0.019,
    item_alpha=0.0001,
    user_alpha=0.00001
)

num_epochs = 20
for _ in tqdm(range(num_epochs)):
    model_lmf.fit_partial(weights_matrix_csr)

100%|██████████| 20/20 [04:35<00:00, 13.79s/it]


Из обученной модели возьмём кандидаты и их ранги.

In [None]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, user_features, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
  def _recs_mapper(user):
    user_id = user_mapping[user]
    recs = model.predict(user_id, item_ids, user_features=user_features, item_features=item_features, num_threads=num_threads)

    additional_N = len(known_items[user_id]) if user_id in known_items else 0
    total_N = N + additional_N
    top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]

    final_recs = [item_inv_mapping[item] for item in top_cols]
    if additional_N > 0:
        filter_items = known_items[user_id]
        final_recs = [item for item in final_recs if item not in filter_items]
    return final_recs[:N]
  return _recs_mapper

In [None]:
top_N = 30

all_cols = list(lightfm_mapping["items_mapping"].values())

mapper = generate_lightfm_recs_mapper(
    model_warp_kos,
    item_ids=all_cols,
    known_items=dict(),
    N=top_N,
    user_features=None,
    item_features=None,
    user_mapping=lightfm_mapping["users_mapping"],
    item_inv_mapping=lightfm_mapping["items_inv_mapping"],
    num_threads=20
)

candidates = pd.DataFrame({"user_id": lfm_pred["user_id"].unique()})
candidates["item_id"] = candidates["user_id"].map(mapper) # 30 минут
candidates = candidates.explode("item_id")
candidates["rank_warp_kos"] = candidates.groupby("user_id").cumcount() + 1

In [None]:
mapper = generate_lightfm_recs_mapper(
    model_lmf,
    item_ids=all_cols,
    known_items=dict(),
    N=top_N,
    user_features=None,
    item_features=None,
    user_mapping=lightfm_mapping["users_mapping"],
    item_inv_mapping=lightfm_mapping["items_inv_mapping"],
    num_threads=20
)

candidates_lmf = pd.DataFrame({"user_id": lfm_pred["user_id"].unique()})
candidates_lmf["item_id"] = candidates_lmf["user_id"].map(mapper) # 30 минут
candidates_lmf = candidates_lmf.explode("item_id")
candidates_lmf["rank_lmf"] = candidates_lmf.groupby("user_id").cumcount() + 1

Это означает, что lmf рекомендует в одни и те же товары.

In [None]:
candidates.user_id.nunique(), candidates.item_id.nunique()

(573043, 419)

In [None]:
candidates = pd.merge(candidates, candidates_lmf, on=["user_id", "item_id"], how="outer")
del candidates_lmf

gc.collect()

  candidates = pd.merge(candidates, candidates_lmf, on=["user_id", "item_id"], how="outer")


0

In [None]:
candidates.item_id.nunique()

419

Товары из warp_kos содержат все товары из lmf

In [None]:
candidates.head()

Unnamed: 0,user_id,item_id,rank_warp_kos,rank_lmf
0,2,390,1.0,1.0
1,2,192,2.0,2.0
2,2,133,3.0,4.0
3,2,357,4.0,3.0
4,2,212,5.0,8.0


## Ранжирование

In [None]:
# Нужно 2 GB иметь в запасе, т.к. в моменте подскакивает RAM и падает обратно
pos = candidates.merge(lfm_pred,
                       on=["user_id", "item_id"],
                       how="inner")

pos["target"] = 1

Возьмём соотношение 1:5.

In [None]:
neg = candidates.set_index(["user_id", "item_id"]) \
        .join(lfm_pred.set_index(["user_id", "item_id"])) # Подскакивает на 3.41 GB и падает

neg = neg.reset_index()
neg = neg.sample(frac=0.1)

neg["target"] = 0

In [None]:
len(pos) / len(neg)

0.21484730716363348

In [None]:
neg.shape

(5750853, 6)

Соберём датасет для обучения бустинга.

In [None]:
train_users, val_users = train_test_split(lfm_pred["user_id"].unique(),
                                          random_state=1,
                                          test_size=0.3)

In [None]:
select_col = ["user_id", "item_id", "rank_warp_kos", "rank_lmf", "target"]

# Catboost train
lgbm_train = shuffle(pd.concat([pos[pos["user_id"].isin(train_users)],
                                neg[neg["user_id"].isin(train_users)]])[select_col])

# for early stopping
lgbm_val = shuffle(pd.concat([pos[pos["user_id"].isin(val_users)],
                              neg[neg["user_id"].isin(val_users)]])[select_col])

Сверим соотношения.

In [None]:
lgbm_train["target"].value_counts(normalize=True)

0    0.823121
1    0.176879
Name: target, dtype: float64

In [None]:
lgbm_val["target"].value_counts(normalize=True)

0    0.823214
1    0.176786
Name: target, dtype: float64

In [None]:
def get_query_id(df):
  query_map = {}

  for query_id, user_id in enumerate(df['user_id'].unique()):
    query_map[user_id] = query_id

  query_id = df['user_id'].map(query_map)

  return query_id

In [None]:
lgbm_train["query_id"] = get_query_id(lgbm_train)
lgbm_val["query_id"] = get_query_id(lgbm_val)

In [None]:
train_group = lgbm_train["query_id"].value_counts().sort_index().values
val_group = lgbm_val["query_id"].value_counts().sort_index().values

In [None]:
del lgbm_train["query_id"]
del lgbm_val["query_id"]

gc.collect()

0

In [None]:
lgbm_train["item_id"] = lgbm_train["item_id"].astype(np.int64)
lgbm_val["item_id"] = lgbm_val["item_id"].astype(np.int64)

In [None]:
train_lgbm_dataset = lgbm.Dataset(
    data=lgbm_train.drop(columns="target"), label=lgbm_train["target"],
    group=train_group
)

val_lgbm_dataset = lgbm.Dataset(
    data=lgbm_val.drop(columns="target"), label=lgbm_val["target"],
    group=val_group
)

In [None]:
lgbm_train.user_id.nunique()

398748

Обучим бустинг ранжированию.

In [None]:
params = {
    "objective": "lambdarank",
    "learning_rate": 0.1,
    # "max_depth": -1,
    "first_metric_only": True,
    "metric": (
        "lambdarank", "map", "auc"
    ),
    "eval_at": (20)
}

In [2]:
booster = lgbm.train(
    params=params,
    train_set=train_lgbm_dataset,
    num_boost_round=1000,
    valid_sets=[train_lgbm_dataset, val_lgbm_dataset],
    early_stopping_rounds=100,
    verbose_eval=20
)

# Предсказание

В датафрейм *prediction* положим юзеров из тестового периода и соберём датасет, в который включим ранги от моделей первого этапа.

In [None]:
top_N = 30
prediction = pd.DataFrame({"user_id": test["user_id"].unique()})
prediction = prediction[prediction["user_id"].isin(lfm_train["user_id"].unique())]
known_items = lfm_train.groupby("user_id")["item_id"].apply(list).to_dict()

mapper = generate_lightfm_recs_mapper(
    model_warp_kos,
    item_ids=all_cols,
    known_items=known_items,
    N=top_N,
    user_features=None,
    item_features=None,
    user_mapping=lightfm_mapping["users_mapping"],
    item_inv_mapping=lightfm_mapping["items_inv_mapping"],
    num_threads=20
)

prediction["item_id"] = prediction["user_id"].map(mapper)
prediction = prediction.explode("item_id").reset_index(drop=True)
prediction["rank_warp_kos"] = prediction.groupby("user_id").cumcount() + 1

In [None]:
del model_warp_kos

gc.collect()

0

In [None]:
mapper = generate_lightfm_recs_mapper(
    model_lmf,
    item_ids=all_cols,
    known_items=known_items,
    N=top_N,
    user_features=None,
    item_features=None,
    user_mapping=lightfm_mapping["users_mapping"],
    item_inv_mapping=lightfm_mapping["items_inv_mapping"],
    num_threads=20
)

prediction_lmf = pd.DataFrame({"user_id": lfm_pred["user_id"].unique()})
prediction_lmf = prediction_lmf[prediction_lmf["user_id"].isin(lfm_train["user_id"].unique())]
prediction_lmf["item_id"] = prediction_lmf["user_id"].map(mapper) # 30 минут
prediction_lmf = prediction_lmf.explode("item_id")
prediction_lmf["rank_lmf"] = prediction_lmf.groupby("user_id").cumcount() + 1

In [None]:
del model_lmf
del candidates

gc.collect()

0

In [None]:
prediction = pd.merge(prediction, prediction_lmf, on=["user_id", "item_id"], how="outer")
del prediction_lmf

gc.collect()

  prediction = pd.merge(prediction, prediction_lmf, on=["user_id", "item_id"], how="outer")


0

In [None]:
prediction["item_id"] = prediction["item_id"].astype(np.int64)

In [None]:
prediction.head()

Unnamed: 0,user_id,item_id,rank,rank_lmf
0,654067,357,1.0,2.0
1,654067,390,2.0,3.0
2,654067,133,3.0,6.0
3,654067,192,4.0,1.0
4,654067,381,5.0,5.0


Получим предсказания бустинга, принимающего в качестве фичей ранги моделей первого этапа.

In [None]:
lgbm_prediction = booster.predict(prediction)
prediction["lgbm_pred"] = lgbm_prediction
prediction.head()

Unnamed: 0,user_id,item_id,rank,rank_lmf,lgbm_pred
0,654067,357,1.0,2.0,0.75426
1,654067,390,2.0,3.0,0.549938
2,654067,133,3.0,6.0,0.21823
3,654067,192,4.0,1.0,0.437454
4,654067,381,5.0,5.0,0.403778


In [None]:
prediction = prediction.sort_values(
    by=["user_id", "lgbm_pred"], ascending=[True, False])
prediction["rank_lgbm"] = prediction.groupby("user_id").cumcount() + 1
prediction.head()

Unnamed: 0,user_id,item_id,rank,rank_lmf,lgbm_pred,rank_lgbm
20052804,2,213,,25.0,-0.605371,1
20052805,2,105,,26.0,-0.609778,2
20052796,2,162,,17.0,-0.666419,3
20052806,2,217,,27.0,-0.70635,4
20052803,2,168,,24.0,-0.831486,5


In [None]:
prediction.user_id.value_counts()

529194     60
750528     60
996397     60
996396     60
750537     60
           ..
546691     30
546673     30
546670     30
546669     30
1057265    30
Name: user_id, Length: 726266, dtype: int64