
## Import Libraries

In [None]:
!pip install rectools==0.3.0

In [2]:
import os

In [3]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [4]:
import warnings

warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
import numpy as np
import dill

from implicit.als import AlternatingLeastSquares
import itertools

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

import timeit

In [6]:
np.random.seed(1234)

## Loading Data

In [7]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

# from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

!unzip kion_train.zip

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [593]:
interactions = pd.read_csv('kion_train/interactions.csv')

users = pd.read_csv('kion_train/users.csv')
users.fillna("Unknown", inplace=True)
users["kids_flg"] = users["kids_flg"].astype("str")

items = pd.read_csv('kion_train/items.csv')

## Preprocessing

In [594]:
def headtail(df: pd.DataFrame):
    return pd.concat([df.head(), df.tail()])

In [595]:
Columns.Datetime = "datetime"
interactions.rename(columns={'last_watch_dt': "datetime"}, inplace=True) 

In [596]:
interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True
)

In [597]:
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format="%Y-%m-%d"
)

In [598]:
# 0 - 100 | 0-10=1 10-30=2 30-60=3 60-100=5
def f(pct):
    if pct < 10:
        return 1
    elif pct < 30:
        return 2
    elif pct < 60:
        return 3
    elif pct < 85:
        return 4
    return 5

interactions[Columns.Weight] = interactions["watched_pct"].apply(lambda x: f(x))
interactions

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,4
1,699317,1659,2021-05-29,8317,100.0,5
2,656683,7107,2021-05-09,10,0.0,1
3,864613,7638,2021-07-05,14483,100.0,5
4,964868,9506,2021-04-30,6725,100.0,5
...,...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0,1
5476247,546862,9673,2021-04-13,2308,49.0,3
5476248,697262,15297,2021-08-20,18307,63.0,4
5476249,384202,16197,2021-04-19,6203,100.0,5


## Обучение модели на полном датасете

Выбираем лучшую модель по итогам валидации и производим обучение на всем датасете

Необходимо заново собрать датасет, используя все данные. Особенности:
1. Необходим полный датасет `users`
2. Необходим полный датасет `items`
3. Логику преобразования датасета взаимодействий оставляем ту же

Изменений касательно генерации фичей нет и дополнительной логики не нужно. Обернем все преобразования в функцию и произведем вызов.

In [None]:
def get_features(users: pd.DataFrame, items: pd.DataFrame, for_hot=True):
    
    # Generate user/item features for hot users, i.e. syncronise with interactions
    if for_hot:
        users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    
    # Deal with user features
    
    users.fillna("Unknown", inplace=True)
    users = users.astype({"kids_flg": bool})
    
    user_features_frames = []
    for feature in ["sex", "age", "income", "kids_flg"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    
    user_features = pd.concat(user_features_frames)
    
    # Deal with item features
    # Genre

    items["genre"] = (
        items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    )
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    genre_feature.head()

    # Release year

    items["release_year"].fillna(int(items["release_year"].max()), inplace=True)
    items = items.astype({"release_year": int})

    year_from = 1977
    step = 5
    bins = [
        year
        for year in range(
            year_from, items["release_year"].max() + step, step
        )
    ]

    bins_bias = [item + 1 for item in bins]
    pairs_strict = list(zip(bins_bias, bins[1:]))
    # Add the most first release year
    bins = [items["release_year"].min()] + bins
    # Add interval for it
    pairs_strict = [(items["release_year"].min(), bins[1])] + pairs_strict
    # Generate feature values
    labels = [f'year_{item[0]}_{item[1]}' for item in pairs_strict]

    year_bins = pd.cut(items["release_year"], bins=bins, labels=labels, include_lowest=True)

    items['release_year'] = year_bins.astype(str)

    # Age rating

    items['age_rating'].fillna(0, inplace=True);
    items = items.astype({"age_rating": int})

    # For kids

    items["for_kids"].loc[items["age_rating"] > 12] = 0.0
    items["for_kids"].loc[items["age_rating"] <= 12] = 1.0
    items = items.astype({"for_kids": bool})
    
    item_features_frames = []
    for feature in ["content_type", "release_year", "age_rating", "for_kids"]:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    item_features_frames.append(genre_feature)
    item_features = pd.concat(item_features_frames)
    headtail(item_features)

    return user_features, item_features

In [None]:
user_features, item_features = get_features(users, items)
full_user_features, _ = get_features(users, items, for_hot=False)

Создаем датасет

In [None]:
%%time
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "release_year", "for_kids", "age_rating"],
)

CPU times: user 761 ms, sys: 16.9 ms, total: 778 ms
Wall time: 785 ms


Фиттим

In [None]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [None]:
model = LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [None]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fb87a89df10>

Сохраним модель

In [None]:
save_name = "END_MODEL"
with open(f'/content/drive/MyDrive/RecSys MTC/practice4/models/{save_name}.dill', 'wb') as f:
    dill.dump(model, f)

Загружаем модель

In [None]:
with open(f'/content/drive/MyDrive/RecSys MTC/practice4/models/{save_name}.dill', 'rb') as f:
    model = dill.load(f)

# 1. Approximate Nearest Neighbors

## 1.1. Создание индекса для приближенного поиска

In [None]:
import dill
import nmslib
import time

with open("/content/drive/MyDrive/RecSys MTC/practice4/models/ULTRA_LAST_MODEL.dill", "rb") as f:
    model = dill.load(f)

In [None]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [None]:
user_vectors, item_vectors = model.get_vectors(dataset=dataset, add_biases=True)
print("Размерность до", user_vectors.shape, item_vectors.shape)

max_norm, augmented_item_vectors = augment_inner_product(item_vectors)

extra_zero = np.zeros((user_vectors.shape[0], 1))
augmented_user_vectors = np.append(user_vectors, extra_zero, axis=1)
print("Размерность после", augmented_user_vectors.shape, augmented_item_vectors.shape)

Размерность до (302486, 34) (15484, 34)
Размерность после (302486, 35) (15484, 35)


In [None]:
user_internal = dataset.user_id_map.internal_ids
user_external = dataset.user_id_map.external_ids

item_internal = dataset.item_id_map.internal_ids
item_external = dataset.item_id_map.external_ids

In [None]:
user_mapping = {k.item(): v.item() for k, v in zip(user_external, user_internal)}
item_mapping = {k.item(): v.item() for k, v in zip(item_external, item_internal)}

user_inv_mapping = {k.item(): v.item() for k, v in zip(user_internal, user_external)}
item_inv_mapping = {k.item(): v.item() for k, v in zip(item_internal, item_external)}


with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_mapping.dill", "wb") as f:
    dill.dump(user_mapping, f)
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_item_inv_mapping.dill", "wb") as f:
    dill.dump(item_inv_mapping, f)

Параметры Индекса:
- M - количество соседних вершин у каждой вершины в индексе. Чем больше, тем больше потребление памяти.

- efConstruction - тот же смысл что и у efSearch, но контролирует index_time/index_accuracy. Большее значение ведёт к лучшему индексу. В какой-то момент увеличесние efConstruction не улучшает качество индекса.

- num_threads - количество потоков для построения инднекса
- space - способ вычисления скора между айтемом и пользователем

Параметры Поиска по индексу:
- K - число искомых ближайших айтемов 
- efSearch - сколько раз мы ищем в Индексе, должно быть больше K и меньше числа уникальных айтемов. Чем больше, тем точнее поиск, но медленее
- num_threads - количество потоков для поиска

In [None]:
from tqdm.notebook import tqdm

def create_index(augmented_item_vectors, M: int, efC: int, num_threads: int, space_name: str="negdotprod"):
    index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
    print('Параметры индекса', index_time_params)

    # Инициализация Индекса, узказание пространства поиска, типа векторов 
    index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
    # Добавление векторов айтемов
    index.addDataPointBatch(augmented_item_vectors)

    # Создание Индекса
    start = time.time()
    index.createIndex(index_time_params) 
    end = time.time() 
    print('Время создания = %f' % (end-start))
    return index


# Recall = TP / (TP + FN) = TP / 10
def calculate_ann_recall(
    hot_users,
    augmented_user_vectors,
    dataset: Dataset,
    model: LightFMWrapperModel, 
    index,
    K,
    item_mapping,
):
    TP = 0
    all = 0
    for user_id in tqdm(hot_users):
        internal_user_id = int(dataset.user_id_map.convert_to_internal([user_id])[0])

        target_items = model.recommend(
            [user_id], 
            dataset=dataset,
            k=10, 
            filter_viewed=False,
            add_rank_col=False,
            items_to_recommend=dataset.item_id_map.external_ids
        ).item_id.to_numpy()
        target_items = [item_mapping[ex_i] for ex_i in target_items]

        user_vector = augmented_user_vectors[internal_user_id]
        predicted_items = index.knnQuery(vector=user_vector, k=K)[0]
        
        TP += np.isin(target_items, predicted_items).sum()
        all += K

    recall = TP / all
    print("Recall = ", recall)


Подберём параметры индекса ```M, efConstruction, efSearch```, чтобы Recall возвращаемых айтемов был наибольшим для первых 100 пользователей из обучающего набора:

Эмперически подобрали параметры, дающие лучший Recall:
- efSearch = 50
- efConstruction = 50
- M = 32

При бОльших значениях параметров больше не происходит увеличение отзывчивости поиска.

In [None]:
K = 10
num_threads = 4

hot_users = dataset.user_id_map.external_ids[:100]

In [None]:
# Параметры Индекса
M = 32 # adjustable
efC = 50 # adjustable
# Параметры Поиска по индексу
efS = 50 # adjustable

index = create_index(augmented_item_vectors, M=M, efC=efC, num_threads=num_threads)

query_time_params = {'efSearch': efS}
print(f'Параметр поиска по индксу, efSearch >= K = {K}: ', query_time_params)
index.setQueryTimeParams(query_time_params)

calculate_ann_recall(
    hot_users = hot_users,
    augmented_user_vectors = augmented_user_vectors,
    dataset = dataset,
    model = model, 
    index = index,
    K = K,
    item_mapping = item_mapping
)

Параметры индекса {'M': 32, 'indexThreadQty': 4, 'efConstruction': 50}
Время создания = 0.787863
Параметр поиска по индксу, efSearch >= K = 10:  {'efSearch': 50}


  0%|          | 0/100 [00:00<?, ?it/s]

Recall =  0.994


Сохраняем индекс:

In [None]:
index.saveIndex("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_items_index.hnsw", save_data=True)

Сохраняем вектора пользователей:

In [None]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_embeddings.dill", "wb") as f:
    dill.dump(augmented_user_vectors, f)

Сохраняем уже просмотренные айтемы для каждого пользователя: 

In [179]:
watched = interactions[["user_id", "item_id"]].groupby("user_id").agg(list).reset_index()
watched_user2items_dictionary = dict(zip(watched["user_id"], watched["item_id"]))

In [None]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_watched_user2items_dictionary.dill", "wb") as f:
    dill.dump(watched_user2items_dictionary, f)

## 1.2. Анализ скорости поиска рекоммендаций через обёртку и через перемножение векторов.

Скорость поиска через модель и через вектора. Приближенный поиск по векторам сильно выигрывает по скорости:

In [None]:
internal_test_user = 0
external_test_user = user_inv_mapping[internal_test_user]

user_vector = augmented_user_vectors[internal_test_user]

In [None]:
%%timeit  
target_items = model.recommend(
    [external_test_user],
    dataset=dataset,
    k=10, 
    filter_viewed=False,
    add_rank_col=False,
    items_to_recommend=dataset.item_id_map.external_ids
).item_id.to_numpy()

262 ms ± 3.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
predicted_items = index.knnQuery(vector=user_vector, k=K)[0]

11.5 µs ± 234 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [None]:
target_items = model.recommend(
    [external_test_user], 
    dataset=dataset,
    k=10, 
    filter_viewed=False,
    add_rank_col=False,
    items_to_recommend=dataset.item_id_map.external_ids
).item_id.to_numpy()
target_items

array([15464,  2150,  3351,  4918, 10440,  7829, 10680,  4735, 16018,
       11145])

In [None]:
predicted_items = index.knnQuery(vector=user_vector, k=K)[0]
predicted_items = np.array([item_inv_mapping[item] for item in predicted_items])
predicted_items

array([15464,  2150,  3351,  4918, 10440,  7829, 10680,  4735, 16018,
       11145])

In [None]:
np.isin(target_items, predicted_items)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

## 1.3. Модель ANN Lightfm для горячих пользователей

Горячие пользователи - это те, у которых не меньше 5 взаимодействий



In [458]:
class SimplePopularModel:
    def __init__(self, users_dictionary, popular_dictionary):
        self.users_dictionary = users_dictionary
        self.popular_dictionary = popular_dictionary

    def predict(self, user_id: int, k_recs: int) -> List[int]:
        try:
            category = self.users_dictionary.get(user_id, None)
            if category:
                return self.popular_dictionary[category][:k_recs]
            return self.popular_dictionary["popular_for_all"][:k_recs]
        except TypeError:
            return [14488, 12192, 9728, 15297, 5543, 10440, 4218, 341, 512, 13865]

In [459]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/popular/users_dictionary.pickle", "rb") as f:
    popular_users = dill.load(f)
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/popular/popular_dictionary.pickle", "rb") as f:
    popular_dict = dill.load(f)

popular_model = SimplePopularModel(popular_users, popular_dict)

In [159]:
user_m = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_mapping.dill"
item_inv_m = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_item_inv_mapping.dill"
index_path = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_items_index.hnsw"
user_emb = "/content/drive/MyDrive/RecSys MTC/practice4/models/user_embeddings.dill"
watched_u2i = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_watched_user2items_dictionary.dill"

In [480]:
from typing import List


class ANNLightFM:
    def __init__(self, user_m, item_inv_m, index_path, user_emb, watched_u2i, popular_model, K = 10):
        with open(user_m, "rb") as f:
            self.user_m = dill.load(f)
        with open(item_inv_m, "rb") as f:
            self.item_inv_m = dill.load(f)

        self.index = nmslib.init(method='hnsw', space="negdotprod")
        self.index.loadIndex(index_path, load_data=True)

        with open(user_emb, "rb") as f:
            self.user_emb = dill.load(f)
        with open(watched_u2i, "rb") as f:
            self.watched_u2i = dill.load(f)
        self.popular_model = popular_model
        self.K = K
    
    def predict(self, user_id: int) -> List[int]:
        if user_id in self.user_m:
            user_vector = self.user_emb[self.user_m[user_id]]
            pr_internal_items = self.index.knnQuery(
                vector=user_vector, k=self.K
            )[0]
            pr_items = [self.item_inv_m[item] for item in pr_internal_items]

            # Delete already seen items
            pr_items_numpy = np.array(pr_items, dtype="uint16")
            already_seen_items = np.array(
                self.watched_u2i[user_id], dtype="uint16"
            )

            unseen_items = pr_items_numpy[
                ~np.isin(pr_items_numpy, already_seen_items)
            ]
            num_lost_items = self.K - unseen_items.shape[0]
            if num_lost_items > 0:
                popular_items = np.array(
                    self.popular_model.predict(user_id, 5 * self.K)
                )

                popular_items = popular_items[
                    ~np.isin(popular_items, already_seen_items)
                ]
                popular_items = popular_items[
                    ~np.isin(popular_items, unseen_items)
                ]

                unseen_items = np.append(
                    unseen_items, popular_items[:num_lost_items]
                )
                if len(unseen_items) != 10:
                    return self.popular_model.predict(user_id, k_recs=self.K)
            return unseen_items[:self.K].tolist()
        else:
            self.popular_model.predict(user_id, k_recs=self.K)

In [481]:
ann_light_fm = ANNLightFM(user_m, item_inv_m, index_path, user_emb, watched_u2i, popular_model, K=10)

Посмотрим на рекомендации на тестовых пользователях:

In [482]:
def recommend(test_user_id: int):
    output_items = np.array(ann_light_fm.predict(test_user_id), dtype='uint16')
    already_seen_items = np.array(watched[watched["user_id"] == test_user_id].item_id.iloc[0], dtype='uint16')

    unseen_items = output_items[~np.isin(output_items, already_seen_items)]
    num_lost_items = 10 - unseen_items.shape[0]
    if num_lost_items != 0:
        popular_items = np.arange(20)

        popular_items = popular_items[~np.isin(popular_items, already_seen_items)]
        popular_items = popular_items[~np.isin(popular_items, unseen_items)]

        unseen_items = np.append(unseen_items, popular_items[:num_lost_items])
    return unseen_items.tolist()

In [487]:
test_user_id = 2
result_items = recommend(test_user_id)

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

Unnamed: 0,user_id,result_items,item_id,title,genres
0,2,1267,1267,Город героев,"боевики, фантастика, мультфильм, комедии"
1,2,13243,13243,Головоломка,"фантастика, мультфильм, комедии"
2,2,11919,11919,Суперсемейка,"фантастика, мультфильм, приключения"
3,2,11749,11749,Суперсемейка 2,"фантастика, мультфильм, приключения"
4,2,14488,14488,Мастер меча,"боевики, историческое"
5,2,12192,12192,Фемида видит,"драмы, детективы, комедии"
6,2,9728,9728,Гнев человеческий,"боевики, триллеры"
7,2,15297,15297,Клиника счастья,"драмы, мелодрамы"
8,2,5543,5543,Турист,боевики
9,2,10440,10440,Хрустальный,"триллеры, детективы"


In [488]:
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
189221,2,age_25_34,income_40_60,М,1


Классная рекомендация, для мужчины с ребёнком. Ставим Класс!

## 1.4. Рекомендации для тёплых и холодных пользователей

Рекомендации на основе признаков тёплых и холодных пользователей.

Будем рекомендовать таким пользователям, то что смотрят такие же горячие пользователи. Сходством между холодными и горячими пользователями будет являться сходство их признаков: возраст, доход, пол, флаг детей.

Фильтруем тёплых пользователей. Возьмём >= 5 просмотров:

In [508]:
# Preparing of the Hot Users
threshold = 5
users_inter_count = interactions.groupby("user_id")["item_id"].count()
hot_users = users_inter_count[users_inter_count >= threshold].index.values
warm_users = users_inter_count[users_inter_count < threshold].index.values

interactions = interactions[interactions["user_id"].isin(hot_users)].sort_values(["user_id", "datetime"])
interactions["order"] = interactions.groupby("user_id").cumcount(ascending=False)
interactions["order"] = interactions["order"].astype(np.uint16)

print("Hot users: ", hot_users.shape[0])
print("Hot interactions: ", interactions.shape[0])
interactions

Hot users:  302486
Hot interactions:  4290596


Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight,order
3590116,0,12192,2021-07-16,89,0.0,1,5
620,0,7102,2021-07-19,169,3.0,1,4
67070,0,14359,2021-07-19,130,2.0,1,3
90113,0,15297,2021-07-19,459,0.0,1,2
3103040,0,9728,2021-07-19,4,0.0,1,1
...,...,...,...,...,...,...,...
3629451,1097555,4662,2021-04-08,775,14.0,2,4
5172184,1097555,4880,2021-04-22,7117,9.0,1,3
3498963,1097555,6916,2021-05-09,740,14.0,2,2
405171,1097555,14703,2021-06-21,234,4.0,1,1


In [509]:
known_warm_users_features = users[users["user_id"].isin(warm_users)]

known_warm_users = known_warm_users_features["user_id"].to_numpy()
unknown_warm_users = np.setdiff1d(warm_users, known_warm_users)

known_warm_users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
5,1037719,age_45_54,income_60_90,М,0
11,312520,age_35_44,income_90_150,Ж,0
13,382508,age_18_24,income_20_40,М,0
15,628684,age_35_44,income_40_60,М,0
16,73728,age_45_54,income_40_60,М,0
...,...,...,...,...,...
840188,312839,age_65_inf,income_60_90,Ж,0
840189,191349,age_45_54,income_40_60,М,1
840192,339025,age_65_inf,income_0_20,Ж,0
840194,251008,Unknown,Unknown,Unknown,0


In [510]:
unknown_warm_users_features = pd.DataFrame(data={
    "user_id": unknown_warm_users,
    "age": "Unknown",
    "income": "Unknown",
    "sex": "Unknown",
    "kids_flg": "Unknown",
})
unknown_warm_users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
0,12,Unknown,Unknown,Unknown,Unknown
1,14,Unknown,Unknown,Unknown,Unknown
2,19,Unknown,Unknown,Unknown,Unknown
3,24,Unknown,Unknown,Unknown,Unknown
4,27,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...
160803,1097510,Unknown,Unknown,Unknown,Unknown
160804,1097515,Unknown,Unknown,Unknown,Unknown
160805,1097536,Unknown,Unknown,Unknown,Unknown
160806,1097545,Unknown,Unknown,Unknown,Unknown


In [511]:
all_users_in_interactions = np.append(hot_users, warm_users)
cold_users_only_in_users = users[~users["user_id"].isin(all_users_in_interactions)]
cold_users_only_in_users

Unnamed: 0,user_id,age,income,sex,kids_flg
2,1047345,age_45_54,income_40_60,Ж,0
6,391756,age_25_34,income_0_20,М,0
7,15878,age_25_34,income_40_60,М,1
10,99952,Unknown,Unknown,М,0
19,1067802,age_35_44,income_40_60,М,0
...,...,...,...,...,...
840180,157810,age_25_34,income_20_40,Ж,0
840185,1021814,age_45_54,income_20_40,Ж,0
840191,365945,age_25_34,income_20_40,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1


Таблица ```warm_users_features``` содержит признаки всех пользователей, которые не имеют 5 или более взаимодействий, включая тёплых и холодных. 

В таблице содержатся NaN значения признаков. Далее дадим рекомендации на основе признаков пользователей:

In [513]:
users_features = pd.concat([known_warm_users_features, unknown_warm_users_features, cold_users_only_in_users])
users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
5,1037719,age_45_54,income_60_90,М,0
11,312520,age_35_44,income_90_150,Ж,0
13,382508,age_18_24,income_20_40,М,0
15,628684,age_35_44,income_40_60,М,0
16,73728,age_45_54,income_40_60,М,0
...,...,...,...,...,...
840180,157810,age_25_34,income_20_40,Ж,0
840185,1021814,age_45_54,income_20_40,Ж,0
840191,365945,age_25_34,income_20_40,Ж,0
840193,983617,age_18_24,income_20_40,Ж,1


In [514]:
users_features["value"] = users_features.set_index(["age", "income", "sex", "kids_flg"]).index.values
users_features.drop(["age", "income", "sex", "kids_flg"], axis=1, inplace=True)

users_features["value"] = users_features["value"].apply(lambda x: "_".join(x))

u2f_dictionary = dict(zip(users_features.user_id, users_features["value"]))
u2f_dictionary[1037719], len(u2f_dictionary)

('age_45_54_income_60_90_М_0', 755602)

In [515]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/u2f_dictionary.dill", "wb") as f:
    dill.dump(u2f_dictionary, f)

In [516]:
users_features.head()

Unnamed: 0,user_id,value
5,1037719,age_45_54_income_60_90_М_0
11,312520,age_35_44_income_90_150_Ж_0
13,382508,age_18_24_income_20_40_М_0
15,628684,age_35_44_income_40_60_М_0
16,73728,age_45_54_income_40_60_М_0


In [517]:
users_features.groupby(["value"]).agg(list)

Unnamed: 0_level_0,user_id
value,Unnamed: 1_level_1
Unknown_Unknown_Unknown_0,"[456259, 680143, 553353, 1093176, 1003448, 375198, 111310, 933594, 780572, 58119, 113691, 918472, 129630, 960243, 828461, 368485, 443043, 181333, 413749, 396818, 221226, 620661, 395965, 284616, 73..."
Unknown_Unknown_Unknown_Unknown,"[12, 14, 19, 24, 27, 35, 43, 44, 52, 62, 68, 71, 77, 87, 89, 92, 94, 100, 107, 111, 112, 115, 116, 127, 136, 150, 156, 162, 177, 187, 193, 207, 208, 218, 227, 234, 245, 246, 250, 252, 266, 279, 28..."
Unknown_Unknown_Ж_0,"[342508, 614599, 743033, 622319, 403645, 707773, 377961, 757983, 662105, 752014, 765884, 493933, 862440, 156850, 408200, 163723, 634785, 920363, 493779, 299129, 1077599, 169877, 251975, 801207, 50..."
Unknown_Unknown_М_0,"[901774, 930608, 571870, 1050313, 925901, 368801, 808826, 5365, 699197, 560802, 270168, 90957, 806652, 548587, 507492, 805087, 927103, 10434, 870885, 878297, 837891, 163482, 169969, 488912, 635760..."
Unknown_income_0_20_Unknown_0,[816632]
...,...
age_65_inf_income_60_90_М_1,"[31031, 32467, 238930, 999933, 381390, 743661, 399960, 764838, 149241, 596576, 689981, 56400, 67186, 865379, 139590, 570305, 746913, 57279, 363791, 64203, 10823, 179225, 870051, 943424, 1055488, 7..."
age_65_inf_income_90_150_Ж_0,"[475463, 265937, 635097, 643579, 273078, 471435, 760140, 970965, 252386, 564757, 304747, 524212, 794619, 602456, 407943, 952933, 140964, 358938, 774251, 852554, 378004, 967451, 333211, 778311, 324..."
age_65_inf_income_90_150_Ж_1,"[638054, 478090, 1034093, 643299, 381564, 946901, 562866, 547016, 647691, 441153, 874397, 26196, 81261, 557218, 762631, 76663, 274602, 775371, 232517, 758609, 377972, 913140, 458985]"
age_65_inf_income_90_150_М_0,"[838341, 339405, 623675, 408289, 220310, 467076, 274986, 393112, 288366, 195968, 755323, 591198, 356391, 411457, 1024300, 891522, 434503, 993054, 875078, 13056, 147939, 1022534, 244300, 810157, 39..."


Признаки для горячих пользователей, и случайные пять представителей для этих признаков в таблице ```hot_users_features```:

In [399]:
hot_users_features = users[users["user_id"].isin(hot_users)]
hot_users_features["value"] = hot_users_features.set_index(["age", "income", "sex", "kids_flg"]).index.values
hot_users_features.drop(["age", "income", "sex", "kids_flg"], axis=1, inplace=True)

hot_users_features["value"] = hot_users_features["value"].apply(lambda x: "_".join(x))

hot_users_features = hot_users_features.sample(frac=1).groupby(["value"]).head(5).groupby(["value"]).agg(list)
hot_users_features.tail()

Unnamed: 0_level_0,user_id
value,Unnamed: 1_level_1
age_65_inf_income_60_90_М_1,"[67568, 294906, 107715, 524828, 1060473]"
age_65_inf_income_90_150_Ж_0,"[119955, 1024749, 802504, 217561, 934093]"
age_65_inf_income_90_150_Ж_1,"[452243, 251538, 775784, 252185, 113636]"
age_65_inf_income_90_150_М_0,"[791672, 265173, 887554, 151719, 265316]"
age_65_inf_income_90_150_М_1,"[274568, 798042, 420105, 462309, 210043]"


In [None]:
import random

def recommend_list(user_ids, k=10):
    result = set()
    for test_user_id in user_ids:
        output_items = np.array(ann_light_fm.predict(test_user_id), dtype='uint16')
        already_seen_items = np.array(watched[watched["user_id"] == test_user_id].item_id.iloc[0], dtype='uint16')

        unseen_items = output_items[~np.isin(output_items, already_seen_items)]
        num_lost_items = 10 - unseen_items.shape[0]
        if num_lost_items != 0:
            popular_items = np.arange(20)

            popular_items = popular_items[~np.isin(popular_items, already_seen_items)]
            popular_items = popular_items[~np.isin(popular_items, unseen_items)]

            unseen_items = np.append(unseen_items, popular_items[:num_lost_items])

        result.update(unseen_items.tolist())
    result = list(result)
    random.shuffle(result)
    return result[:k]

In [518]:
hot_users_features["reco_item_ids"] = hot_users_features["user_id"].apply(lambda user_list: recommend_list(user_list))
hot_users_features

Unnamed: 0_level_0,user_id,reco_item_ids
value,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown_Unknown_Unknown_0,"[564658, 563169, 1080456, 959513, 1057900]","[13865, 14378, 849, 676, 15472, 11231, 13955, 3402, 5326, 2346]"
Unknown_Unknown_Ж_0,"[498933, 864263, 26165, 1092424, 753178]","[2954, 12965, 14431, 10942, 12192, 12995, 12537, 4740, 5543, 7310]"
Unknown_Unknown_М_0,"[744205, 290331, 1015100, 342105, 697342]","[4774, 11863, 1554, 3734, 11310, 9996, 13865, 10256, 142, 5543]"
Unknown_income_0_20_Ж_0,"[258459, 875491, 903297, 428464]","[142, 15266, 15915, 14025, 4151, 3095, 11756, 10440, 2616, 7216]"
Unknown_income_0_20_Ж_1,[92532],"[14470, 341, 4218, 4731, 11778, 15531, 14488, 5543, 10440, 9728]"
...,...,...
age_65_inf_income_60_90_М_1,"[67568, 294906, 107715, 524828, 1060473]","[3547, 8636, 12192, 4880, 11749, 4218, 4943, 12096, 13865, 24]"
age_65_inf_income_90_150_Ж_0,"[119955, 1024749, 802504, 217561, 934093]","[4179, 11778, 4218, 13865, 696, 3734, 4880, 7545, 13058, 4151]"
age_65_inf_income_90_150_Ж_1,"[452243, 251538, 775784, 252185, 113636]","[11640, 9728, 13865, 341, 4218, 3734, 6086, 7107, 15706, 4151]"
age_65_inf_income_90_150_М_0,"[791672, 265173, 887554, 151719, 265316]","[12837, 15297, 15266, 142, 8254, 6208, 14741, 7417, 11756, 2956]"


Мержим рекомендации

In [519]:
users_features.head()

Unnamed: 0,user_id,value
5,1037719,age_45_54_income_60_90_М_0
11,312520,age_35_44_income_90_150_Ж_0
13,382508,age_18_24_income_20_40_М_0
15,628684,age_35_44_income_40_60_М_0
16,73728,age_45_54_income_40_60_М_0


In [520]:
hot_users_features.head()

Unnamed: 0_level_0,user_id,reco_item_ids
value,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown_Unknown_Unknown_0,"[564658, 563169, 1080456, 959513, 1057900]","[13865, 14378, 849, 676, 15472, 11231, 13955, 3402, 5326, 2346]"
Unknown_Unknown_Ж_0,"[498933, 864263, 26165, 1092424, 753178]","[2954, 12965, 14431, 10942, 12192, 12995, 12537, 4740, 5543, 7310]"
Unknown_Unknown_М_0,"[744205, 290331, 1015100, 342105, 697342]","[4774, 11863, 1554, 3734, 11310, 9996, 13865, 10256, 142, 5543]"
Unknown_income_0_20_Ж_0,"[258459, 875491, 903297, 428464]","[142, 15266, 15915, 14025, 4151, 3095, 11756, 10440, 2616, 7216]"
Unknown_income_0_20_Ж_1,[92532],"[14470, 341, 4218, 4731, 11778, 15531, 14488, 5543, 10440, 9728]"


Получили такие же рекомендации для холодных и тёлпых пользователей, как и для горячих по сходству признаков.

В таблице всё еще остаются NaN в столбце ```reco_item_ids```

In [523]:
recos = users_features.merge(hot_users_features.reset_index()[["value", "reco_item_ids"]], how="left", on="value")
recos

Unnamed: 0,user_id,value,reco_item_ids
0,1037719,age_45_54_income_60_90_М_0,"[10440, 5543, 9785, 2657, 11863, 13020, 5115, 10226, 14488, 4218]"
1,312520,age_35_44_income_90_150_Ж_0,"[3734, 142, 16166, 13865, 14264, 7825, 341, 12974, 12995, 9996]"
2,382508,age_18_24_income_20_40_М_0,"[7571, 14488, 10119, 3734, 11237, 12965, 1916, 5543, 4218, 10440]"
3,628684,age_35_44_income_40_60_М_0,"[10440, 13018, 12841, 9728, 12173, 4151, 3682, 8636, 4880, 7829]"
4,73728,age_45_54_income_40_60_М_0,"[4740, 9996, 7793, 1287, 4457, 341, 12192, 5411, 10440, 849]"
...,...,...,...
755597,157810,age_25_34_income_20_40_Ж_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755598,1021814,age_45_54_income_20_40_Ж_0,"[13865, 12192, 3182, 7829, 4880, 2657, 512, 9728, 4218, 4151]"
755599,365945,age_25_34_income_20_40_Ж_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755600,983617,age_18_24_income_20_40_Ж_1,"[10323, 11985, 13915, 10761, 13243, 8821, 8584, 4457, 14488, 6646]"


In [524]:
recos[recos["reco_item_ids"].isna()]["value"].unique()

array(['age_65_inf_income_150_inf_Unknown_0',
       'age_65_inf_income_150_inf_Ж_1', 'age_55_64_income_150_inf_Ж_1',
       'age_55_64_income_150_inf_Ж_0', 'age_18_24_income_0_20_Unknown_1',
       'age_25_34_income_0_20_Unknown_1',
       'age_45_54_income_0_20_Unknown_0',
       'age_55_64_income_60_90_Unknown_1', 'age_18_24_income_150_inf_М_0',
       'age_35_44_income_0_20_Unknown_1',
       'age_35_44_income_0_20_Unknown_0',
       'age_65_inf_income_0_20_Unknown_0',
       'Unknown_income_0_20_Unknown_0',
       'age_18_24_income_40_60_Unknown_1',
       'Unknown_income_60_90_Unknown_0',
       'age_55_64_income_90_150_Unknown_0',
       'age_35_44_income_90_150_Unknown_1', 'Unknown_income_90_150_М_1',
       'age_25_34_income_60_90_Unknown_1',
       'age_35_44_income_150_inf_Unknown_0',
       'Unknown_Unknown_Unknown_Unknown',
       'age_45_54_income_0_20_Unknown_1'], dtype=object)

Вручную заполним пропуски в признаках пользователей, чтобы можно было рекомендовать по похожим горячим:

In [521]:
lost_features_dict = {
    'age_65_inf_income_150_inf_Unknown_0': 'age_65_inf_income_150_inf_Ж_0',

    'age_65_inf_income_150_inf_Ж_1': 'age_65_inf_income_150_inf_Ж_1',  
    'age_55_64_income_150_inf_Ж_1': 'age_55_64_income_150_inf_Ж_1',
    'age_55_64_income_150_inf_Ж_0': 'age_55_64_income_150_inf_Ж_0', 

    'age_18_24_income_0_20_Unknown_1': 'age_18_24_income_0_20_М_1',
    'age_25_34_income_0_20_Unknown_1': 'age_25_34_income_0_20_Ж_1',
    'age_45_54_income_0_20_Unknown_0': 'age_45_54_income_0_20_М_0',
    'age_55_64_income_60_90_Unknown_1': 'age_55_64_income_60_90_М_1', 
    'age_18_24_income_150_inf_М_0': 'age_18_24_income_150_inf_М_0',
    'age_35_44_income_0_20_Unknown_1': 'age_35_44_income_0_20_М_1',
    'age_35_44_income_0_20_Unknown_0': 'age_35_44_income_0_20_М_0',
    'age_65_inf_income_0_20_Unknown_0': 'age_65_inf_income_0_20_Ж_0',
    'Unknown_income_0_20_Unknown_0': 'age_25_34_income_0_20_М_0',
    'age_18_24_income_40_60_Unknown_1': 'age_18_24_income_40_60_Ж_1',
    'Unknown_income_60_90_Unknown_0': 'age_25_34_income_60_90_М_0',
    'age_55_64_income_90_150_Unknown_0': 'age_55_64_income_90_150_Ж_0',
    'age_35_44_income_90_150_Unknown_1': 'age_35_44_income_90_150_М_1',
    'Unknown_income_90_150_М_1': 'age_25_34_income_90_150_М_1',
    'age_25_34_income_60_90_Unknown_1': 'age_25_34_income_60_90_М_1',
    'age_35_44_income_150_inf_Unknown_0': 'age_35_44_income_150_inf_М_0',
    'Unknown_Unknown_Unknown_Unknown': 'Unknown_Unknown_Unknown_Unknown',
    'age_45_54_income_0_20_Unknown_1': 'age_45_54_income_0_20_М_1'
}

In [525]:
nan_items_mask = recos["reco_item_ids"].isna()
recos[nan_items_mask]

Unnamed: 0,user_id,value,reco_item_ids
30042,374937,age_65_inf_income_150_inf_Unknown_0,
63785,852699,age_65_inf_income_150_inf_Ж_1,
72863,384049,age_65_inf_income_150_inf_Ж_1,
77102,964249,age_55_64_income_150_inf_Ж_1,
85411,185027,age_55_64_income_150_inf_Ж_0,
...,...,...,...
659692,1097552,Unknown_Unknown_Unknown_Unknown,
677519,298642,age_45_54_income_0_20_Unknown_0,
715935,22453,age_35_44_income_0_20_Unknown_0,
726569,504226,age_45_54_income_0_20_Unknown_1,


In [526]:
for k, v in lost_features_dict.items():
    reco_list_for_user = recos[recos["value"] == v]["reco_item_ids"].iloc[0]
    if isinstance(reco_list_for_user, list):
        recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"] = \
            recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"].apply(lambda x: reco_list_for_user)
    else:
        recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"] = \
            recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"].apply(lambda x: popular_dict["popular_for_all"][:10])

In [527]:
recos["reco_item_ids"].isna().sum()

0

In [568]:
recos

Unnamed: 0,user_id,value,reco_item_ids
0,1037719,age_45_54_income_60_90_М_0,"[10440, 5543, 9785, 2657, 11863, 13020, 5115, 10226, 14488, 4218]"
1,312520,age_35_44_income_90_150_Ж_0,"[3734, 142, 16166, 13865, 14264, 7825, 341, 12974, 12995, 9996]"
2,382508,age_18_24_income_20_40_М_0,"[7571, 14488, 10119, 3734, 11237, 12965, 1916, 5543, 4218, 10440]"
3,628684,age_35_44_income_40_60_М_0,"[10440, 13018, 12841, 9728, 12173, 4151, 3682, 8636, 4880, 7829]"
4,73728,age_45_54_income_40_60_М_0,"[4740, 9996, 7793, 1287, 4457, 341, 12192, 5411, 10440, 849]"
...,...,...,...
755597,157810,age_25_34_income_20_40_Ж_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755598,1021814,age_45_54_income_20_40_Ж_0,"[13865, 12192, 3182, 7829, 4880, 2657, 512, 9728, 4218, 4151]"
755599,365945,age_25_34_income_20_40_Ж_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755600,983617,age_18_24_income_20_40_Ж_1,"[10323, 11985, 13915, 10761, 13243, 8821, 8584, 4457, 14488, 6646]"


Сохраняем словарь с рекомендациями для холодных и тёплых пользователей:

In [529]:
lightfm_users_reco_dictionary = dict(zip(recos["user_id"], recos["reco_item_ids"]))
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/lightfm_users_reco_dictionary_popular.dill", "wb") as f:
    dill.dump(lightfm_users_reco_dictionary, f)

## 1.5. Смотрим 👀 на рекомендации 

Рекомендация для холодого/тёплого:

In [553]:
test_user_id = 10
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
715105,10,age_18_24,income_40_60,М,0


In [554]:
if test_user_id in lightfm_users_reco_dictionary:
    print(lightfm_users_reco_dictionary[test_user_id])
else:
    print("Горячий")

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

[7571, 15297, 16166, 4151, 3734, 13159, 4436, 13915, 4475, 9728]


Unnamed: 0,user_id,result_items,item_id,title,genres
0,10,1267,1267,Город героев,"боевики, фантастика, мультфильм, комедии"
1,10,13243,13243,Головоломка,"фантастика, мультфильм, комедии"
2,10,11919,11919,Суперсемейка,"фантастика, мультфильм, приключения"
3,10,11749,11749,Суперсемейка 2,"фантастика, мультфильм, приключения"
4,10,14488,14488,Мастер меча,"боевики, историческое"
5,10,12192,12192,Фемида видит,"драмы, детективы, комедии"
6,10,9728,9728,Гнев человеческий,"боевики, триллеры"
7,10,15297,15297,Клиника счастья,"драмы, мелодрамы"
8,10,5543,5543,Турист,боевики
9,10,10440,10440,Хрустальный,"триллеры, детективы"


Рекомендация для горячего:

In [560]:
interactions["user_id"].unique()[:20]

array([ 0,  2,  3,  5,  9, 11, 13, 15, 21, 30, 32, 37, 41, 46, 47, 53, 55,
       59, 60, 61])

In [574]:
test_user_id = 46
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
123407,46,age_25_34,income_20_40,Ж,0


In [575]:
result_items = recommend(test_user_id)

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

Unnamed: 0,user_id,result_items,item_id,title,genres
0,46,9728,9728,Гнев человеческий,"боевики, триллеры"
1,46,15297,15297,Клиника счастья,"драмы, мелодрамы"
2,46,4151,4151,Секреты семейной жизни,комедии
3,46,13865,13865,Девятаев,"драмы, военные, приключения"
4,46,1844,1844,Аферистка,"триллеры, комедии"
5,46,3734,3734,Прабабушка легкого поведения,комедии
6,46,657,657,Защитник,"драмы, триллеры, детективы"
7,46,7571,7571,100% волк,"мультфильм, приключения, семейное, фэнтези, комедии"
8,46,14488,14488,Мастер меча,"боевики, историческое"
9,46,12192,12192,Фемида видит,"драмы, детективы, комедии"


Подмешиваются комедии, драмы и мелодрамы 

In [576]:
import random

random.randint(1, 10)

5

# Avatars

In [637]:
avatars_users = pd.read_csv(
    "/content/drive/MyDrive/RecSys MTC/practice4/models/avatars/avatars_users.csv",
    usecols=["user_id", "age", "income", "sex", "kids_flg"],
)
avatars_interactions = pd.read_csv(
    "/content/drive/MyDrive/RecSys MTC/practice4/models/avatars/avatars_interactions.csv",
    usecols=[
        "user_id",
        "item_id",
        "last_watch_dt",
        "total_dur",
        "watched_pct",
        "weight",
    ],
)

In [613]:
Columns.Datetime = "datetime"
avatars_interactions.rename(columns={'last_watch_dt': "datetime"}, inplace=True) 

In [614]:
avatars_interactions[Columns.Datetime] = pd.to_datetime(
    avatars_interactions[Columns.Datetime], format="%Y-%m-%d"
)

Генерируем искусственных пользователей.

Необходимо:
1. Создать искусственные взаимодействия и признаки этим пользователям;
2. Заново собрать датасет, добавив данные этих пользователей в interactions, users -> user_features;
3. Посмотреть предсказания модели и сделать выводы;

Логично предположить, что человеку, который смотрит все жанры подряд, легче рекомендовать предметы и меньше шансов не угодить пользователю. Попробуем создать немного хардкорных персонажей, чтобы устроить протестировать нашу факторизационную машину в деле:

* <font size='4'>взрослый работящий мужчина с зарплатой до 20к, который смотрит только мультики и при этом нет kid флага;</font>
    * интересный кейс, с человеческой точки зрения вероятнее, что не поставили флажок, а сервисом пользуется чисто ребенок, интересно будет взглянуть на рекомендации модели
* <font size='4'>молодой парень c ребенком (но без мультиков), зарабатывающий от 150к, в целом интересующийся научно-популярным, но имеющего пару айтемов других жанров </font>
    * Хватаемся за специфичную, не выигрывающую по полулярности, категорию + посмотрим, предложат ли ему из-за флажка детский контент
* <font size='4'>женщина средних лет без детей со средним достатком 40-60к, которая исключительно занимается спортом по фитнес-роликам.</font>
    * Интересно посмотреть спектр рекомендаций для такого пользователя

In [638]:
avatars_users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,-6,age_55_64,income_0_20,М,0
1,-66,age_18_24,income_150_inf,М,1
2,-666,age_35_44,income_40_60,Ж,0


In [616]:
avatars_interactions.merge(items[["item_id", "title", "genres"]], how = "left", on = "item_id")

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight,title,genres
0,-6,8464,2021-05-19,3297,78.0,4,День рождения Алисы,"русские мультфильмы, русские, для детей, полнометражные"
1,-6,11864,2021-05-18,9572,9.0,1,Наруто 7: Потерянная башня,"аниме, приключения"
2,-6,2378,2021-07-01,7689,99.0,5,Робокар Поли. Правила дорожного движения,"зарубежные, для детей, сериалы, западные мультфильмы"
3,-6,6138,2021-07-02,15,38.0,3,Астралиум,"драмы, мультфильм"
4,-66,6096,2021-04-12,2263,1.0,1,Американский монстр,научно-популярные
5,-66,11222,2021-04-08,4593,9.0,1,Львы. Южная Африка,документальное
6,-66,7132,2021-08-15,13,46.0,3,[4К] Выход к Тихому океану,документальное
7,-66,2812,2021-06-05,1252,89.0,5,Няни,для взрослых
8,-66,14315,2021-07-02,3159,3.0,1,Нечестивые,ужасы
9,-666,2888,2021-07-27,205,96.0,5,Рацион на 1700 ккал,фитнес


In [602]:
def get_features(users: pd.DataFrame, items: pd.DataFrame, for_hot=True):

    # Generate user/item features for hot users, i.e. syncronise with interactions
    if for_hot:
        users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

    # Deal with user features

    users.fillna("Unknown", inplace=True)
    users = users.astype({"kids_flg": bool})

    user_features_frames = []
    for feature in ["sex", "age", "income", "kids_flg"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    user_features = pd.concat(user_features_frames)

    # Deal with item features
    # Genre

    items["genre"] = (
        items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    )
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    genre_feature.head()

    # Release year

    items["release_year"].fillna(int(items["release_year"].max()), inplace=True)
    items = items.astype({"release_year": int})

    year_from = 1977
    step = 5
    bins = [year for year in range(year_from, items["release_year"].max() + step, step)]

    bins_bias = [item + 1 for item in bins]
    pairs_strict = list(zip(bins_bias, bins[1:]))
    # Add the most first release year
    bins = [items["release_year"].min()] + bins
    # Add interval for it
    pairs_strict = [(items["release_year"].min(), bins[1])] + pairs_strict
    # Generate feature values
    labels = [f"year_{item[0]}_{item[1]}" for item in pairs_strict]

    year_bins = pd.cut(
        items["release_year"], bins=bins, labels=labels, include_lowest=True
    )

    items["release_year"] = year_bins.astype(str)

    # Age rating

    items["age_rating"].fillna(0, inplace=True)
    items = items.astype({"age_rating": int})

    # For kids

    items.loc[items["age_rating"] > 12, "for_kids"] = 0
    items.loc[items["age_rating"] <= 12, "for_kids"] = 1
    items = items.astype({"for_kids": bool})

    item_features_frames = []
    for feature in ["content_type", "release_year", "age_rating", "for_kids"]:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    item_features_frames.append(genre_feature)
    item_features = pd.concat(item_features_frames)
    headtail(item_features)

    return user_features, item_features

In [617]:
users = pd.concat([users, avatars_users], ignore_index=True)

In [618]:
users.tail()

Unnamed: 0,user_id,age,income,sex,kids_flg
840198,-66,age_18_24,income_150_inf,М,1
840199,-666,age_35_44,income_40_60,Ж,0
840200,-6,age_55_64,income_0_20,М,0
840201,-66,age_18_24,income_150_inf,М,1
840202,-666,age_35_44,income_40_60,Ж,0


In [619]:
users_features, items_features = get_features(users, items)

In [620]:
interactions = pd.concat([interactions, avatars_interactions], ignore_index=True)

In [622]:
interactions.drop("last_watch_dt", axis=1, inplace=True)

In [623]:
interactions.tail()

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
5476272,-66,14315,2021-07-02,3159,3.0,1
5476273,-666,2888,2021-07-27,205,96.0,5
5476274,-666,8565,2021-05-26,10748,23.0,2
5476275,-666,15465,2021-08-06,5472,85.0,5
5476276,-666,14957,2021-08-05,4158,67.0,4


In [624]:
%%time
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=users_features,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=items_features,
    cat_item_features=[
        "genre",
        "content_type",
        "release_year",
        "for_kids",
        "age_rating",
    ],
)

CPU times: user 2.33 s, sys: 30.9 ms, total: 2.36 s
Wall time: 2.36 s


Обучаем:

In [625]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [626]:
model = LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [627]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f8ab3ef11c0>

In [632]:
model.recommend([-6, -66, -666], dataset=dataset, filter_viewed=True, k=10)\
    .merge(items[["item_id", "title", "release_year", "genres"]], how="left", on="item_id")

Unnamed: 0,user_id,item_id,score,rank,title,release_year,genres
0,-6,15465,-296.16209,1,Такси для Ангела,2007.0,"русские, детективы"
1,-6,16227,-302.016142,2,"Вперед, Астробой!",2019.0,"аниме, сериалы, приключения, зарубежные, фантастика, западные мультфильмы"
2,-6,7303,-302.087509,3,Металионы,2018.0,"мультсериалы, приключения, фантастика, боевики, фэнтези"
3,-6,12741,-302.312172,4,Шиммер и Шайн,2016.0,"мультсериалы, фэнтези, приключения"
4,-6,9936,-302.33386,5,Рей и пожарный патруль,2016.0,"фантастика, мультсериалы, приключения"
5,-6,15084,-302.342264,6,Супер Зак,2019.0,"фантастика, мультсериалы, приключения"
6,-6,12512,-302.352851,7,Игры с Йоко,2018.0,"мультсериалы, приключения"
7,-6,14729,-302.400245,8,Атлоны,2019.0,"боевики, мультсериалы, приключения"
8,-6,13271,-302.426804,9,Гормити,2018.0,"мультсериалы, фэнтези, приключения"
9,-6,2059,-302.432776,10,Монкарт,2017.0,"фантастика, мультсериалы, фэнтези, приключения"
