
## Import Libraries

In [None]:
!pip install rectools==0.3.0

In [2]:
import os

In [3]:
os.environ["OPENBLAS_NUM_THREADS"] = "1"  # For implicit ALS

In [4]:
import warnings

warnings.filterwarnings("ignore")

In [5]:
import pandas as pd
import numpy as np
import dill

from implicit.als import AlternatingLeastSquares
import itertools

from rectools.metrics import Precision, Recall, MAP, calc_metrics
from rectools.models import PopularModel, RandomModel, ImplicitALSWrapperModel
from rectools import Columns
from rectools.dataset import Dataset
from rectools.models import ImplicitALSWrapperModel, LightFMWrapperModel

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse

import matplotlib.pyplot as plt
from pathlib import Path
import typing as tp
from tqdm import tqdm

from lightfm import LightFM

from implicit.bpr import BayesianPersonalizedRanking

from implicit.lmf import LogisticMatrixFactorization

import timeit

In [6]:
np.random.seed(1234)

## Loading Data

In [7]:
import pandas as pd
import numpy as np
import scipy as sp
import requests
from tqdm.auto import tqdm
from scipy.stats import mode 
from pprint import pprint
import warnings
warnings.filterwarnings("ignore")

# from rectools import Columns

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

# download dataset by chunks
url = "https://storage.yandexcloud.net/itmo-recsys-public-data/kion_train.zip"

req = requests.get(url, stream=True)

with open('kion_train.zip', "wb") as fd:
    total_size_in_bytes = int(req.headers.get('Content-Length', 0))
    progress_bar = tqdm(desc='kion dataset download', total=total_size_in_bytes, unit='iB', unit_scale=True)
    for chunk in req.iter_content(chunk_size=2 ** 20):
        progress_bar.update(len(chunk))
        fd.write(chunk)

!unzip kion_train.zip

kion dataset download:   0%|          | 0.00/78.8M [00:00<?, ?iB/s]

Archive:  kion_train.zip
   creating: kion_train/
  inflating: kion_train/interactions.csv  
  inflating: __MACOSX/kion_train/._interactions.csv  
  inflating: kion_train/users.csv    
  inflating: __MACOSX/kion_train/._users.csv  
  inflating: kion_train/items.csv    
  inflating: __MACOSX/kion_train/._items.csv  


In [593]:
interactions = pd.read_csv('kion_train/interactions.csv')

users = pd.read_csv('kion_train/users.csv')
users.fillna("Unknown", inplace=True)
users["kids_flg"] = users["kids_flg"].astype("str")

items = pd.read_csv('kion_train/items.csv')

## Preprocessing

In [594]:
def headtail(df: pd.DataFrame):
    return pd.concat([df.head(), df.tail()])

In [595]:
Columns.Datetime = "datetime"
interactions.rename(columns={'last_watch_dt': "datetime"}, inplace=True) 

In [596]:
interactions.drop(
    interactions[interactions[Columns.Datetime].str.len() != 10].index, inplace=True
)

In [597]:
interactions[Columns.Datetime] = pd.to_datetime(
    interactions[Columns.Datetime], format="%Y-%m-%d"
)

In [598]:
# 0 - 100 | 0-10=1 10-30=2 30-60=3 60-100=5
def f(pct):
    if pct < 10:
        return 1
    elif pct < 30:
        return 2
    elif pct < 60:
        return 3
    elif pct < 85:
        return 4
    return 5

interactions[Columns.Weight] = interactions["watched_pct"].apply(lambda x: f(x))
interactions

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
0,176549,9506,2021-05-11,4250,72.0,4
1,699317,1659,2021-05-29,8317,100.0,5
2,656683,7107,2021-05-09,10,0.0,1
3,864613,7638,2021-07-05,14483,100.0,5
4,964868,9506,2021-04-30,6725,100.0,5
...,...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0,1
5476247,546862,9673,2021-04-13,2308,49.0,3
5476248,697262,15297,2021-08-20,18307,63.0,4
5476249,384202,16197,2021-04-19,6203,100.0,5


## –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏ –Ω–∞ –ø–æ–ª–Ω–æ–º –¥–∞—Ç–∞—Å–µ—Ç–µ

–í—ã–±–∏—Ä–∞–µ–º –ª—É—á—à—É—é –º–æ–¥–µ–ª—å –ø–æ –∏—Ç–æ–≥–∞–º –≤–∞–ª–∏–¥–∞—Ü–∏–∏ –∏ –ø—Ä–æ–∏–∑–≤–æ–¥–∏–º –æ–±—É—á–µ–Ω–∏–µ –Ω–∞ –≤—Å–µ–º –¥–∞—Ç–∞—Å–µ—Ç–µ

–ù–µ–æ–±—Ö–æ–¥–∏–º–æ –∑–∞–Ω–æ–≤–æ —Å–æ–±—Ä–∞—Ç—å –¥–∞—Ç–∞—Å–µ—Ç, –∏—Å–ø–æ–ª—å–∑—É—è –≤—Å–µ –¥–∞–Ω–Ω—ã–µ. –û—Å–æ–±–µ–Ω–Ω–æ—Å—Ç–∏:
1. –ù–µ–æ–±—Ö–æ–¥–∏–º –ø–æ–ª–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç `users`
2. –ù–µ–æ–±—Ö–æ–¥–∏–º –ø–æ–ª–Ω—ã–π –¥–∞—Ç–∞—Å–µ—Ç `items`
3. –õ–æ–≥–∏–∫—É –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –¥–∞—Ç–∞—Å–µ—Ç–∞ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏–π –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç—É –∂–µ

–ò–∑–º–µ–Ω–µ–Ω–∏–π –∫–∞—Å–∞—Ç–µ–ª—å–Ω–æ –≥–µ–Ω–µ—Ä–∞—Ü–∏–∏ —Ñ–∏—á–µ–π –Ω–µ—Ç –∏ –¥–æ–ø–æ–ª–Ω–∏—Ç–µ–ª—å–Ω–æ–π –ª–æ–≥–∏–∫–∏ –Ω–µ –Ω—É–∂–Ω–æ. –û–±–µ—Ä–Ω–µ–º –≤—Å–µ –ø—Ä–µ–æ–±—Ä–∞–∑–æ–≤–∞–Ω–∏—è –≤ —Ñ—É–Ω–∫—Ü–∏—é –∏ –ø—Ä–æ–∏–∑–≤–µ–¥–µ–º –≤—ã–∑–æ–≤.

In [None]:
def get_features(users: pd.DataFrame, items: pd.DataFrame, for_hot=True):
    
    # Generate user/item features for hot users, i.e. syncronise with interactions
    if for_hot:
        users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()
    
    # Deal with user features
    
    users.fillna("Unknown", inplace=True)
    users = users.astype({"kids_flg": bool})
    
    user_features_frames = []
    for feature in ["sex", "age", "income", "kids_flg"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)
    
    user_features = pd.concat(user_features_frames)
    
    # Deal with item features
    # Genre

    items["genre"] = (
        items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    )
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    genre_feature.head()

    # Release year

    items["release_year"].fillna(int(items["release_year"].max()), inplace=True)
    items = items.astype({"release_year": int})

    year_from = 1977
    step = 5
    bins = [
        year
        for year in range(
            year_from, items["release_year"].max() + step, step
        )
    ]

    bins_bias = [item + 1 for item in bins]
    pairs_strict = list(zip(bins_bias, bins[1:]))
    # Add the most first release year
    bins = [items["release_year"].min()] + bins
    # Add interval for it
    pairs_strict = [(items["release_year"].min(), bins[1])] + pairs_strict
    # Generate feature values
    labels = [f'year_{item[0]}_{item[1]}' for item in pairs_strict]

    year_bins = pd.cut(items["release_year"], bins=bins, labels=labels, include_lowest=True)

    items['release_year'] = year_bins.astype(str)

    # Age rating

    items['age_rating'].fillna(0, inplace=True);
    items = items.astype({"age_rating": int})

    # For kids

    items["for_kids"].loc[items["age_rating"] > 12] = 0.0
    items["for_kids"].loc[items["age_rating"] <= 12] = 1.0
    items = items.astype({"for_kids": bool})
    
    item_features_frames = []
    for feature in ["content_type", "release_year", "age_rating", "for_kids"]:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    item_features_frames.append(genre_feature)
    item_features = pd.concat(item_features_frames)
    headtail(item_features)

    return user_features, item_features

In [None]:
user_features, item_features = get_features(users, items)
full_user_features, _ = get_features(users, items, for_hot=False)

–°–æ–∑–¥–∞–µ–º –¥–∞—Ç–∞—Å–µ—Ç

In [None]:
%%time
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=user_features,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=item_features,
    cat_item_features=["genre", "content_type", "release_year", "for_kids", "age_rating"],
)

CPU times: user 761 ms, sys: 16.9 ms, total: 778 ms
Wall time: 785 ms


–§–∏—Ç—Ç–∏–º

In [None]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [None]:
model = LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [None]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7fb87a89df10>

–°–æ—Ö—Ä–∞–Ω–∏–º –º–æ–¥–µ–ª—å

In [None]:
save_name = "END_MODEL"
with open(f'/content/drive/MyDrive/RecSys MTC/practice4/models/{save_name}.dill', 'wb') as f:
    dill.dump(model, f)

–ó–∞–≥—Ä—É–∂–∞–µ–º –º–æ–¥–µ–ª—å

In [None]:
with open(f'/content/drive/MyDrive/RecSys MTC/practice4/models/{save_name}.dill', 'rb') as f:
    model = dill.load(f)

# 1. Approximate Nearest Neighbors

## 1.1. –°–æ–∑–¥–∞–Ω–∏–µ –∏–Ω–¥–µ–∫—Å–∞ –¥–ª—è –ø—Ä–∏–±–ª–∏–∂–µ–Ω–Ω–æ–≥–æ –ø–æ–∏—Å–∫–∞

In [None]:
import dill
import nmslib
import time

with open("/content/drive/MyDrive/RecSys MTC/practice4/models/ULTRA_LAST_MODEL.dill", "rb") as f:
    model = dill.load(f)

In [None]:
def augment_inner_product(factors):
    normed_factors = np.linalg.norm(factors, axis=1)
    max_norm = normed_factors.max()
    
    extra_dim = np.sqrt(max_norm ** 2 - normed_factors ** 2).reshape(-1, 1)
    augmented_factors = np.append(factors, extra_dim, axis=1)
    return max_norm, augmented_factors

In [None]:
user_vectors, item_vectors = model.get_vectors(dataset=dataset, add_biases=True)
print("–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–æ", user_vectors.shape, item_vectors.shape)

max_norm, augmented_item_vectors = augment_inner_product(item_vectors)

extra_zero = np.zeros((user_vectors.shape[0], 1))
augmented_user_vectors = np.append(user_vectors, extra_zero, axis=1)
print("–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –ø–æ—Å–ª–µ", augmented_user_vectors.shape, augmented_item_vectors.shape)

–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –¥–æ (302486, 34) (15484, 34)
–†–∞–∑–º–µ—Ä–Ω–æ—Å—Ç—å –ø–æ—Å–ª–µ (302486, 35) (15484, 35)


In [None]:
user_internal = dataset.user_id_map.internal_ids
user_external = dataset.user_id_map.external_ids

item_internal = dataset.item_id_map.internal_ids
item_external = dataset.item_id_map.external_ids

In [None]:
user_mapping = {k.item(): v.item() for k, v in zip(user_external, user_internal)}
item_mapping = {k.item(): v.item() for k, v in zip(item_external, item_internal)}

user_inv_mapping = {k.item(): v.item() for k, v in zip(user_internal, user_external)}
item_inv_mapping = {k.item(): v.item() for k, v in zip(item_internal, item_external)}


with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_mapping.dill", "wb") as f:
    dill.dump(user_mapping, f)
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_item_inv_mapping.dill", "wb") as f:
    dill.dump(item_inv_mapping, f)

–ü–∞—Ä–∞–º–µ—Ç—Ä—ã –ò–Ω–¥–µ–∫—Å–∞:
- M - –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å–æ—Å–µ–¥–Ω–∏—Ö –≤–µ—Ä—à–∏–Ω —É –∫–∞–∂–¥–æ–π –≤–µ—Ä—à–∏–Ω—ã –≤ –∏–Ω–¥–µ–∫—Å–µ. –ß–µ–º –±–æ–ª—å—à–µ, —Ç–µ–º –±–æ–ª—å—à–µ –ø–æ—Ç—Ä–µ–±–ª–µ–Ω–∏–µ –ø–∞–º—è—Ç–∏.

- efConstruction - —Ç–æ—Ç –∂–µ —Å–º—ã—Å–ª —á—Ç–æ –∏ —É efSearch, –Ω–æ –∫–æ–Ω—Ç—Ä–æ–ª–∏—Ä—É–µ—Ç index_time/index_accuracy. –ë–æ–ª—å—à–µ–µ –∑–Ω–∞—á–µ–Ω–∏–µ –≤–µ–¥—ë—Ç –∫ –ª—É—á—à–µ–º—É –∏–Ω–¥–µ–∫—Å—É. –í –∫–∞–∫–æ–π-—Ç–æ –º–æ–º–µ–Ω—Ç —É–≤–µ–ª–∏—á–µ—Å–Ω–∏–µ efConstruction –Ω–µ —É–ª—É—á—à–∞–µ—Ç –∫–∞—á–µ—Å—Ç–≤–æ –∏–Ω–¥–µ–∫—Å–∞.

- num_threads - –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Ç–æ–∫–æ–≤ –¥–ª—è –ø–æ—Å—Ç—Ä–æ–µ–Ω–∏—è –∏–Ω–¥–Ω–µ–∫—Å–∞
- space - —Å–ø–æ—Å–æ–± –≤—ã—á–∏—Å–ª–µ–Ω–∏—è —Å–∫–æ—Ä–∞ –º–µ–∂–¥—É –∞–π—Ç–µ–º–æ–º –∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–º

–ü–∞—Ä–∞–º–µ—Ç—Ä—ã –ü–æ–∏—Å–∫–∞ –ø–æ –∏–Ω–¥–µ–∫—Å—É:
- K - —á–∏—Å–ª–æ –∏—Å–∫–æ–º—ã—Ö –±–ª–∏–∂–∞–π—à–∏—Ö –∞–π—Ç–µ–º–æ–≤ 
- efSearch - —Å–∫–æ–ª—å–∫–æ —Ä–∞–∑ –º—ã –∏—â–µ–º –≤ –ò–Ω–¥–µ–∫—Å–µ, –¥–æ–ª–∂–Ω–æ –±—ã—Ç—å –±–æ–ª—å—à–µ K –∏ –º–µ–Ω—å—à–µ —á–∏—Å–ª–∞ —É–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –∞–π—Ç–µ–º–æ–≤. –ß–µ–º –±–æ–ª—å—à–µ, —Ç–µ–º —Ç–æ—á–Ω–µ–µ –ø–æ–∏—Å–∫, –Ω–æ –º–µ–¥–ª–µ–Ω–µ–µ
- num_threads - –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –ø–æ—Ç–æ–∫–æ–≤ –¥–ª—è –ø–æ–∏—Å–∫–∞

In [None]:
from tqdm.notebook import tqdm

def create_index(augmented_item_vectors, M: int, efC: int, num_threads: int, space_name: str="negdotprod"):
    index_time_params = {'M': M, 'indexThreadQty': num_threads, 'efConstruction': efC}
    print('–ü–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–Ω–¥–µ–∫—Å–∞', index_time_params)

    # –ò–Ω–∏—Ü–∏–∞–ª–∏–∑–∞—Ü–∏—è –ò–Ω–¥–µ–∫—Å–∞, —É–∑–∫–∞–∑–∞–Ω–∏–µ –ø—Ä–æ—Å—Ç—Ä–∞–Ω—Å—Ç–≤–∞ –ø–æ–∏—Å–∫–∞, —Ç–∏–ø–∞ –≤–µ–∫—Ç–æ—Ä–æ–≤ 
    index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR)
    # –î–æ–±–∞–≤–ª–µ–Ω–∏–µ –≤–µ–∫—Ç–æ—Ä–æ–≤ –∞–π—Ç–µ–º–æ–≤
    index.addDataPointBatch(augmented_item_vectors)

    # –°–æ–∑–¥–∞–Ω–∏–µ –ò–Ω–¥–µ–∫—Å–∞
    start = time.time()
    index.createIndex(index_time_params) 
    end = time.time() 
    print('–í—Ä–µ–º—è —Å–æ–∑–¥–∞–Ω–∏—è = %f' % (end-start))
    return index


# Recall = TP / (TP + FN) = TP / 10
def calculate_ann_recall(
    hot_users,
    augmented_user_vectors,
    dataset: Dataset,
    model: LightFMWrapperModel, 
    index,
    K,
    item_mapping,
):
    TP = 0
    all = 0
    for user_id in tqdm(hot_users):
        internal_user_id = int(dataset.user_id_map.convert_to_internal([user_id])[0])

        target_items = model.recommend(
            [user_id], 
            dataset=dataset,
            k=10, 
            filter_viewed=False,
            add_rank_col=False,
            items_to_recommend=dataset.item_id_map.external_ids
        ).item_id.to_numpy()
        target_items = [item_mapping[ex_i] for ex_i in target_items]

        user_vector = augmented_user_vectors[internal_user_id]
        predicted_items = index.knnQuery(vector=user_vector, k=K)[0]
        
        TP += np.isin(target_items, predicted_items).sum()
        all += K

    recall = TP / all
    print("Recall = ", recall)


–ü–æ–¥–±–µ—Ä—ë–º –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–Ω–¥–µ–∫—Å–∞ ```M, efConstruction, efSearch```, —á—Ç–æ–±—ã Recall –≤–æ–∑–≤—Ä–∞—â–∞–µ–º—ã—Ö –∞–π—Ç–µ–º–æ–≤ –±—ã–ª –Ω–∞–∏–±–æ–ª—å—à–∏–º –¥–ª—è –ø–µ—Ä–≤—ã—Ö 100 –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π –∏–∑ –æ–±—É—á–∞—é—â–µ–≥–æ –Ω–∞–±–æ—Ä–∞:

–≠–º–ø–µ—Ä–∏—á–µ—Å–∫–∏ –ø–æ–¥–æ–±—Ä–∞–ª–∏ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã, –¥–∞—é—â–∏–µ –ª—É—á—à–∏–π Recall:
- efSearch = 50
- efConstruction = 50
- M = 32

–ü—Ä–∏ –±–û–ª—å—à–∏—Ö –∑–Ω–∞—á–µ–Ω–∏—è—Ö –ø–∞—Ä–∞–º–µ—Ç—Ä–æ–≤ –±–æ–ª—å—à–µ –Ω–µ –ø—Ä–æ–∏—Å—Ö–æ–¥–∏—Ç —É–≤–µ–ª–∏—á–µ–Ω–∏–µ –æ—Ç–∑—ã–≤—á–∏–≤–æ—Å—Ç–∏ –ø–æ–∏—Å–∫–∞.

In [None]:
K = 10
num_threads = 4

hot_users = dataset.user_id_map.external_ids[:100]

In [None]:
# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –ò–Ω–¥–µ–∫—Å–∞
M = 32 # adjustable
efC = 50 # adjustable
# –ü–∞—Ä–∞–º–µ—Ç—Ä—ã –ü–æ–∏—Å–∫–∞ –ø–æ –∏–Ω–¥–µ–∫—Å—É
efS = 50 # adjustable

index = create_index(augmented_item_vectors, M=M, efC=efC, num_threads=num_threads)

query_time_params = {'efSearch': efS}
print(f'–ü–∞—Ä–∞–º–µ—Ç—Ä –ø–æ–∏—Å–∫–∞ –ø–æ –∏–Ω–¥–∫—Å—É, efSearch >= K = {K}: ', query_time_params)
index.setQueryTimeParams(query_time_params)

calculate_ann_recall(
    hot_users = hot_users,
    augmented_user_vectors = augmented_user_vectors,
    dataset = dataset,
    model = model, 
    index = index,
    K = K,
    item_mapping = item_mapping
)

–ü–∞—Ä–∞–º–µ—Ç—Ä—ã –∏–Ω–¥–µ–∫—Å–∞ {'M': 32, 'indexThreadQty': 4, 'efConstruction': 50}
–í—Ä–µ–º—è —Å–æ–∑–¥–∞–Ω–∏—è = 0.787863
–ü–∞—Ä–∞–º–µ—Ç—Ä –ø–æ–∏—Å–∫–∞ –ø–æ –∏–Ω–¥–∫—Å—É, efSearch >= K = 10:  {'efSearch': 50}


  0%|          | 0/100 [00:00<?, ?it/s]

Recall =  0.994


–°–æ—Ö—Ä–∞–Ω—è–µ–º –∏–Ω–¥–µ–∫—Å:

In [None]:
index.saveIndex("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_items_index.hnsw", save_data=True)

–°–æ—Ö—Ä–∞–Ω—è–µ–º –≤–µ–∫—Ç–æ—Ä–∞ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π:

In [None]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_embeddings.dill", "wb") as f:
    dill.dump(augmented_user_vectors, f)

–°–æ—Ö—Ä–∞–Ω—è–µ–º —É–∂–µ –ø—Ä–æ—Å–º–æ—Ç—Ä–µ–Ω–Ω—ã–µ –∞–π—Ç–µ–º—ã –¥–ª—è –∫–∞–∂–¥–æ–≥–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è: 

In [179]:
watched = interactions[["user_id", "item_id"]].groupby("user_id").agg(list).reset_index()
watched_user2items_dictionary = dict(zip(watched["user_id"], watched["item_id"]))

In [None]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_watched_user2items_dictionary.dill", "wb") as f:
    dill.dump(watched_user2items_dictionary, f)

## 1.2. –ê–Ω–∞–ª–∏–∑ —Å–∫–æ—Ä–æ—Å—Ç–∏ –ø–æ–∏—Å–∫–∞ —Ä–µ–∫–æ–º–º–µ–Ω–¥–∞—Ü–∏–π —á–µ—Ä–µ–∑ –æ–±—ë—Ä—Ç–∫—É –∏ —á–µ—Ä–µ–∑ –ø–µ—Ä–µ–º–Ω–æ–∂–µ–Ω–∏–µ –≤–µ–∫—Ç–æ—Ä–æ–≤.

–°–∫–æ—Ä–æ—Å—Ç—å –ø–æ–∏—Å–∫–∞ —á–µ—Ä–µ–∑ –º–æ–¥–µ–ª—å –∏ —á–µ—Ä–µ–∑ –≤–µ–∫—Ç–æ—Ä–∞. –ü—Ä–∏–±–ª–∏–∂–µ–Ω–Ω—ã–π –ø–æ–∏—Å–∫ –ø–æ –≤–µ–∫—Ç–æ—Ä–∞–º —Å–∏–ª—å–Ω–æ –≤—ã–∏–≥—Ä—ã–≤–∞–µ—Ç –ø–æ —Å–∫–æ—Ä–æ—Å—Ç–∏:

In [None]:
internal_test_user = 0
external_test_user = user_inv_mapping[internal_test_user]

user_vector = augmented_user_vectors[internal_test_user]

In [None]:
%%timeit  
target_items = model.recommend(
    [external_test_user],
    dataset=dataset,
    k=10, 
    filter_viewed=False,
    add_rank_col=False,
    items_to_recommend=dataset.item_id_map.external_ids
).item_id.to_numpy()

262 ms ¬± 3.77 ms per loop (mean ¬± std. dev. of 7 runs, 1 loop each)


In [None]:
%%timeit
predicted_items = index.knnQuery(vector=user_vector, k=K)[0]

11.5 ¬µs ¬± 234 ns per loop (mean ¬± std. dev. of 7 runs, 100000 loops each)


In [None]:
target_items = model.recommend(
    [external_test_user], 
    dataset=dataset,
    k=10, 
    filter_viewed=False,
    add_rank_col=False,
    items_to_recommend=dataset.item_id_map.external_ids
).item_id.to_numpy()
target_items

array([15464,  2150,  3351,  4918, 10440,  7829, 10680,  4735, 16018,
       11145])

In [None]:
predicted_items = index.knnQuery(vector=user_vector, k=K)[0]
predicted_items = np.array([item_inv_mapping[item] for item in predicted_items])
predicted_items

array([15464,  2150,  3351,  4918, 10440,  7829, 10680,  4735, 16018,
       11145])

In [None]:
np.isin(target_items, predicted_items)

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

## 1.3. –ú–æ–¥–µ–ª—å ANN Lightfm –¥–ª—è –≥–æ—Ä—è—á–∏—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π

–ì–æ—Ä—è—á–∏–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–∏ - —ç—Ç–æ —Ç–µ, —É –∫–æ—Ç–æ—Ä—ã—Ö –Ω–µ –º–µ–Ω—å—à–µ 5 –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏–π



In [458]:
class SimplePopularModel:
    def __init__(self, users_dictionary, popular_dictionary):
        self.users_dictionary = users_dictionary
        self.popular_dictionary = popular_dictionary

    def predict(self, user_id: int, k_recs: int) -> List[int]:
        try:
            category = self.users_dictionary.get(user_id, None)
            if category:
                return self.popular_dictionary[category][:k_recs]
            return self.popular_dictionary["popular_for_all"][:k_recs]
        except TypeError:
            return [14488, 12192, 9728, 15297, 5543, 10440, 4218, 341, 512, 13865]

In [459]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/popular/users_dictionary.pickle", "rb") as f:
    popular_users = dill.load(f)
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/popular/popular_dictionary.pickle", "rb") as f:
    popular_dict = dill.load(f)

popular_model = SimplePopularModel(popular_users, popular_dict)

In [159]:
user_m = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_user_mapping.dill"
item_inv_m = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_item_inv_mapping.dill"
index_path = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_items_index.hnsw"
user_emb = "/content/drive/MyDrive/RecSys MTC/practice4/models/user_embeddings.dill"
watched_u2i = "/content/drive/MyDrive/RecSys MTC/practice4/models/TRUE_END_watched_user2items_dictionary.dill"

In [480]:
from typing import List


class ANNLightFM:
    def __init__(self, user_m, item_inv_m, index_path, user_emb, watched_u2i, popular_model, K = 10):
        with open(user_m, "rb") as f:
            self.user_m = dill.load(f)
        with open(item_inv_m, "rb") as f:
            self.item_inv_m = dill.load(f)

        self.index = nmslib.init(method='hnsw', space="negdotprod")
        self.index.loadIndex(index_path, load_data=True)

        with open(user_emb, "rb") as f:
            self.user_emb = dill.load(f)
        with open(watched_u2i, "rb") as f:
            self.watched_u2i = dill.load(f)
        self.popular_model = popular_model
        self.K = K
    
    def predict(self, user_id: int) -> List[int]:
        if user_id in self.user_m:
            user_vector = self.user_emb[self.user_m[user_id]]
            pr_internal_items = self.index.knnQuery(
                vector=user_vector, k=self.K
            )[0]
            pr_items = [self.item_inv_m[item] for item in pr_internal_items]

            # Delete already seen items
            pr_items_numpy = np.array(pr_items, dtype="uint16")
            already_seen_items = np.array(
                self.watched_u2i[user_id], dtype="uint16"
            )

            unseen_items = pr_items_numpy[
                ~np.isin(pr_items_numpy, already_seen_items)
            ]
            num_lost_items = self.K - unseen_items.shape[0]
            if num_lost_items > 0:
                popular_items = np.array(
                    self.popular_model.predict(user_id, 5 * self.K)
                )

                popular_items = popular_items[
                    ~np.isin(popular_items, already_seen_items)
                ]
                popular_items = popular_items[
                    ~np.isin(popular_items, unseen_items)
                ]

                unseen_items = np.append(
                    unseen_items, popular_items[:num_lost_items]
                )
                if len(unseen_items) != 10:
                    return self.popular_model.predict(user_id, k_recs=self.K)
            return unseen_items[:self.K].tolist()
        else:
            self.popular_model.predict(user_id, k_recs=self.K)

In [481]:
ann_light_fm = ANNLightFM(user_m, item_inv_m, index_path, user_emb, watched_u2i, popular_model, K=10)

–ü–æ—Å–º–æ—Ç—Ä–∏–º –Ω–∞ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –Ω–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è—Ö:

In [482]:
def recommend(test_user_id: int):
    output_items = np.array(ann_light_fm.predict(test_user_id), dtype='uint16')
    already_seen_items = np.array(watched[watched["user_id"] == test_user_id].item_id.iloc[0], dtype='uint16')

    unseen_items = output_items[~np.isin(output_items, already_seen_items)]
    num_lost_items = 10 - unseen_items.shape[0]
    if num_lost_items != 0:
        popular_items = np.arange(20)

        popular_items = popular_items[~np.isin(popular_items, already_seen_items)]
        popular_items = popular_items[~np.isin(popular_items, unseen_items)]

        unseen_items = np.append(unseen_items, popular_items[:num_lost_items])
    return unseen_items.tolist()

In [487]:
test_user_id = 2
result_items = recommend(test_user_id)

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

Unnamed: 0,user_id,result_items,item_id,title,genres
0,2,1267,1267,–ì–æ—Ä–æ–¥ –≥–µ—Ä–æ–µ–≤,"–±–æ–µ–≤–∏–∫–∏, —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –∫–æ–º–µ–¥–∏–∏"
1,2,13243,13243,–ì–æ–ª–æ–≤–æ–ª–æ–º–∫–∞,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –∫–æ–º–µ–¥–∏–∏"
2,2,11919,11919,–°—É–ø–µ—Ä—Å–µ–º–µ–π–∫–∞,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
3,2,11749,11749,–°—É–ø–µ—Ä—Å–µ–º–µ–π–∫–∞ 2,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
4,2,14488,14488,–ú–∞—Å—Ç–µ—Ä –º–µ—á–∞,"–±–æ–µ–≤–∏–∫–∏, –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–æ–µ"
5,2,12192,12192,–§–µ–º–∏–¥–∞ –≤–∏–¥–∏—Ç,"–¥—Ä–∞–º—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã, –∫–æ–º–µ–¥–∏–∏"
6,2,9728,9728,–ì–Ω–µ–≤ —á–µ–ª–æ–≤–µ—á–µ—Å–∫–∏–π,"–±–æ–µ–≤–∏–∫–∏, —Ç—Ä–∏–ª–ª–µ—Ä—ã"
7,2,15297,15297,–ö–ª–∏–Ω–∏–∫–∞ —Å—á–∞—Å—Ç—å—è,"–¥—Ä–∞–º—ã, –º–µ–ª–æ–¥—Ä–∞–º—ã"
8,2,5543,5543,–¢—É—Ä–∏—Å—Ç,–±–æ–µ–≤–∏–∫–∏
9,2,10440,10440,–•—Ä—É—Å—Ç–∞–ª—å–Ω—ã–π,"—Ç—Ä–∏–ª–ª–µ—Ä—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã"


In [488]:
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
189221,2,age_25_34,income_40_60,–ú,1


–ö–ª–∞—Å—Å–Ω–∞—è —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è, –¥–ª—è –º—É–∂—á–∏–Ω—ã —Å —Ä–µ–±—ë–Ω–∫–æ–º. –°—Ç–∞–≤–∏–º –ö–ª–∞—Å—Å!

## 1.4. –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –¥–ª—è —Ç—ë–ø–ª—ã—Ö –∏ —Ö–æ–ª–æ–¥–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π

–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ —Ç—ë–ø–ª—ã—Ö –∏ —Ö–æ–ª–æ–¥–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π.

–ë—É–¥–µ–º —Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞—Ç—å —Ç–∞–∫–∏–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è–º, —Ç–æ —á—Ç–æ —Å–º–æ—Ç—Ä—è—Ç —Ç–∞–∫–∏–µ –∂–µ –≥–æ—Ä—è—á–∏–µ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–∏. –°—Ö–æ–¥—Å—Ç–≤–æ–º –º–µ–∂–¥—É —Ö–æ–ª–æ–¥–Ω—ã–º–∏ –∏ –≥–æ—Ä—è—á–∏–º–∏ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è–º–∏ –±—É–¥–µ—Ç —è–≤–ª—è—Ç—å—Å—è —Å—Ö–æ–¥—Å—Ç–≤–æ –∏—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: –≤–æ–∑—Ä–∞—Å—Ç, –¥–æ—Ö–æ–¥, –ø–æ–ª, —Ñ–ª–∞–≥ –¥–µ—Ç–µ–π.

–§–∏–ª—å—Ç—Ä—É–µ–º —Ç—ë–ø–ª—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π. –í–æ–∑—å–º—ë–º >= 5 –ø—Ä–æ—Å–º–æ—Ç—Ä–æ–≤:

In [508]:
# Preparing of the Hot Users
threshold = 5
users_inter_count = interactions.groupby("user_id")["item_id"].count()
hot_users = users_inter_count[users_inter_count >= threshold].index.values
warm_users = users_inter_count[users_inter_count < threshold].index.values

interactions = interactions[interactions["user_id"].isin(hot_users)].sort_values(["user_id", "datetime"])
interactions["order"] = interactions.groupby("user_id").cumcount(ascending=False)
interactions["order"] = interactions["order"].astype(np.uint16)

print("Hot users: ", hot_users.shape[0])
print("Hot interactions: ", interactions.shape[0])
interactions

Hot users:  302486
Hot interactions:  4290596


Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight,order
3590116,0,12192,2021-07-16,89,0.0,1,5
620,0,7102,2021-07-19,169,3.0,1,4
67070,0,14359,2021-07-19,130,2.0,1,3
90113,0,15297,2021-07-19,459,0.0,1,2
3103040,0,9728,2021-07-19,4,0.0,1,1
...,...,...,...,...,...,...,...
3629451,1097555,4662,2021-04-08,775,14.0,2,4
5172184,1097555,4880,2021-04-22,7117,9.0,1,3
3498963,1097555,6916,2021-05-09,740,14.0,2,2
405171,1097555,14703,2021-06-21,234,4.0,1,1


In [509]:
known_warm_users_features = users[users["user_id"].isin(warm_users)]

known_warm_users = known_warm_users_features["user_id"].to_numpy()
unknown_warm_users = np.setdiff1d(warm_users, known_warm_users)

known_warm_users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
5,1037719,age_45_54,income_60_90,–ú,0
11,312520,age_35_44,income_90_150,–ñ,0
13,382508,age_18_24,income_20_40,–ú,0
15,628684,age_35_44,income_40_60,–ú,0
16,73728,age_45_54,income_40_60,–ú,0
...,...,...,...,...,...
840188,312839,age_65_inf,income_60_90,–ñ,0
840189,191349,age_45_54,income_40_60,–ú,1
840192,339025,age_65_inf,income_0_20,–ñ,0
840194,251008,Unknown,Unknown,Unknown,0


In [510]:
unknown_warm_users_features = pd.DataFrame(data={
    "user_id": unknown_warm_users,
    "age": "Unknown",
    "income": "Unknown",
    "sex": "Unknown",
    "kids_flg": "Unknown",
})
unknown_warm_users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
0,12,Unknown,Unknown,Unknown,Unknown
1,14,Unknown,Unknown,Unknown,Unknown
2,19,Unknown,Unknown,Unknown,Unknown
3,24,Unknown,Unknown,Unknown,Unknown
4,27,Unknown,Unknown,Unknown,Unknown
...,...,...,...,...,...
160803,1097510,Unknown,Unknown,Unknown,Unknown
160804,1097515,Unknown,Unknown,Unknown,Unknown
160805,1097536,Unknown,Unknown,Unknown,Unknown
160806,1097545,Unknown,Unknown,Unknown,Unknown


In [511]:
all_users_in_interactions = np.append(hot_users, warm_users)
cold_users_only_in_users = users[~users["user_id"].isin(all_users_in_interactions)]
cold_users_only_in_users

Unnamed: 0,user_id,age,income,sex,kids_flg
2,1047345,age_45_54,income_40_60,–ñ,0
6,391756,age_25_34,income_0_20,–ú,0
7,15878,age_25_34,income_40_60,–ú,1
10,99952,Unknown,Unknown,–ú,0
19,1067802,age_35_44,income_40_60,–ú,0
...,...,...,...,...,...
840180,157810,age_25_34,income_20_40,–ñ,0
840185,1021814,age_45_54,income_20_40,–ñ,0
840191,365945,age_25_34,income_20_40,–ñ,0
840193,983617,age_18_24,income_20_40,–ñ,1


–¢–∞–±–ª–∏—Ü–∞ ```warm_users_features``` —Å–æ–¥–µ—Ä–∂–∏—Ç –ø—Ä–∏–∑–Ω–∞–∫–∏ –≤—Å–µ—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π, –∫–æ—Ç–æ—Ä—ã–µ –Ω–µ –∏–º–µ—é—Ç 5 –∏–ª–∏ –±–æ–ª–µ–µ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏–π, –≤–∫–ª—é—á–∞—è —Ç—ë–ø–ª—ã—Ö –∏ —Ö–æ–ª–æ–¥–Ω—ã—Ö. 

–í —Ç–∞–±–ª–∏—Ü–µ —Å–æ–¥–µ—Ä–∂–∞—Ç—Å—è NaN –∑–Ω–∞—á–µ–Ω–∏—è –ø—Ä–∏–∑–Ω–∞–∫–æ–≤. –î–∞–ª–µ–µ –¥–∞–¥–∏–º —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –Ω–∞ –æ—Å–Ω–æ–≤–µ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π:

In [513]:
users_features = pd.concat([known_warm_users_features, unknown_warm_users_features, cold_users_only_in_users])
users_features

Unnamed: 0,user_id,age,income,sex,kids_flg
5,1037719,age_45_54,income_60_90,–ú,0
11,312520,age_35_44,income_90_150,–ñ,0
13,382508,age_18_24,income_20_40,–ú,0
15,628684,age_35_44,income_40_60,–ú,0
16,73728,age_45_54,income_40_60,–ú,0
...,...,...,...,...,...
840180,157810,age_25_34,income_20_40,–ñ,0
840185,1021814,age_45_54,income_20_40,–ñ,0
840191,365945,age_25_34,income_20_40,–ñ,0
840193,983617,age_18_24,income_20_40,–ñ,1


In [514]:
users_features["value"] = users_features.set_index(["age", "income", "sex", "kids_flg"]).index.values
users_features.drop(["age", "income", "sex", "kids_flg"], axis=1, inplace=True)

users_features["value"] = users_features["value"].apply(lambda x: "_".join(x))

u2f_dictionary = dict(zip(users_features.user_id, users_features["value"]))
u2f_dictionary[1037719], len(u2f_dictionary)

('age_45_54_income_60_90_–ú_0', 755602)

In [515]:
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/u2f_dictionary.dill", "wb") as f:
    dill.dump(u2f_dictionary, f)

In [516]:
users_features.head()

Unnamed: 0,user_id,value
5,1037719,age_45_54_income_60_90_–ú_0
11,312520,age_35_44_income_90_150_–ñ_0
13,382508,age_18_24_income_20_40_–ú_0
15,628684,age_35_44_income_40_60_–ú_0
16,73728,age_45_54_income_40_60_–ú_0


In [517]:
users_features.groupby(["value"]).agg(list)

Unnamed: 0_level_0,user_id
value,Unnamed: 1_level_1
Unknown_Unknown_Unknown_0,"[456259, 680143, 553353, 1093176, 1003448, 375198, 111310, 933594, 780572, 58119, 113691, 918472, 129630, 960243, 828461, 368485, 443043, 181333, 413749, 396818, 221226, 620661, 395965, 284616, 73..."
Unknown_Unknown_Unknown_Unknown,"[12, 14, 19, 24, 27, 35, 43, 44, 52, 62, 68, 71, 77, 87, 89, 92, 94, 100, 107, 111, 112, 115, 116, 127, 136, 150, 156, 162, 177, 187, 193, 207, 208, 218, 227, 234, 245, 246, 250, 252, 266, 279, 28..."
Unknown_Unknown_–ñ_0,"[342508, 614599, 743033, 622319, 403645, 707773, 377961, 757983, 662105, 752014, 765884, 493933, 862440, 156850, 408200, 163723, 634785, 920363, 493779, 299129, 1077599, 169877, 251975, 801207, 50..."
Unknown_Unknown_–ú_0,"[901774, 930608, 571870, 1050313, 925901, 368801, 808826, 5365, 699197, 560802, 270168, 90957, 806652, 548587, 507492, 805087, 927103, 10434, 870885, 878297, 837891, 163482, 169969, 488912, 635760..."
Unknown_income_0_20_Unknown_0,[816632]
...,...
age_65_inf_income_60_90_–ú_1,"[31031, 32467, 238930, 999933, 381390, 743661, 399960, 764838, 149241, 596576, 689981, 56400, 67186, 865379, 139590, 570305, 746913, 57279, 363791, 64203, 10823, 179225, 870051, 943424, 1055488, 7..."
age_65_inf_income_90_150_–ñ_0,"[475463, 265937, 635097, 643579, 273078, 471435, 760140, 970965, 252386, 564757, 304747, 524212, 794619, 602456, 407943, 952933, 140964, 358938, 774251, 852554, 378004, 967451, 333211, 778311, 324..."
age_65_inf_income_90_150_–ñ_1,"[638054, 478090, 1034093, 643299, 381564, 946901, 562866, 547016, 647691, 441153, 874397, 26196, 81261, 557218, 762631, 76663, 274602, 775371, 232517, 758609, 377972, 913140, 458985]"
age_65_inf_income_90_150_–ú_0,"[838341, 339405, 623675, 408289, 220310, 467076, 274986, 393112, 288366, 195968, 755323, 591198, 356391, 411457, 1024300, 891522, 434503, 993054, 875078, 13056, 147939, 1022534, 244300, 810157, 39..."


–ü—Ä–∏–∑–Ω–∞–∫–∏ –¥–ª—è –≥–æ—Ä—è—á–∏—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π, –∏ —Å–ª—É—á–∞–π–Ω—ã–µ –ø—è—Ç—å –ø—Ä–µ–¥—Å—Ç–∞–≤–∏—Ç–µ–ª–µ–π –¥–ª—è —ç—Ç–∏—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ —Ç–∞–±–ª–∏—Ü–µ ```hot_users_features```:

In [399]:
hot_users_features = users[users["user_id"].isin(hot_users)]
hot_users_features["value"] = hot_users_features.set_index(["age", "income", "sex", "kids_flg"]).index.values
hot_users_features.drop(["age", "income", "sex", "kids_flg"], axis=1, inplace=True)

hot_users_features["value"] = hot_users_features["value"].apply(lambda x: "_".join(x))

hot_users_features = hot_users_features.sample(frac=1).groupby(["value"]).head(5).groupby(["value"]).agg(list)
hot_users_features.tail()

Unnamed: 0_level_0,user_id
value,Unnamed: 1_level_1
age_65_inf_income_60_90_–ú_1,"[67568, 294906, 107715, 524828, 1060473]"
age_65_inf_income_90_150_–ñ_0,"[119955, 1024749, 802504, 217561, 934093]"
age_65_inf_income_90_150_–ñ_1,"[452243, 251538, 775784, 252185, 113636]"
age_65_inf_income_90_150_–ú_0,"[791672, 265173, 887554, 151719, 265316]"
age_65_inf_income_90_150_–ú_1,"[274568, 798042, 420105, 462309, 210043]"


In [None]:
import random

def recommend_list(user_ids, k=10):
    result = set()
    for test_user_id in user_ids:
        output_items = np.array(ann_light_fm.predict(test_user_id), dtype='uint16')
        already_seen_items = np.array(watched[watched["user_id"] == test_user_id].item_id.iloc[0], dtype='uint16')

        unseen_items = output_items[~np.isin(output_items, already_seen_items)]
        num_lost_items = 10 - unseen_items.shape[0]
        if num_lost_items != 0:
            popular_items = np.arange(20)

            popular_items = popular_items[~np.isin(popular_items, already_seen_items)]
            popular_items = popular_items[~np.isin(popular_items, unseen_items)]

            unseen_items = np.append(unseen_items, popular_items[:num_lost_items])

        result.update(unseen_items.tolist())
    result = list(result)
    random.shuffle(result)
    return result[:k]

In [518]:
hot_users_features["reco_item_ids"] = hot_users_features["user_id"].apply(lambda user_list: recommend_list(user_list))
hot_users_features

Unnamed: 0_level_0,user_id,reco_item_ids
value,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown_Unknown_Unknown_0,"[564658, 563169, 1080456, 959513, 1057900]","[13865, 14378, 849, 676, 15472, 11231, 13955, 3402, 5326, 2346]"
Unknown_Unknown_–ñ_0,"[498933, 864263, 26165, 1092424, 753178]","[2954, 12965, 14431, 10942, 12192, 12995, 12537, 4740, 5543, 7310]"
Unknown_Unknown_–ú_0,"[744205, 290331, 1015100, 342105, 697342]","[4774, 11863, 1554, 3734, 11310, 9996, 13865, 10256, 142, 5543]"
Unknown_income_0_20_–ñ_0,"[258459, 875491, 903297, 428464]","[142, 15266, 15915, 14025, 4151, 3095, 11756, 10440, 2616, 7216]"
Unknown_income_0_20_–ñ_1,[92532],"[14470, 341, 4218, 4731, 11778, 15531, 14488, 5543, 10440, 9728]"
...,...,...
age_65_inf_income_60_90_–ú_1,"[67568, 294906, 107715, 524828, 1060473]","[3547, 8636, 12192, 4880, 11749, 4218, 4943, 12096, 13865, 24]"
age_65_inf_income_90_150_–ñ_0,"[119955, 1024749, 802504, 217561, 934093]","[4179, 11778, 4218, 13865, 696, 3734, 4880, 7545, 13058, 4151]"
age_65_inf_income_90_150_–ñ_1,"[452243, 251538, 775784, 252185, 113636]","[11640, 9728, 13865, 341, 4218, 3734, 6086, 7107, 15706, 4151]"
age_65_inf_income_90_150_–ú_0,"[791672, 265173, 887554, 151719, 265316]","[12837, 15297, 15266, 142, 8254, 6208, 14741, 7417, 11756, 2956]"


–ú–µ—Ä–∂–∏–º —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏

In [519]:
users_features.head()

Unnamed: 0,user_id,value
5,1037719,age_45_54_income_60_90_–ú_0
11,312520,age_35_44_income_90_150_–ñ_0
13,382508,age_18_24_income_20_40_–ú_0
15,628684,age_35_44_income_40_60_–ú_0
16,73728,age_45_54_income_40_60_–ú_0


In [520]:
hot_users_features.head()

Unnamed: 0_level_0,user_id,reco_item_ids
value,Unnamed: 1_level_1,Unnamed: 2_level_1
Unknown_Unknown_Unknown_0,"[564658, 563169, 1080456, 959513, 1057900]","[13865, 14378, 849, 676, 15472, 11231, 13955, 3402, 5326, 2346]"
Unknown_Unknown_–ñ_0,"[498933, 864263, 26165, 1092424, 753178]","[2954, 12965, 14431, 10942, 12192, 12995, 12537, 4740, 5543, 7310]"
Unknown_Unknown_–ú_0,"[744205, 290331, 1015100, 342105, 697342]","[4774, 11863, 1554, 3734, 11310, 9996, 13865, 10256, 142, 5543]"
Unknown_income_0_20_–ñ_0,"[258459, 875491, 903297, 428464]","[142, 15266, 15915, 14025, 4151, 3095, 11756, 10440, 2616, 7216]"
Unknown_income_0_20_–ñ_1,[92532],"[14470, 341, 4218, 4731, 11778, 15531, 14488, 5543, 10440, 9728]"


–ü–æ–ª—É—á–∏–ª–∏ —Ç–∞–∫–∏–µ –∂–µ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –¥–ª—è —Ö–æ–ª–æ–¥–Ω—ã—Ö –∏ —Ç—ë–ª–ø—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π, –∫–∞–∫ –∏ –¥–ª—è –≥–æ—Ä—è—á–∏—Ö –ø–æ —Å—Ö–æ–¥—Å—Ç–≤—É –ø—Ä–∏–∑–Ω–∞–∫–æ–≤.

–í —Ç–∞–±–ª–∏—Ü–µ –≤—Å—ë –µ—â–µ –æ—Å—Ç–∞—é—Ç—Å—è NaN –≤ —Å—Ç–æ–ª–±—Ü–µ ```reco_item_ids```

In [523]:
recos = users_features.merge(hot_users_features.reset_index()[["value", "reco_item_ids"]], how="left", on="value")
recos

Unnamed: 0,user_id,value,reco_item_ids
0,1037719,age_45_54_income_60_90_–ú_0,"[10440, 5543, 9785, 2657, 11863, 13020, 5115, 10226, 14488, 4218]"
1,312520,age_35_44_income_90_150_–ñ_0,"[3734, 142, 16166, 13865, 14264, 7825, 341, 12974, 12995, 9996]"
2,382508,age_18_24_income_20_40_–ú_0,"[7571, 14488, 10119, 3734, 11237, 12965, 1916, 5543, 4218, 10440]"
3,628684,age_35_44_income_40_60_–ú_0,"[10440, 13018, 12841, 9728, 12173, 4151, 3682, 8636, 4880, 7829]"
4,73728,age_45_54_income_40_60_–ú_0,"[4740, 9996, 7793, 1287, 4457, 341, 12192, 5411, 10440, 849]"
...,...,...,...
755597,157810,age_25_34_income_20_40_–ñ_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755598,1021814,age_45_54_income_20_40_–ñ_0,"[13865, 12192, 3182, 7829, 4880, 2657, 512, 9728, 4218, 4151]"
755599,365945,age_25_34_income_20_40_–ñ_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755600,983617,age_18_24_income_20_40_–ñ_1,"[10323, 11985, 13915, 10761, 13243, 8821, 8584, 4457, 14488, 6646]"


In [524]:
recos[recos["reco_item_ids"].isna()]["value"].unique()

array(['age_65_inf_income_150_inf_Unknown_0',
       'age_65_inf_income_150_inf_–ñ_1', 'age_55_64_income_150_inf_–ñ_1',
       'age_55_64_income_150_inf_–ñ_0', 'age_18_24_income_0_20_Unknown_1',
       'age_25_34_income_0_20_Unknown_1',
       'age_45_54_income_0_20_Unknown_0',
       'age_55_64_income_60_90_Unknown_1', 'age_18_24_income_150_inf_–ú_0',
       'age_35_44_income_0_20_Unknown_1',
       'age_35_44_income_0_20_Unknown_0',
       'age_65_inf_income_0_20_Unknown_0',
       'Unknown_income_0_20_Unknown_0',
       'age_18_24_income_40_60_Unknown_1',
       'Unknown_income_60_90_Unknown_0',
       'age_55_64_income_90_150_Unknown_0',
       'age_35_44_income_90_150_Unknown_1', 'Unknown_income_90_150_–ú_1',
       'age_25_34_income_60_90_Unknown_1',
       'age_35_44_income_150_inf_Unknown_0',
       'Unknown_Unknown_Unknown_Unknown',
       'age_45_54_income_0_20_Unknown_1'], dtype=object)

–í—Ä—É—á–Ω—É—é –∑–∞–ø–æ–ª–Ω–∏–º –ø—Ä–æ–ø—É—Å–∫–∏ –≤ –ø—Ä–∏–∑–Ω–∞–∫–∞—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π, —á—Ç–æ–±—ã –º–æ–∂–Ω–æ –±—ã–ª–æ —Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞—Ç—å –ø–æ –ø–æ—Ö–æ–∂–∏–º –≥–æ—Ä—è—á–∏–º:

In [521]:
lost_features_dict = {
    'age_65_inf_income_150_inf_Unknown_0': 'age_65_inf_income_150_inf_–ñ_0',

    'age_65_inf_income_150_inf_–ñ_1': 'age_65_inf_income_150_inf_–ñ_1',  
    'age_55_64_income_150_inf_–ñ_1': 'age_55_64_income_150_inf_–ñ_1',
    'age_55_64_income_150_inf_–ñ_0': 'age_55_64_income_150_inf_–ñ_0', 

    'age_18_24_income_0_20_Unknown_1': 'age_18_24_income_0_20_–ú_1',
    'age_25_34_income_0_20_Unknown_1': 'age_25_34_income_0_20_–ñ_1',
    'age_45_54_income_0_20_Unknown_0': 'age_45_54_income_0_20_–ú_0',
    'age_55_64_income_60_90_Unknown_1': 'age_55_64_income_60_90_–ú_1', 
    'age_18_24_income_150_inf_–ú_0': 'age_18_24_income_150_inf_–ú_0',
    'age_35_44_income_0_20_Unknown_1': 'age_35_44_income_0_20_–ú_1',
    'age_35_44_income_0_20_Unknown_0': 'age_35_44_income_0_20_–ú_0',
    'age_65_inf_income_0_20_Unknown_0': 'age_65_inf_income_0_20_–ñ_0',
    'Unknown_income_0_20_Unknown_0': 'age_25_34_income_0_20_–ú_0',
    'age_18_24_income_40_60_Unknown_1': 'age_18_24_income_40_60_–ñ_1',
    'Unknown_income_60_90_Unknown_0': 'age_25_34_income_60_90_–ú_0',
    'age_55_64_income_90_150_Unknown_0': 'age_55_64_income_90_150_–ñ_0',
    'age_35_44_income_90_150_Unknown_1': 'age_35_44_income_90_150_–ú_1',
    'Unknown_income_90_150_–ú_1': 'age_25_34_income_90_150_–ú_1',
    'age_25_34_income_60_90_Unknown_1': 'age_25_34_income_60_90_–ú_1',
    'age_35_44_income_150_inf_Unknown_0': 'age_35_44_income_150_inf_–ú_0',
    'Unknown_Unknown_Unknown_Unknown': 'Unknown_Unknown_Unknown_Unknown',
    'age_45_54_income_0_20_Unknown_1': 'age_45_54_income_0_20_–ú_1'
}

In [525]:
nan_items_mask = recos["reco_item_ids"].isna()
recos[nan_items_mask]

Unnamed: 0,user_id,value,reco_item_ids
30042,374937,age_65_inf_income_150_inf_Unknown_0,
63785,852699,age_65_inf_income_150_inf_–ñ_1,
72863,384049,age_65_inf_income_150_inf_–ñ_1,
77102,964249,age_55_64_income_150_inf_–ñ_1,
85411,185027,age_55_64_income_150_inf_–ñ_0,
...,...,...,...
659692,1097552,Unknown_Unknown_Unknown_Unknown,
677519,298642,age_45_54_income_0_20_Unknown_0,
715935,22453,age_35_44_income_0_20_Unknown_0,
726569,504226,age_45_54_income_0_20_Unknown_1,


In [526]:
for k, v in lost_features_dict.items():
    reco_list_for_user = recos[recos["value"] == v]["reco_item_ids"].iloc[0]
    if isinstance(reco_list_for_user, list):
        recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"] = \
            recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"].apply(lambda x: reco_list_for_user)
    else:
        recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"] = \
            recos.loc[nan_items_mask & (recos["value"] == k), "reco_item_ids"].apply(lambda x: popular_dict["popular_for_all"][:10])

In [527]:
recos["reco_item_ids"].isna().sum()

0

In [568]:
recos

Unnamed: 0,user_id,value,reco_item_ids
0,1037719,age_45_54_income_60_90_–ú_0,"[10440, 5543, 9785, 2657, 11863, 13020, 5115, 10226, 14488, 4218]"
1,312520,age_35_44_income_90_150_–ñ_0,"[3734, 142, 16166, 13865, 14264, 7825, 341, 12974, 12995, 9996]"
2,382508,age_18_24_income_20_40_–ú_0,"[7571, 14488, 10119, 3734, 11237, 12965, 1916, 5543, 4218, 10440]"
3,628684,age_35_44_income_40_60_–ú_0,"[10440, 13018, 12841, 9728, 12173, 4151, 3682, 8636, 4880, 7829]"
4,73728,age_45_54_income_40_60_–ú_0,"[4740, 9996, 7793, 1287, 4457, 341, 12192, 5411, 10440, 849]"
...,...,...,...
755597,157810,age_25_34_income_20_40_–ñ_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755598,1021814,age_45_54_income_20_40_–ñ_0,"[13865, 12192, 3182, 7829, 4880, 2657, 512, 9728, 4218, 4151]"
755599,365945,age_25_34_income_20_40_–ñ_0,"[9842, 142, 4218, 5411, 12995, 15297, 12192, 13159, 16029, 9996]"
755600,983617,age_18_24_income_20_40_–ñ_1,"[10323, 11985, 13915, 10761, 13243, 8821, 8584, 4457, 14488, 6646]"


–°–æ—Ö—Ä–∞–Ω—è–µ–º —Å–ª–æ–≤–∞—Ä—å —Å —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è–º–∏ –¥–ª—è —Ö–æ–ª–æ–¥–Ω—ã—Ö –∏ —Ç—ë–ø–ª—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π:

In [529]:
lightfm_users_reco_dictionary = dict(zip(recos["user_id"], recos["reco_item_ids"]))
with open("/content/drive/MyDrive/RecSys MTC/practice4/models/lightfm_users_reco_dictionary_popular.dill", "wb") as f:
    dill.dump(lightfm_users_reco_dictionary, f)

## 1.5. –°–º–æ—Ç—Ä–∏–º üëÄ –Ω–∞ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ 

–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –¥–ª—è —Ö–æ–ª–æ–¥–æ–≥–æ/—Ç—ë–ø–ª–æ–≥–æ:

In [553]:
test_user_id = 10
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
715105,10,age_18_24,income_40_60,–ú,0


In [554]:
if test_user_id in lightfm_users_reco_dictionary:
    print(lightfm_users_reco_dictionary[test_user_id])
else:
    print("–ì–æ—Ä—è—á–∏–π")

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

[7571, 15297, 16166, 4151, 3734, 13159, 4436, 13915, 4475, 9728]


Unnamed: 0,user_id,result_items,item_id,title,genres
0,10,1267,1267,–ì–æ—Ä–æ–¥ –≥–µ—Ä–æ–µ–≤,"–±–æ–µ–≤–∏–∫–∏, —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –∫–æ–º–µ–¥–∏–∏"
1,10,13243,13243,–ì–æ–ª–æ–≤–æ–ª–æ–º–∫–∞,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –∫–æ–º–µ–¥–∏–∏"
2,10,11919,11919,–°—É–ø–µ—Ä—Å–µ–º–µ–π–∫–∞,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
3,10,11749,11749,–°—É–ø–µ—Ä—Å–µ–º–µ–π–∫–∞ 2,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Ñ–∏–ª—å–º, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
4,10,14488,14488,–ú–∞—Å—Ç–µ—Ä –º–µ—á–∞,"–±–æ–µ–≤–∏–∫–∏, –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–æ–µ"
5,10,12192,12192,–§–µ–º–∏–¥–∞ –≤–∏–¥–∏—Ç,"–¥—Ä–∞–º—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã, –∫–æ–º–µ–¥–∏–∏"
6,10,9728,9728,–ì–Ω–µ–≤ —á–µ–ª–æ–≤–µ—á–µ—Å–∫–∏–π,"–±–æ–µ–≤–∏–∫–∏, —Ç—Ä–∏–ª–ª–µ—Ä—ã"
7,10,15297,15297,–ö–ª–∏–Ω–∏–∫–∞ —Å—á–∞—Å—Ç—å—è,"–¥—Ä–∞–º—ã, –º–µ–ª–æ–¥—Ä–∞–º—ã"
8,10,5543,5543,–¢—É—Ä–∏—Å—Ç,–±–æ–µ–≤–∏–∫–∏
9,10,10440,10440,–•—Ä—É—Å—Ç–∞–ª—å–Ω—ã–π,"—Ç—Ä–∏–ª–ª–µ—Ä—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã"


–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è –¥–ª—è –≥–æ—Ä—è—á–µ–≥–æ:

In [560]:
interactions["user_id"].unique()[:20]

array([ 0,  2,  3,  5,  9, 11, 13, 15, 21, 30, 32, 37, 41, 46, 47, 53, 55,
       59, 60, 61])

In [574]:
test_user_id = 46
users[users["user_id"] == test_user_id]

Unnamed: 0,user_id,age,income,sex,kids_flg
123407,46,age_25_34,income_20_40,–ñ,0


In [575]:
result_items = recommend(test_user_id)

pd.DataFrame(data={
    "user_id": test_user_id,
    "result_items": result_items
}).merge(items[["item_id", "title", "genres"]], how="left", left_on="result_items", right_on="item_id")

Unnamed: 0,user_id,result_items,item_id,title,genres
0,46,9728,9728,–ì–Ω–µ–≤ —á–µ–ª–æ–≤–µ—á–µ—Å–∫–∏–π,"–±–æ–µ–≤–∏–∫–∏, —Ç—Ä–∏–ª–ª–µ—Ä—ã"
1,46,15297,15297,–ö–ª–∏–Ω–∏–∫–∞ —Å—á–∞—Å—Ç—å—è,"–¥—Ä–∞–º—ã, –º–µ–ª–æ–¥—Ä–∞–º—ã"
2,46,4151,4151,–°–µ–∫—Ä–µ—Ç—ã —Å–µ–º–µ–π–Ω–æ–π –∂–∏–∑–Ω–∏,–∫–æ–º–µ–¥–∏–∏
3,46,13865,13865,–î–µ–≤—è—Ç–∞–µ–≤,"–¥—Ä–∞–º—ã, –≤–æ–µ–Ω–Ω—ã–µ, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
4,46,1844,1844,–ê—Ñ–µ—Ä–∏—Å—Ç–∫–∞,"—Ç—Ä–∏–ª–ª–µ—Ä—ã, –∫–æ–º–µ–¥–∏–∏"
5,46,3734,3734,–ü—Ä–∞–±–∞–±—É—à–∫–∞ –ª–µ–≥–∫–æ–≥–æ –ø–æ–≤–µ–¥–µ–Ω–∏—è,–∫–æ–º–µ–¥–∏–∏
6,46,657,657,–ó–∞—â–∏—Ç–Ω–∏–∫,"–¥—Ä–∞–º—ã, —Ç—Ä–∏–ª–ª–µ—Ä—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã"
7,46,7571,7571,100% –≤–æ–ª–∫,"–º—É–ª—å—Ç—Ñ–∏–ª—å–º, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è, —Å–µ–º–µ–π–Ω–æ–µ, —Ñ—ç–Ω—Ç–µ–∑–∏, –∫–æ–º–µ–¥–∏–∏"
8,46,14488,14488,–ú–∞—Å—Ç–µ—Ä –º–µ—á–∞,"–±–æ–µ–≤–∏–∫–∏, –∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–æ–µ"
9,46,12192,12192,–§–µ–º–∏–¥–∞ –≤–∏–¥–∏—Ç,"–¥—Ä–∞–º—ã, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã, –∫–æ–º–µ–¥–∏–∏"


–ü–æ–¥–º–µ—à–∏–≤–∞—é—Ç—Å—è –∫–æ–º–µ–¥–∏–∏, –¥—Ä–∞–º—ã –∏ –º–µ–ª–æ–¥—Ä–∞–º—ã 

In [576]:
import random

random.randint(1, 10)

5

# Avatars

In [637]:
avatars_users = pd.read_csv(
    "/content/drive/MyDrive/RecSys MTC/practice4/models/avatars/avatars_users.csv",
    usecols=["user_id", "age", "income", "sex", "kids_flg"],
)
avatars_interactions = pd.read_csv(
    "/content/drive/MyDrive/RecSys MTC/practice4/models/avatars/avatars_interactions.csv",
    usecols=[
        "user_id",
        "item_id",
        "last_watch_dt",
        "total_dur",
        "watched_pct",
        "weight",
    ],
)

In [613]:
Columns.Datetime = "datetime"
avatars_interactions.rename(columns={'last_watch_dt': "datetime"}, inplace=True) 

In [614]:
avatars_interactions[Columns.Datetime] = pd.to_datetime(
    avatars_interactions[Columns.Datetime], format="%Y-%m-%d"
)

–ì–µ–Ω–µ—Ä–∏—Ä—É–µ–º –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π.

–ù–µ–æ–±—Ö–æ–¥–∏–º–æ:
1. –°–æ–∑–¥–∞—Ç—å –∏—Å–∫—É—Å—Å—Ç–≤–µ–Ω–Ω—ã–µ –≤–∑–∞–∏–º–æ–¥–µ–π—Å—Ç–≤–∏—è –∏ –ø—Ä–∏–∑–Ω–∞–∫–∏ —ç—Ç–∏–º –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è–º;
2. –ó–∞–Ω–æ–≤–æ —Å–æ–±—Ä–∞—Ç—å –¥–∞—Ç–∞—Å–µ—Ç, –¥–æ–±–∞–≤–∏–≤ –¥–∞–Ω–Ω—ã–µ —ç—Ç–∏—Ö –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª–µ–π –≤ interactions, users -> user_features;
3. –ü–æ—Å–º–æ—Ç—Ä–µ—Ç—å –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –º–æ–¥–µ–ª–∏ –∏ —Å–¥–µ–ª–∞—Ç—å –≤—ã–≤–æ–¥—ã;

–õ–æ–≥–∏—á–Ω–æ –ø—Ä–µ–¥–ø–æ–ª–æ–∂–∏—Ç—å, —á—Ç–æ —á–µ–ª–æ–≤–µ–∫—É, –∫–æ—Ç–æ—Ä—ã–π —Å–º–æ—Ç—Ä–∏—Ç –≤—Å–µ –∂–∞–Ω—Ä—ã –ø–æ–¥—Ä—è–¥, –ª–µ–≥—á–µ —Ä–µ–∫–æ–º–µ–Ω–¥–æ–≤–∞—Ç—å –ø—Ä–µ–¥–º–µ—Ç—ã –∏ –º–µ–Ω—å—à–µ —à–∞–Ω—Å–æ–≤ –Ω–µ —É–≥–æ–¥–∏—Ç—å –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—é. –ü–æ–ø—Ä–æ–±—É–µ–º —Å–æ–∑–¥–∞—Ç—å –Ω–µ–º–Ω–æ–≥–æ —Ö–∞—Ä–¥–∫–æ—Ä–Ω—ã—Ö –ø–µ—Ä—Å–æ–Ω–∞–∂–µ–π, —á—Ç–æ–±—ã —É—Å—Ç—Ä–æ–∏—Ç—å –ø—Ä–æ—Ç–µ—Å—Ç–∏—Ä–æ–≤–∞—Ç—å –Ω–∞—à—É —Ñ–∞–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏–æ–Ω–Ω—É—é –º–∞—à–∏–Ω—É –≤ –¥–µ–ª–µ:

* <font size='4'>–≤–∑—Ä–æ—Å–ª—ã–π —Ä–∞–±–æ—Ç—è—â–∏–π –º—É–∂—á–∏–Ω–∞ —Å –∑–∞—Ä–ø–ª–∞—Ç–æ–π –¥–æ 20–∫, –∫–æ—Ç–æ—Ä—ã–π —Å–º–æ—Ç—Ä–∏—Ç —Ç–æ–ª—å–∫–æ –º—É–ª—å—Ç–∏–∫–∏ –∏ –ø—Ä–∏ —ç—Ç–æ–º –Ω–µ—Ç kid —Ñ–ª–∞–≥–∞;</font>
    * –∏–Ω—Ç–µ—Ä–µ—Å–Ω—ã–π –∫–µ–π—Å, —Å —á–µ–ª–æ–≤–µ—á–µ—Å–∫–æ–π —Ç–æ—á–∫–∏ –∑—Ä–µ–Ω–∏—è –≤–µ—Ä–æ—è—Ç–Ω–µ–µ, —á—Ç–æ –Ω–µ –ø–æ—Å—Ç–∞–≤–∏–ª–∏ —Ñ–ª–∞–∂–æ–∫, –∞ —Å–µ—Ä–≤–∏—Å–æ–º –ø–æ–ª—å–∑—É–µ—Ç—Å—è —á–∏—Å—Ç–æ —Ä–µ–±–µ–Ω–æ–∫, –∏–Ω—Ç–µ—Ä–µ—Å–Ω–æ –±—É–¥–µ—Ç –≤–∑–≥–ª—è–Ω—É—Ç—å –Ω–∞ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –º–æ–¥–µ–ª–∏
* <font size='4'>–º–æ–ª–æ–¥–æ–π –ø–∞—Ä–µ–Ω—å c —Ä–µ–±–µ–Ω–∫–æ–º (–Ω–æ –±–µ–∑ –º—É–ª—å—Ç–∏–∫–æ–≤), –∑–∞—Ä–∞–±–∞—Ç—ã–≤–∞—é—â–∏–π –æ—Ç 150–∫, –≤ —Ü–µ–ª–æ–º –∏–Ω—Ç–µ—Ä–µ—Å—É—é—â–∏–π—Å—è –Ω–∞—É—á–Ω–æ-–ø–æ–ø—É–ª—è—Ä–Ω—ã–º, –Ω–æ –∏–º–µ—é—â–µ–≥–æ –ø–∞—Ä—É –∞–π—Ç–µ–º–æ–≤ –¥—Ä—É–≥–∏—Ö –∂–∞–Ω—Ä–æ–≤ </font>
    * –•–≤–∞—Ç–∞–µ–º—Å—è –∑–∞ —Å–ø–µ—Ü–∏—Ñ–∏—á–Ω—É—é, –Ω–µ –≤—ã–∏–≥—Ä—ã–≤–∞—é—â—É—é –ø–æ –ø–æ–ª—É–ª—è—Ä–Ω–æ—Å—Ç–∏, –∫–∞—Ç–µ–≥–æ—Ä–∏—é + –ø–æ—Å–º–æ—Ç—Ä–∏–º, –ø—Ä–µ–¥–ª–æ–∂–∞—Ç –ª–∏ –µ–º—É –∏–∑-–∑–∞ —Ñ–ª–∞–∂–∫–∞ –¥–µ—Ç—Å–∫–∏–π –∫–æ–Ω—Ç–µ–Ω—Ç
* <font size='4'>–∂–µ–Ω—â–∏–Ω–∞ —Å—Ä–µ–¥–Ω–∏—Ö –ª–µ—Ç –±–µ–∑ –¥–µ—Ç–µ–π —Å–æ —Å—Ä–µ–¥–Ω–∏–º –¥–æ—Å—Ç–∞—Ç–∫–æ–º 40-60–∫, –∫–æ—Ç–æ—Ä–∞—è –∏—Å–∫–ª—é—á–∏—Ç–µ–ª—å–Ω–æ –∑–∞–Ω–∏–º–∞–µ—Ç—Å—è —Å–ø–æ—Ä—Ç–æ–º –ø–æ —Ñ–∏—Ç–Ω–µ—Å-—Ä–æ–ª–∏–∫–∞–º.</font>
    * –ò–Ω—Ç–µ—Ä–µ—Å–Ω–æ –ø–æ—Å–º–æ—Ç—Ä–µ—Ç—å —Å–ø–µ–∫—Ç—Ä —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–π –¥–ª—è —Ç–∞–∫–æ–≥–æ –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è

In [638]:
avatars_users

Unnamed: 0,user_id,age,income,sex,kids_flg
0,-6,age_55_64,income_0_20,–ú,0
1,-66,age_18_24,income_150_inf,–ú,1
2,-666,age_35_44,income_40_60,–ñ,0


In [616]:
avatars_interactions.merge(items[["item_id", "title", "genres"]], how = "left", on = "item_id")

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight,title,genres
0,-6,8464,2021-05-19,3297,78.0,4,–î–µ–Ω—å —Ä–æ–∂–¥–µ–Ω–∏—è –ê–ª–∏—Å—ã,"—Ä—É—Å—Å–∫–∏–µ –º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã, —Ä—É—Å—Å–∫–∏–µ, –¥–ª—è –¥–µ—Ç–µ–π, –ø–æ–ª–Ω–æ–º–µ—Ç—Ä–∞–∂–Ω—ã–µ"
1,-6,11864,2021-05-18,9572,9.0,1,–ù–∞—Ä—É—Ç–æ 7: –ü–æ—Ç–µ—Ä—è–Ω–Ω–∞—è –±–∞—à–Ω—è,"–∞–Ω–∏–º–µ, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
2,-6,2378,2021-07-01,7689,99.0,5,–†–æ–±–æ–∫–∞—Ä –ü–æ–ª–∏. –ü—Ä–∞–≤–∏–ª–∞ –¥–æ—Ä–æ–∂–Ω–æ–≥–æ –¥–≤–∏–∂–µ–Ω–∏—è,"–∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, –¥–ª—è –¥–µ—Ç–µ–π, —Å–µ—Ä–∏–∞–ª—ã, –∑–∞–ø–∞–¥–Ω—ã–µ –º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã"
3,-6,6138,2021-07-02,15,38.0,3,–ê—Å—Ç—Ä–∞–ª–∏—É–º,"–¥—Ä–∞–º—ã, –º—É–ª—å—Ç—Ñ–∏–ª—å–º"
4,-66,6096,2021-04-12,2263,1.0,1,–ê–º–µ—Ä–∏–∫–∞–Ω—Å–∫–∏–π –º–æ–Ω—Å—Ç—Ä,–Ω–∞—É—á–Ω–æ-–ø–æ–ø—É–ª—è—Ä–Ω—ã–µ
5,-66,11222,2021-04-08,4593,9.0,1,–õ—å–≤—ã. –Æ–∂–Ω–∞—è –ê—Ñ—Ä–∏–∫–∞,–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω–æ–µ
6,-66,7132,2021-08-15,13,46.0,3,[4–ö] –í—ã—Ö–æ–¥ –∫ –¢–∏—Ö–æ–º—É –æ–∫–µ–∞–Ω—É,–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω–æ–µ
7,-66,2812,2021-06-05,1252,89.0,5,–ù—è–Ω–∏,–¥–ª—è –≤–∑—Ä–æ—Å–ª—ã—Ö
8,-66,14315,2021-07-02,3159,3.0,1,–ù–µ—á–µ—Å—Ç–∏–≤—ã–µ,—É–∂–∞—Å—ã
9,-666,2888,2021-07-27,205,96.0,5,–†–∞—Ü–∏–æ–Ω –Ω–∞ 1700 –∫–∫–∞–ª,—Ñ–∏—Ç–Ω–µ—Å


In [602]:
def get_features(users: pd.DataFrame, items: pd.DataFrame, for_hot=True):

    # Generate user/item features for hot users, i.e. syncronise with interactions
    if for_hot:
        users = users.loc[users[Columns.User].isin(interactions[Columns.User])].copy()
    items = items.loc[items[Columns.Item].isin(interactions[Columns.Item])].copy()

    # Deal with user features

    users.fillna("Unknown", inplace=True)
    users = users.astype({"kids_flg": bool})

    user_features_frames = []
    for feature in ["sex", "age", "income", "kids_flg"]:
        feature_frame = users.reindex(columns=[Columns.User, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        user_features_frames.append(feature_frame)

    user_features = pd.concat(user_features_frames)

    # Deal with item features
    # Genre

    items["genre"] = (
        items["genres"].str.lower().str.replace(", ", ",", regex=False).str.split(",")
    )
    genre_feature = items[["item_id", "genre"]].explode("genre")
    genre_feature.columns = ["id", "value"]
    genre_feature["feature"] = "genre"
    genre_feature.head()

    # Release year

    items["release_year"].fillna(int(items["release_year"].max()), inplace=True)
    items = items.astype({"release_year": int})

    year_from = 1977
    step = 5
    bins = [year for year in range(year_from, items["release_year"].max() + step, step)]

    bins_bias = [item + 1 for item in bins]
    pairs_strict = list(zip(bins_bias, bins[1:]))
    # Add the most first release year
    bins = [items["release_year"].min()] + bins
    # Add interval for it
    pairs_strict = [(items["release_year"].min(), bins[1])] + pairs_strict
    # Generate feature values
    labels = [f"year_{item[0]}_{item[1]}" for item in pairs_strict]

    year_bins = pd.cut(
        items["release_year"], bins=bins, labels=labels, include_lowest=True
    )

    items["release_year"] = year_bins.astype(str)

    # Age rating

    items["age_rating"].fillna(0, inplace=True)
    items = items.astype({"age_rating": int})

    # For kids

    items.loc[items["age_rating"] > 12, "for_kids"] = 0
    items.loc[items["age_rating"] <= 12, "for_kids"] = 1
    items = items.astype({"for_kids": bool})

    item_features_frames = []
    for feature in ["content_type", "release_year", "age_rating", "for_kids"]:
        feature_frame = items.reindex(columns=[Columns.Item, feature])
        feature_frame.columns = ["id", "value"]
        feature_frame["feature"] = feature
        item_features_frames.append(feature_frame)
    item_features_frames.append(genre_feature)
    item_features = pd.concat(item_features_frames)
    headtail(item_features)

    return user_features, item_features

In [617]:
users = pd.concat([users, avatars_users], ignore_index=True)

In [618]:
users.tail()

Unnamed: 0,user_id,age,income,sex,kids_flg
840198,-66,age_18_24,income_150_inf,–ú,1
840199,-666,age_35_44,income_40_60,–ñ,0
840200,-6,age_55_64,income_0_20,–ú,0
840201,-66,age_18_24,income_150_inf,–ú,1
840202,-666,age_35_44,income_40_60,–ñ,0


In [619]:
users_features, items_features = get_features(users, items)

In [620]:
interactions = pd.concat([interactions, avatars_interactions], ignore_index=True)

In [622]:
interactions.drop("last_watch_dt", axis=1, inplace=True)

In [623]:
interactions.tail()

Unnamed: 0,user_id,item_id,datetime,total_dur,watched_pct,weight
5476272,-66,14315,2021-07-02,3159,3.0,1
5476273,-666,2888,2021-07-27,205,96.0,5
5476274,-666,8565,2021-05-26,10748,23.0,2
5476275,-666,15465,2021-08-06,5472,85.0,5
5476276,-666,14957,2021-08-05,4158,67.0,4


In [624]:
%%time
dataset = Dataset.construct(
    interactions_df=interactions,
    user_features_df=users_features,
    cat_user_features=["sex", "age", "income", "kids_flg"],
    item_features_df=items_features,
    cat_item_features=[
        "genre",
        "content_type",
        "release_year",
        "for_kids",
        "age_rating",
    ],
)

CPU times: user 2.33 s, sys: 30.9 ms, total: 2.36 s
Wall time: 2.36 s


–û–±—É—á–∞–µ–º:

In [625]:
K_RECOS = 10
RANDOM_STATE = 42
NUM_THREADS = 16
N_FACTORS = 32
N_EPOCHS = 1 # Lightfm
USER_ALPHA = 0 # Lightfm
ITEM_ALPHA = 0 # Lightfm
LEARNING_RATE = 0.05 # Lightfm

In [626]:
model = LightFMWrapperModel(
            LightFM(
                no_components=N_FACTORS,
                loss='warp',
                random_state=RANDOM_STATE,
                learning_rate=LEARNING_RATE,
                user_alpha=USER_ALPHA,
                item_alpha=ITEM_ALPHA,
            ),
            epochs=N_EPOCHS,
            num_threads=NUM_THREADS,
        )

In [627]:
model.fit(dataset)

<rectools.models.lightfm.LightFMWrapperModel at 0x7f8ab3ef11c0>

In [632]:
model.recommend([-6, -66, -666], dataset=dataset, filter_viewed=True, k=10)\
    .merge(items[["item_id", "title", "release_year", "genres"]], how="left", on="item_id")

Unnamed: 0,user_id,item_id,score,rank,title,release_year,genres
0,-6,15465,-296.16209,1,–¢–∞–∫—Å–∏ –¥–ª—è –ê–Ω–≥–µ–ª–∞,2007.0,"—Ä—É—Å—Å–∫–∏–µ, –¥–µ—Ç–µ–∫—Ç–∏–≤—ã"
1,-6,16227,-302.016142,2,"–í–ø–µ—Ä–µ–¥, –ê—Å—Ç—Ä–æ–±–æ–π!",2019.0,"–∞–Ω–∏–º–µ, —Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è, –∑–∞—Ä—É–±–µ–∂–Ω—ã–µ, —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –∑–∞–ø–∞–¥–Ω—ã–µ –º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã"
2,-6,7303,-302.087509,3,–ú–µ—Ç–∞–ª–∏–æ–Ω—ã,2018.0,"–º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è, —Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –±–æ–µ–≤–∏–∫–∏, —Ñ—ç–Ω—Ç–µ–∑–∏"
3,-6,12741,-302.312172,4,–®–∏–º–º–µ—Ä –∏ –®–∞–π–Ω,2016.0,"–º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, —Ñ—ç–Ω—Ç–µ–∑–∏, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
4,-6,9936,-302.33386,5,–†–µ–π –∏ –ø–æ–∂–∞—Ä–Ω—ã–π –ø–∞—Ç—Ä—É–ª—å,2016.0,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
5,-6,15084,-302.342264,6,–°—É–ø–µ—Ä –ó–∞–∫,2019.0,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
6,-6,12512,-302.352851,7,–ò–≥—Ä—ã —Å –ô–æ–∫–æ,2018.0,"–º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
7,-6,14729,-302.400245,8,–ê—Ç–ª–æ–Ω—ã,2019.0,"–±–æ–µ–≤–∏–∫–∏, –º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
8,-6,13271,-302.426804,9,–ì–æ—Ä–º–∏—Ç–∏,2018.0,"–º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, —Ñ—ç–Ω—Ç–µ–∑–∏, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
9,-6,2059,-302.432776,10,–ú–æ–Ω–∫–∞—Ä—Ç,2017.0,"—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞, –º—É–ª—å—Ç—Å–µ—Ä–∏–∞–ª—ã, —Ñ—ç–Ω—Ç–µ–∑–∏, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è"
