# Туториал по двухэтапной модели

In [1]:
import datetime
import dill
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import pyarrow.parquet as pq
from zipfile import ZipFile
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import warnings
warnings.simplefilter('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)

In [2]:
train = pd.read_csv('../input/hack-the-cart/hist_data.csv')
print(train.shape)
train.sample(10)

(4529889, 8)


Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods,weight
4211162,96056710,98518487221,2021-08-27 14:53:57,210909485,1.0,99.99,False,16.36
3764669,95211810,98516989715,2021-08-20 18:31:06,203370019,9.4,179.9,True,24.75
2324309,94989600,98513078345,2021-08-01 06:38:20,205102516,1.0,369.49,False,20.27
1938813,94861727,98512092861,2021-07-26 17:03:28,202880262,0.994,57.495,True,11.9
4521696,94989802,98520608711,2021-09-05 09:35:24,203388161,1.0,301.99,False,29.83
4064970,95584821,98517951126,2021-08-25 11:49:13,203382198,2.0,64.99,False,19.78
1324233,95095759,98510445731,2021-07-18 13:03:24,203370019,5.246,644.85,True,43.31
1051546,95155977,98509712998,2021-07-15 06:21:44,202808349,1.0,44.99,False,10.86
4203023,95695687,98518458718,2021-08-27 13:13:24,202820148,0.948,42.82,True,17.08
1096150,94616960,98509823053,2021-07-15 15:01:18,202807138,2.0,241.49,False,13.56


In [3]:
test = pd.read_csv('../input/hack-the-cart/test.csv')
print(test.shape)
test.sample(10)

(1081420, 7)


Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods
714794,96003592,98514333353,2021-08-07 13:22:02,202808298,1.0,66.99,False
914813,95101141,98514525578,2021-08-08 12:31:11,203015414,1.0,128.39,False
703176,95523965,4620152007,2021-07-21 11:34:20,211072597,4.0,14.99,False
772461,96072065,98516481579,2021-08-18 12:19:09,203566523,1.0,299.99,False
509393,95497745,98520656098,2021-09-05 11:48:14,204257910,1.0,60.31,False
237451,95367942,98519422221,2021-08-31 14:41:10,202854572,0.964,27.495,True
487869,95384538,98519845218,2021-09-02 13:16:56,203455264,1.0,44.99,False
951638,95200201,98520597413,2021-09-05 08:57:41,202838185,1.0,51.99,False
348142,95234467,98517709392,2021-08-24 10:27:00,203425610,1.0,138.32,False
142406,94920918,98515121977,2021-08-11 15:04:56,205781292,1.0,47.99,False


In [4]:
# test.groupby(['buyer_id', 'pav_order_id'])['item_id'].count()

## Загрузим данные

# 🎛 EDA

## `interactions`: взаимодействия пользователь - айтем 
- с датой `last_watch_dt`
- длительностью просмотра `total_dur`
- % просмотра `watched_pct`

In [5]:
def get_interactions():

    interactions = pd.read_csv('../input/hack-the-cart/hist_data.csv')
    interactions.rename(columns={'buyer_id': 'user_id',
                                 'count' : 'product_quantity', 'created' : 'completed_at'} , inplace=True)
    nes_cols = ['user_id', 'item_id', 'completed_at','product_quantity']
    interactions = interactions[nes_cols].groupby(['user_id', 'item_id'], as_index=False).agg({'completed_at' : 'max', 
                                                                                   'product_quantity': 'sum'})
    interactions['user_id'] = interactions.user_id.astype(int)
    interactions['item_id'] = interactions.item_id.astype(int)
    
    
    interactions.to_csv('interactions.csv')
    return interactions
    
interactions = get_interactions()

In [6]:
interactions

Unnamed: 0,user_id,item_id,completed_at,product_quantity
0,94578442,202795926,2021-08-08 09:58:03,1.0
1,94578442,202801709,2021-08-08 09:58:03,3.0
2,94578442,202806979,2021-07-15 17:37:29,2.0
3,94578442,202806998,2021-07-15 17:37:29,4.0
4,94578442,202807303,2021-07-02 17:20:51,1.0
...,...,...,...,...
3644161,96400990,203445463,2021-09-07 10:00:40,1.0
3644162,96400990,203457302,2021-09-07 10:00:40,1.0
3644163,96400990,203581478,2021-09-07 10:00:40,1.0
3644164,96400990,205797303,2021-09-07 10:00:40,1.0


In [7]:
# # обработка даты
# interactions['last_watch_dt'] = pd.to_datetime(interactions['completed_at']).map(lambda x: x.date())

print(f"Уникальных юзеров в interactions: {interactions['user_id'].nunique():_}")
print(f"Уникальных айтемов в interactions: {interactions['item_id'].nunique():_}")

Уникальных юзеров в interactions: 63_925
Уникальных айтемов в interactions: 52_472


In [8]:
max_date = interactions['completed_at'].max()
min_date = interactions['completed_at'].min()

print(f"min дата в interactions: {min_date}")
print(f"max дата в interactions: {max_date}")

min дата в interactions: 2021-07-01 00:03:44
max дата в interactions: 2021-09-07 18:48:29


## `users`: данные о пользователях

- `age` бин по возрасту 
- `income` бин по доходу 
- `sex` пол 
- `kids_flg` флаг наличия детей

Все признаки - результат предсказания соцдем моделей

In [9]:
import gc

gc.collect()

84

## `items`: данные об айтемах

- `content_type` - тип контента
- `title` - название на русском
- `title_orig` - название оригинальное
- `release_year` - год выпуска
- `countries` - страны
- `for_kids` - флаг контент для детей
- `age_rating`- Возрастной рейтинг
- `studios` - студии
- `directors` - режиссеры
- `actors`- актеры
- `keywords` - ключевые слова 
- `description` - описание

In [10]:
# items = pq.read_table('../input/sbermarket-competition-intel/clusters.parquet', use_threads=False)
# items = items.to_pandas()
# print(f"Уникальных айтемов в items {items.shape[0]:_}")
# items.head()

# Агрегация и фильтрация кластеров

Уберем повторы, будет считать что продукт это фильм, просмотренный раз

# ✂️ Схема валидации

## Глобальный train - test

На `test` будем проверять результат обоих моделей:
-  отдельно модели 1го уровня: LightFM
- отдельно двухуровненой модели: LightFM + Catboost

На test оставим 7 дней

In [11]:
train = interactions#[(interactions['completed_at'] < max_date - pd.Timedelta(days=7))]
print('Трейн готов')
# test = interactions[(interactions['completed_at'] >= max_date - pd.Timedelta(days=7))]

# дополнительная фильтрация train для исключения случайных просмотров 
# train = train[train['total_dur'] >= 300]

del interactions
print(f"train: {train.shape}")
# print(f"test: {test.shape}")

Трейн готов
train: (3644166, 4)


# 1️⃣ 1 этап: LightFM

- обучаем LightFM user2item модель на LighFM train

- делаем предикт c кандидатами на LighFM predict



## Разделим данные для LightFM еще на 2 части

### train + pred for candidates 

Разделим данные на `lfm_train` и `lfm_pred` как 60%-40% по квантилю даты просмотра 

(Считаем, что гиперпараметры оптимальной lfm модели мы уже подобрали)

На `lfm_pred` будем в дальнейшем обучать бустинг - модель 2 этапа

In [12]:
lfm_date_threshold = train['completed_at'].quantile(q=0.6, interpolation='nearest')
lfm_date_threshold

'2021-08-08 12:50:58'

In [13]:
lfm_train = train#[(train['completed_at'] < lfm_date_threshold)]
lfm_pred = train[(train['completed_at'] >= lfm_date_threshold)]

print(f"lfm_train: {lfm_train.shape}")
print(f"lfm_pred: {lfm_pred.shape}")

lfm_train: (3644166, 4)
lfm_pred: (1457679, 4)


Будем предсказывать кандидатов только на теплых пользователях - у которых есть просмотры в обучающей выборке

In [14]:
lfm_pred = lfm_pred[lfm_pred['user_id'].isin(lfm_train['user_id'].unique())]

In [15]:
lfm_train.head(3)

Unnamed: 0,user_id,item_id,completed_at,product_quantity
0,94578442,202795926,2021-08-08 09:58:03,1.0
1,94578442,202801709,2021-08-08 09:58:03,3.0
2,94578442,202806979,2021-07-15 17:37:29,2.0


## Обучение LightFM

In [16]:
from lightfm.data import Dataset
from lightfm import LightFM

In [17]:
dataset = Dataset()
dataset.fit(lfm_train['user_id'].unique().astype(int), lfm_train['item_id'].unique().astype(float))


# user / item mappings
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'items_mapping': lightfm_mapping[2],
}

# инвертированные словарь
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

print(f"users_mapping amount: {len(lightfm_mapping['users_mapping'])}")
print(f"items_mapping amount: {len(lightfm_mapping['items_mapping'])}")

import pickle

a_file = open("data.pkl", "wb")
pickle.dump(lightfm_mapping, a_file)

# a_file = open("data.pkl", "rb")
# lightfm_mapping = pickle.load(a_file)
# print(lightfm_mapping)

users_mapping amount: 63925
items_mapping amount: 52472


In [18]:
# matrix for training
interactions_matrix, weights_matrix = dataset.build_interactions(
    zip(*lfm_train[['user_id', 'item_id', 'product_quantity']].values.astype(int).T)
)

weights_matrix_csr = weights_matrix.tocsr()

In [19]:
lfm_model = LightFM(
    no_components=64, 
    learning_rate=0.1, 
    loss='warp', 
    max_sampled=5, 
    random_state=42
)

In [20]:
# 💪🏼 train model

num_epochs = 25 # лучше 10 - 20

for _ in tqdm(range(num_epochs)):
    lfm_model.fit_partial(
        weights_matrix_csr
    )

  0%|          | 0/25 [00:00<?, ?it/s]

In [21]:
# save model  
with open(f"lfm_model.dill", 'wb') as f:
    dill.dump(lfm_model, f)

## Генерируем предсказания LightFM как кандидатов для второго этапа

In [22]:
# пустой датафрейм для предсказания
candidates = pd.DataFrame({
    'user_id': lfm_pred['user_id'].unique()
}, dtype=int)

candidates.head(3)

Unnamed: 0,user_id
0,94578533
1,94578595
2,94578658


### В `tools` вынесены функции из лекций первого курса Your First Recsys  <a href="https://www.kaggle.com/sharthz23/implicit-lightfm">Код лекций</a>

`generate_lightfm_recs_mapper` - функция для генерации LightFM предсказаний по всем пользователям с учетом удаления просмотренных айтемов

`compute_metrics` - функция для расчета классических метрик для рекомендаций (будем использовать в конце)

In [23]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, user_features, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user = None, known_items = known_items):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, user_features=user_features,
                             item_features=item_features, num_threads=num_threads)
        
        additional_N = len(known_items[user_id]) if user_id in known_items else 0
        total_N = N + additional_N
        top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]
        
        final_recs = [item_inv_mapping[item] for item in top_cols]
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return final_recs[:N]
    return _recs_mapper

In [24]:
# кол-во кандидатов 
top_N = 200 # тут сколько хочешь 

# вспомогательные данные 
all_cols = list(lightfm_mapping['items_mapping'].values())

mapper = generate_lightfm_recs_mapper(
    lfm_model, 
    item_ids=all_cols, 
    known_items=dict(),
    N=top_N,
    user_features=None, 
    item_features=None, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=20
)

In [25]:
from tqdm import tqdm
tqdm.pandas()  # <- added this line

In [26]:
# candidates['user_id'][0]
# lightfm_mapping['users_mapping']

In [27]:
# генерируем предказания
candidates['item_id'] = candidates['user_id'].progress_map(mapper)
# candidates['item_id'] = candidates['user_id'].map(mapper)
candidates = candidates.explode('item_id')
candidates['rank'] = candidates.groupby('user_id').cumcount() + 1 

candidates.head()

100%|██████████| 39359/39359 [05:41<00:00, 115.09it/s]


Unnamed: 0,user_id,item_id,rank
0,94578533,202914232.0,1
0,94578533,202807115.0,2
0,94578533,203566436.0,3
0,94578533,202820287.0,4
0,94578533,203455576.0,5


In [28]:
candidates.to_csv('candidates_stage_1.csv', index=False)

# Предсказания для submisionm

In [29]:
# пустой датафрейм для предсказания
candidates = pd.DataFrame({
    'user_id': lfm_pred['user_id'].unique()
}, dtype=int)

candidates.head(3)

Unnamed: 0,user_id
0,94578533
1,94578595
2,94578658


In [30]:
test = pd.read_csv('../input/hack-the-cart/test.csv')
print(test.shape)
test.sample(10)

(1081420, 7)


Unnamed: 0,buyer_id,pav_order_id,created,item_id,count,price_sold,flag_weight_goods
638826,96345915,98520531300,2021-09-04 20:18:43,203400060,1.0,59.99,False
747467,96353895,98519573884,2021-09-01 10:36:27,202838754,1.096,44.995,True
159960,95079563,98519928394,2021-09-02 18:08:21,202991914,1.0,89.89,False
84643,94734210,98520916736,2021-09-06 11:52:30,202797999,2.0,59.99,False
564660,95142943,98515486677,2021-08-13 11:58:29,202894145,1.0,99.99,False
149370,94708158,98512875200,2021-07-30 23:49:18,202953901,0.514,34.995,True
715634,94652201,98519772455,2021-09-02 08:54:01,205737852,4.0,29.99,False
376469,95237660,4620188004,2021-08-16 10:32:28,202807180,2.0,107.19,False
196265,95211137,98518139693,2021-08-26 08:24:33,205952431,1.0,188.49,False
103919,94773674,98520501303,2021-09-04 18:15:01,204021171,1.0,110.99,False


In [31]:
to_list = lambda x: x.to_list()

test_tmp = test.groupby(['buyer_id', 'pav_order_id'], as_index=False).agg(known_items = ('item_id', to_list))
test_tmp

Unnamed: 0,buyer_id,pav_order_id,known_items
0,94578440,98520921566,"[203474864, 2028..."
1,94578442,98519811873,"[203235251, 2035..."
2,94578454,98518335363,"[204083530, 2034..."
3,94578475,98520998296,"[203566146, 2035..."
4,94578533,98519408284,"[213903767, 2033..."
...,...,...,...
80239,96405796,98521263508,"[202820143, 2034..."
80240,96406017,98521269190,"[203439307, 2028..."
80241,96406151,98521271961,"[203469498, 2029..."
80242,96406233,98521274492,"[202944500, 2028..."


In [32]:
top_20 = train.item_id.value_counts().index[:20].values

In [33]:
def mapper_with_cold_start(user_id=None, known_items=[]):
    
    if user_id in lightfm_mapping['users_mapping'].keys():
        return mapper(user_id, known_items = known_items)
        
    return top_20
    

In [34]:
sample_submission = test_tmp #pd.read_csv('../input/hack-the-cart/test.csv')
sample_submission.rename(columns = {'buyer_id':'user_id'}, inplace=True)
sample_submission['user_id'] = sample_submission['user_id'].values.astype(int)
# генерируем предказания


# sample_submission['item_id'] = sample_submission['user_id'].progress_map(mapper)
sample_submission['item_id'] = sample_submission.progress_apply(lambda x: mapper_with_cold_start(x['user_id']), axis=1) # , x['known_items']
# sample_submission['item_id'] = sample_submission.progress_apply(mapper_with_cold_start, user_id = user_id)
sample_submission = sample_submission.explode('item_id')
sample_submission['rank'] = sample_submission.groupby('user_id').cumcount() + 1 
sample_submission.to_csv('sub_lfm.csv', index=False)
sample_submission.head()


100%|██████████| 80244/80244 [09:24<00:00, 142.24it/s] 


Unnamed: 0,user_id,pav_order_id,known_items,item_id,rank
0,94578440,98520921566,"[203474864, 2028...",202820148,1
0,94578440,98520921566,"[203474864, 2028...",202872237,2
0,94578440,98520921566,"[203474864, 2028...",202809628,3
0,94578440,98520921566,"[203474864, 2028...",202862432,4
0,94578440,98520921566,"[203474864, 2028...",203404725,5


In [35]:
sample_submission.sample(20)

Unnamed: 0,user_id,pav_order_id,known_items,item_id,rank
66601,96002122,98511991221,"[203419976, 2028...",202794687.0,107
25360,94999054,4620196408,"[205781250, 2034...",210734379.0,46
47401,95417616,98515512261,"[203121858, 2040...",203475722.0,123
67862,96036435,4620186819,"[203446118, 2041...",202967706.0,147
55994,95649831,98521250393,"[217826155, 2034...",202807359.0,191
2738,94630138,98517229092,"[202880254, 2028...",211072423.0,62
44571,95340053,98511020240,"[203408409, 2028...",202820148.0,1
20384,94913112,98520727585,"[202807298, 2033...",202884489.0,18
33108,95125580,98519051612,"[202794649, 2028...",203566122.0,195
76650,96281870,98520777900,"[204033776, 2034...",203425095.0,37


In [36]:
to_list = lambda x: x.to_list()[:20]

sample_submission.groupby('pav_order_id', as_index=False).agg(preds = ('item_id', to_list)).to_csv('tmp_submission.csv', index=False)

In [37]:
pd.read_csv('../input/hack-the-cart/sample_submission.csv').head()

Unnamed: 0,pav_order_id,preds
0,4620121489,"[202820148, 2028..."
1,4620121505,"[202820148, 2028..."
2,4620121594,"[202820148, 2028..."
3,4620121684,"[202820148, 2028..."
4,4620121902,"[202820148, 2034..."
