In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm
import lightgbm as lgb
import pickle as pkl
from pandas.api.types import CategoricalDtype
import os

import gc
gc.enable()

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
user_region = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/user_region.csv')
user_age = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/user_age.csv', usecols=['row', 'data'])
item_subclass = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/item_subclass.csv')
item_price = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/item_price.csv', usecols=['row', 'data'])
item_asset = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/item_asset.csv', usecols=['row', 'data'])
interactions = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Archive/interactions.csv')

In [None]:
assert user_region.isnull().values.any() == False
assert user_age.isnull().values.any() == False
assert item_subclass.isnull().values.any() == False
assert item_price.isnull().values.any() == False
assert item_asset.isnull().values.any() == False
assert interactions.isnull().values.any() == False

# Предобработка

### Собираем датасет для продуктов

In [None]:
set(item_subclass.data)

{1.0}

In [None]:
item_subclass.drop(columns=['data'], inplace=True)

In [None]:
item_subclass.rename(columns={'col':'product_category'}, inplace=True)
item_subclass['product_category'] = item_subclass['product_category'].astype('category')
item_subclass.head()

Unnamed: 0,row,product_category
0,0,679
1,1,1376
2,2,1495
3,3,502
4,4,502


In [None]:
item_price.rename(columns={'data':'price'}, inplace=True)
item_price.head()

Unnamed: 0,row,price
0,0,0.012911
1,1,0.005211
2,2,0.004131
3,3,0.003521
4,4,0.003521


In [None]:
item_asset.rename(columns={'data':'asset'}, inplace=True)
item_asset.head()

Unnamed: 0,row,asset
0,0,0.009497
1,1,0.004226
2,2,0.003371
3,3,0.002991
4,4,0.002991


In [None]:
print(item_subclass.shape)
print(item_price.shape)
print(item_asset.shape)

(18495, 2)
(18493, 2)
(18490, 2)


In [None]:
prod_feats = item_subclass.merge(item_price, how='outer', on='row').merge(item_asset, how='outer', on='row')
prod_feats.rename(columns={'row':'product_id'}, inplace=True)

In [None]:
prod_feats.head()

Unnamed: 0,product_id,product_category,price,asset
0,0,679,0.012911,0.009497
1,1,1376,0.005211,0.004226
2,2,1495,0.004131,0.003371
3,3,502,0.003521,0.002991
4,4,502,0.003521,0.002991


In [None]:
prod_feats_without_nan = prod_feats[(~prod_feats['price'].isna()) & (~prod_feats['asset'].isna())]

In [None]:
N_N = 5

In [None]:
for ind in prod_feats[prod_feats['price'].isna()].index:
    categ = prod_feats.loc[ind, 'product_category']
    asset = prod_feats.loc[ind, 'asset']
    
    temp_df = prod_feats_without_nan[(prod_feats_without_nan['product_category'] == categ)]
    
    n_neighbors = min(N_N, temp_df.shape[0])
    
    nn = NearestNeighbors(n_neighbors)
    nn.fit(temp_df['asset'].values.reshape(-1,1))
    
    _, neighbors_inds = nn.kneighbors(np.reshape([asset], (1, 1)))
    
    prod_feats.loc[ind, 'price'] = temp_df.values[neighbors_inds, -2].mean()

In [None]:
for ind in prod_feats[prod_feats['asset'].isna()].index:
    categ = prod_feats.loc[ind, 'product_category']
    price = prod_feats.loc[ind, 'price']
    
    temp_df = prod_feats_without_nan[(prod_feats_without_nan['product_category'] == categ)]
    
    n_neighbors = min(N_N, temp_df.shape[0])
    
    nn = NearestNeighbors(n_neighbors)
    nn.fit(temp_df['price'].values.reshape(-1,1))
    
    _, neighbors_inds = nn.kneighbors(np.reshape([price], (1, 1)))
    
    prod_feats.loc[ind, 'asset'] = temp_df.values[neighbors_inds, -1].mean()

### Собираем датасет для пользователей

In [None]:
set(user_region.data)

{1.0}

In [None]:
user_region.drop(columns=['data'], inplace=True)

In [None]:
user_region.rename(columns={'col':'user_region'}, inplace=True)
user_region.head()

Unnamed: 0,row,user_region
0,0,6
1,0,0
2,1,7
3,1,0
4,2,5


In [None]:
set(user_age.data)

{1.0}

In [None]:
user_age.drop(columns=['data'], inplace=True)

In [None]:
user_age.head()

Unnamed: 0,row
0,2
1,7
2,8
3,10
4,12


In [None]:
print(user_region.shape)
print(user_age.shape)

(26609, 2)
(30317, 1)


In [None]:
user_feats = user_age.merge(user_region, how='outer', on='row')
user_feats.rename(columns={'row':'user_id'}, inplace=True)

In [None]:
user_feats['user_region'].fillna(1, inplace=True)

In [None]:
user_feats['user_region'] = user_feats['user_region'].astype('category')

In [None]:
user_feats.head()

Unnamed: 0,user_id,user_region
0,2,5.0
1,7,6.0
2,8,1.0
3,10,7.0
4,12,7.0


### Выделяем обучающие, валидационные и тестовые заказы

In [None]:
set(interactions.data)

{1.0}

In [None]:
interactions.rename(columns={'row':'user_id', 'col':'product_id', 'data':'label'}, inplace=True)

In [None]:
np.random.seed(432)
train_indexes = np.random.choice(interactions.index, int(len(interactions.index)*0.7), replace=False)
val_train_indexes = list(filter(lambda x: x not in train_indexes, interactions.index))

In [None]:
val_indexes = val_train_indexes[:int(len(val_train_indexes)*0.5)]
test_indexes = val_train_indexes[int(len(val_train_indexes)*0.5):]

In [None]:
interactions_train = interactions.loc[train_indexes].reset_index(drop=True)
interactions_val = interactions.loc[val_indexes].reset_index(drop=True)
interactions_test = interactions.loc[test_indexes].reset_index(drop=True)

In [None]:
interactions_train.head()

Unnamed: 0,user_id,product_id,label
0,9248,4635,1.0
1,10279,1675,1.0
2,9716,6639,1.0
3,7964,14718,1.0
4,1692,8956,1.0


## Генерируем признаки

In [None]:
user_prod_train = interactions_train.merge(user_feats, how='inner', on='user_id').merge(prod_feats, how='inner', on='product_id')

In [None]:
user_prod_train.head()

Unnamed: 0,user_id,product_id,label,user_region,product_category,price,asset
0,9248,4635,1.0,7.0,1536,0.000986,0.000902
1,5514,4635,1.0,6.0,1536,0.000986,0.000902
2,5640,4635,1.0,7.0,1536,0.000986,0.000902
3,8187,4635,1.0,3.0,1536,0.000986,0.000902
4,22065,4635,1.0,6.0,1536,0.000986,0.000902


### Пользователь

Полное число покупок пользователя

In [None]:
user_total_buy = user_prod_train.groupby('user_id')['user_id'].count().to_frame('user_total_buy')
user_total_buy = user_total_buy.reset_index()
user_total_buy.head()

Число различных категорий, которые покупал пользователь

In [None]:
user_dif_cat = user_prod_train.groupby('user_id')['product_category'].nunique().to_frame('user_dif_cat')
user_dif_cat = user_dif_cat.reset_index()
user_dif_cat.head()

Unnamed: 0,user_id,user_dif_cat
0,0,2
1,1,5
2,2,7
3,3,1
4,4,5


### Холодный старт для пользователей

In [None]:
all_users = user_feats.merge(user_total_buy, how='outer', on='user_id').merge(user_dif_cat, how='outer', on='user_id')

In [None]:
all_users_without_nan = all_users[(~all_users['user_total_buy'].isna()) & (~all_users['user_dif_cat'].isna())]

In [None]:
for ind in tqdm(all_users[all_users['user_total_buy'].isna()].index, position=0):
    region = all_users.loc[ind, 'user_region']
    
    temp_df = all_users_without_nan[(all_users_without_nan['user_region'] == region)]
    
    all_users.loc[ind, 'user_total_buy'] = int(temp_df.loc[:, 'user_total_buy'].mean())
    all_users.loc[ind, 'user_dif_cat'] = int(temp_df.loc[:, 'user_dif_cat'].mean())

0it [00:00, ?it/s]


In [None]:
for ind in tqdm(all_users[all_users['user_dif_cat'].isna()].index, position=0):
    region = all_users.loc[ind, 'user_region']
    
    temp_df = all_users_without_nan[(all_users_without_nan['user_region'] == region)]
    
    all_users.loc[ind, 'user_dif_cat'] = int(temp_df.loc[:, 'user_dif_cat'].mean())

100%|██████████| 1/1 [00:00<00:00, 145.10it/s]


In [None]:
all_users.drop_duplicates(subset=['user_id'], inplace=True)

In [None]:
all_users.to_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/all_users.csv', index=False)

### Продукт

In [None]:
product_total_buy = user_prod_train.groupby('product_id')['user_id'].count().to_frame('product_total_buy')
product_total_buy = product_total_buy.reset_index()
product_total_buy.head()

Unnamed: 0,product_id,product_total_buy
0,0,1
1,2,15
2,4,18
3,5,13
4,6,3


In [None]:
prod_dif_region = user_prod_train.groupby('product_id')['user_region'].nunique().to_frame('prod_dif_region')
prod_dif_region = prod_dif_region.reset_index()
prod_dif_region.head()

Unnamed: 0,product_id,prod_dif_region
0,0,1
1,2,5
2,4,6
3,5,4
4,6,2


### Холодный старт для продуктов

In [None]:
all_prods = prod_feats.merge(product_total_buy, how='outer', on='product_id').merge(prod_dif_region, how='outer', on='product_id')

In [None]:
all_prod_without_nan = all_prods[(~all_prods['product_total_buy'].isna()) & (~all_prods['prod_dif_region'].isna())]

In [None]:
N_N = 5

for ind in tqdm(all_prods[all_prods['product_total_buy'].isna()].index, position=0):
    categ = all_prods.loc[ind, 'product_category']
    price, asset = all_prods.loc[ind, 'price'], all_prods.loc[ind, 'asset']
    
    temp_df = all_prod_without_nan[(all_prod_without_nan['product_category'] == categ)]
    if temp_df.shape[0] == 0:
        temp_df = all_prod_without_nan
        
    n_neighbors = min(N_N, temp_df.shape[0])
    
    nn = NearestNeighbors(n_neighbors)
    nn.fit(temp_df[['price', 'asset']].values)
    
    _, neighbors_inds = nn.kneighbors(np.reshape([price, asset], (1, 2)))
    
    all_prods.loc[ind, 'product_total_buy'] = int(temp_df.values[neighbors_inds, -2].mean())
    all_prods.loc[ind, 'prod_dif_region'] = int(temp_df.values[neighbors_inds, -1].mean())

100%|██████████| 4038/4038 [00:17<00:00, 229.95it/s]


In [None]:
all_prods.drop_duplicates(subset=['product_id'], inplace=True)

In [None]:
all_prods.to_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/all_prods.csv', index=False)

### Мержим признаки

In [None]:
user_prod_train = interactions_train.merge(all_users, how='inner', on='user_id').merge(all_prods, how='inner', on='product_id')

In [None]:
val_arr = []
for user_id, group_u in tqdm(interactions_val.groupby('user_id'), position=0):
    val_arr.append([user_id, group_u['product_id'].values])

100%|██████████| 10234/10234 [00:02<00:00, 3566.54it/s]


In [None]:
val_df = pd.DataFrame(val_arr, columns=['user_id', 'bought_products']).merge(all_users, how='inner', on='user_id')

In [None]:
test_arr = []
for user_id, group_u in tqdm(interactions_test.groupby('user_id'), position=0):
    test_arr.append([user_id, group_u['product_id'].values])

100%|██████████| 12398/12398 [00:03<00:00, 3841.54it/s]


In [None]:
test_df = pd.DataFrame(test_arr, columns=['user_id', 'bought_products']).merge(all_users, how='inner', on='user_id')

In [None]:
val_df.to_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/val_df.csv', index=False)
test_df.to_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/test_df.csv', index=False)

## Генерируем отрицательные классы

In [None]:
np.random.seed(432)

negative_samples = []
for user_id, group_u in tqdm(user_prod_train.groupby('user_id'), position=0):
    
    product_list = list(set(group_u['product_id']))
    
    target_products = list(set(user_prod_train[(user_prod_train['user_id'] != user_id) & (~user_prod_train['product_id'].isin(product_list))]['product_id']))
    
    num_to_extract = min(len(product_list), len(target_products))
    
    negative_products = np.random.choice(target_products, num_to_extract, replace=False)
    
    for product in negative_products:
        negative_samples.append([user_id, product])

100%|██████████| 26184/26184 [21:54<00:00, 19.92it/s]


In [None]:
negative_user_prod_train = pd.DataFrame(negative_samples, columns=['user_id', 'product_id'])
negative_user_prod_train['label'] = 0

In [None]:
negative_user_prod_train = negative_user_prod_train.merge(all_users, how='inner', on='user_id').merge(all_prods, how='inner', on='product_id')

## Мержим датасеты и сохраняем

In [None]:
df = pd.concat((user_prod_train, negative_user_prod_train), axis=0).sample(frac=1)

In [None]:
df.to_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/train_dataset.csv', index=False)

# Обучение

In [23]:
train_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/train_dataset.csv')
val_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/val_df.csv')
test_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/test_df.csv')
all_prods = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/all_prods.csv')
all_users = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Generated_Data/all_users.csv')

train_df['user_region'] = train_df['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))
train_df['product_category'] = train_df['product_category'].astype('category')

val_df['user_region'] = val_df['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))
val_df['bought_products'] = val_df['bought_products'].apply(lambda x:np.fromstring(x.strip('[ ]'), dtype=int, sep=' '))

test_df['user_region'] = test_df['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))
test_df['bought_products'] = test_df['bought_products'].apply(lambda x:np.fromstring(x.strip('[ ]'), dtype=int, sep=' '))

all_prods['product_category'] = all_prods['product_category'].astype('category')

all_users['user_region'] = all_users['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))

In [3]:
feat_cols = ['user_region', 'product_category', 'price', 'asset', 'user_total_buy', 'user_dif_cat', 'product_total_buy', 'prod_dif_region', 'label']

X_train, y_train = train_df[feat_cols[:-1]], train_df[feat_cols[-1]]

In [4]:
train_df.head()

Unnamed: 0,user_id,product_id,label,user_region,user_total_buy,user_dif_cat,product_category,price,asset,product_total_buy,prod_dif_region
0,6210,5457,1.0,6.0,71.0,60.0,209,0.001643,0.001519,52.0,7.0
1,21635,12268,0.0,6.0,6.0,6.0,883,0.003568,0.002707,3.0,2.0
2,30896,16185,1.0,1.0,4.0,4.0,1944,0.011033,0.009687,159.0,8.0
3,4392,12941,0.0,1.0,24.0,21.0,196,0.012723,0.009212,6.0,2.0
4,14310,5103,0.0,6.0,50.0,45.0,119,0.003192,0.002849,4.0,3.0


In [5]:
def average_precision(bought_products, sorted_recommended_products, K):
    av_prec = 0
    recommends = list(map(lambda x: x in bought_products, sorted_recommended_products))
    precision = [sum(recommends[:k+1])/(k+1.) for k in range(K)]
    for i in range(K):
        av_prec += recommends[i]*precision[i]
        
    return av_prec/len(bought_products)

## Оптимизируем гиперпараметры на валидационной выборке по целевой метрике

In [None]:
gridParams = {
    'boosting_type': ['gbdt','dart'],
    'colsample_bytree': [0.6, 0.8],
    'learning_rate': [0.05],
    'max_depth' : [-1, 5],
    'min_child_samples': [20],
    'min_child_weight': [0.001],
    'min_split_gain': [0.5],
    'n_estimators': [100],
    'num_leaves': [16, 32],
    'reg_alpha': [0, 1.2],
    'reg_lambda': [0],
    'subsample': [0.8, 1],
    'subsample_for_bin': [200],
    'subsample_freq': [1, 2]
    }

In [1]:
import configparser
from optimization import read_grid_params_from_config
import numpy as np

In [2]:
np.fromstring('12', dtype=int, sep=', ')

array([12])

In [3]:
read_grid_params_from_config('opt_conf.ini')

{'boosting_type': ['gbdt', 'dart'],
 'colsample_bytree': array([0.6, 0.8]),
 'learning_rate': array([0.05]),
 'max_depth': array([-1,  5]),
 'min_child_samples': array([20]),
 'min_child_weight': array([0.001]),
 'min_split_gain': array([0.5]),
 'n_estimators': array([100]),
 'num_leaves': array([16, 32]),
 'reg_alpha': array([0. , 1.2]),
 'reg_lambda': array([0.]),
 'subsample': array([0.8, 1. ]),
 'subsample_for_bin': array([200]),
 'subsample_freq': array([1, 2])}

In [None]:
params = [{'boosting_type':bt, 'colsample_bytree':cbt, 'learning_rate':lr, 'max_depth':md, 'min_child_samples':mcs, 'min_child_weight':mcw,
           'min_split_gain':msg, 'n_estimators':ne, 'num_leaves':nl, 'reg_alpha':l1, 'reg_lambda':l2, 'subsample':subs, 
           'subsample_for_bin':sfb, 'subsample_freq':sf} for bt in gridParams['boosting_type'] for cbt in gridParams['colsample_bytree'] 
          for lr in gridParams['learning_rate'] for md in  gridParams['max_depth'] for mcs in gridParams['min_child_samples'] 
          for mcw in gridParams['min_child_weight'] for msg in gridParams['min_split_gain'] for ne in gridParams['n_estimators']
          for nl in gridParams['num_leaves'] for l1 in gridParams['reg_alpha'] for l2 in gridParams['reg_lambda']
          for subs in gridParams['subsample'] for sfb in gridParams['subsample_for_bin'] for sf in gridParams['subsample_freq']]

In [None]:
best_mean_av_prec = 0
best_params = params[0]

for param in tqdm(params, position=0):
    lgbmc = lgb.LGBMClassifier(objective='binary', random_state=432, 
                             boosting_type=param['boosting_type'],
                             colsample_bytree=param['colsample_bytree'],
                             learning_rate=param['learning_rate'],
                             max_depth=param['max_depth'],
                             min_child_samples=param['min_child_samples'],
                             min_child_weight=param['min_child_weight'],
                             min_split_gain=param['min_split_gain'],
                             n_estimators=param['n_estimators'],
                             num_leaves=param['num_leaves'],
                             reg_alpha=param['reg_alpha'],
                             reg_lambda=param['reg_lambda'],
                             subsample=param['subsample'],
                             subsample_for_bin=param['subsample_for_bin'],
                             subsample_freq=param['subsample_freq'])
    
    model_lgbm = lgbmc.fit(X_train, y_train)
    
    mean_av_prec = 0
    total = 0
    
    for line in tqdm(val_df.values, position=0):
        
        user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
        
        temp_df = all_prods.copy()
        
        temp_df['user_region'] = user_region
        temp_df['user_region'] = temp_df['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))
        temp_df['user_total_buy'] = user_total_buy
        temp_df['user_dif_cat'] = user_dif_cat
        
        temp_df['probs'] = model_lgbm.predict_proba(temp_df[feat_cols[:-1]])[:, 1]
        
        sorted_recommended_products = temp_df['product_id'].values[np.argsort(temp_df['probs'].values)[::-1]]
        
        mean_av_prec += average_precision(bought_products, sorted_recommended_products, 10)
        total += 1
        
        mean_av_prec /= total
        
        if mean_av_prec > best_mean_av_prec:
            best_mean_av_prec = mean_av_prec
            best_params = param
            
        with open('/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Model/best_model.pkl', 'wb') as f:
            pkl.dump(model_lgbm, f)

## Сравниваем другие подходы с моделью на тестовой выборке

### Собираем статистику по исходной модели

In [38]:
best_params = {'boosting_type':'gbdt',
               'colsample_bytree':0.8,
               'learning_rate':0.05,
               'max_depth':5,
               'min_child_samples':20,
               'min_child_weight':0.001,
               'min_split_gain':0.5,
               'n_estimators':100,
               'num_leaves':16,
               'reg_alpha':1.2,
               'reg_lambda':0,
               'subsample':0.8,
               'subsample_for_bin':200,
               'subsample_freq':1}

In [46]:
for i in tqdm(range(5), position=0):
    
    path = '/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Model/{0}'.format(i)
    os.mkdir(path)
    
    lgbmc = lgb.LGBMClassifier(objective='binary', 
                              boosting_type=best_params['boosting_type'],
                              colsample_bytree=best_params['colsample_bytree'],
                              learning_rate=best_params['learning_rate'],
                              max_depth=best_params['max_depth'],
                              min_child_samples=best_params['min_child_samples'],
                              min_child_weight=best_params['min_child_weight'],
                              min_split_gain=best_params['min_split_gain'],
                              n_estimators=best_params['n_estimators'],
                              num_leaves=best_params['num_leaves'],
                              reg_alpha=best_params['reg_alpha'],
                              reg_lambda=best_params['reg_lambda'],
                              subsample=best_params['subsample'],
                              subsample_for_bin=best_params['subsample_for_bin'],
                              subsample_freq=best_params['subsample_freq'])
    
    model_lgbm = lgbmc.fit(X_train, y_train)
    
    mean_av_prec = 0
    total = 0
    
    for line in tqdm(test_df.values, position=0):
        user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
        
        temp_df = all_prods.copy()
        
        temp_df['user_region'] = user_region
        temp_df['user_region'] = temp_df['user_region'].astype(CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]))
        temp_df['user_total_buy'] = user_total_buy
        temp_df['user_dif_cat'] = user_dif_cat
        
        temp_df['probs'] = model_lgbm.predict_proba(temp_df[feat_cols[:-1]])[:, 1]
        
        sorted_recommended_products = temp_df['product_id'].values[np.argsort(temp_df['probs'].values)[::-1]]
        
        mean_av_prec += average_precision(bought_products, sorted_recommended_products, 10)
        total += 1
        
    with open(path + '/model.pkl', 'wb') as f:
        pkl.dump(model_lgbm, f)
        
    with open(path + '/metric.pkl', 'wb') as f:
        pkl.dump(mean_av_prec/total, f)

100%|██████████| 12398/12398 [28:41<00:00,  7.20it/s]
100%|██████████| 12398/12398 [28:21<00:00,  7.28it/s]
100%|██████████| 12398/12398 [28:15<00:00,  7.31it/s]
100%|██████████| 12398/12398 [29:38<00:00,  6.97it/s]
100%|██████████| 12398/12398 [28:52<00:00,  7.16it/s]
100%|██████████| 5/5 [2:24:13<00:00, 1730.75s/it]


In [47]:
lgbm_metric = []
for i in range(5):
    path = '/content/drive/My Drive/Colab Notebooks/Sber_RecSys/Model/{0}'.format(i)
    with open(path + '/metric.pkl', 'rb') as f:
        lgbm_metric.append(pkl.load(f))

In [50]:
mean_average_precision_lgbm = 0.006184884371275953

0.006184884371275953


Тут получилось почему-то для всех иниициализаций одно и то же значение целевой метрики

### Рекомендуем 10 самых популярных продуктов среди всех покупателей

In [51]:
recommended_prods = all_prods.sort_values('product_total_buy', ascending=False)['product_id'].values[:10]

In [52]:
mean_av_prec = 0
total = 0

for line in tqdm(test_df.values, position=0):
    user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
    
    mean_av_prec += average_precision(bought_products, recommended_prods, 10)
    total += 1

print('\n')
print(mean_av_prec/total)

100%|██████████| 10234/10234 [00:00<00:00, 20867.50it/s]



0.006939778607406747





In [None]:
mean_average_precision_popular = 0.006939778607406747

### Рекомендуем 10 самых популярных продуктов среди покупателей из одного региона

In [53]:
mean_av_prec = 0
total = 0

for line in tqdm(test_df.values, position=0):
    user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
    
    temp_df = train_df[(train_df['label']==1) & (train_df['user_region'] == user_region)].copy()
    count_prod = temp_df.groupby('product_id')['product_id'].count().to_frame('count_prod').reset_index()
    
    recommended_prods = count_prod.sort_values('count_prod', ascending=False)['product_id'].values[:10]
    
    mean_av_prec += average_precision(bought_products, recommended_prods, 10)
    total += 1

print('\n')
print(mean_av_prec/total)

100%|██████████| 12398/12398 [03:14<00:00, 63.59it/s]



0.0068433079058005915





In [None]:
mean_average_precision_popreg = 0.0068433079058005915

### Рекомендуем 10 самых популярных продуктов среди 10 похожих покупателей

In [79]:
mean_av_prec = 0
total = 0

for line in tqdm(test_df.values, position=0):
    
    user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
    
    temp_df = train_df[(train_df['label'] == 1) & (train_df['user_id'] != user_id)]
    
    nn = NearestNeighbors(10)
    nn_df = temp_df.drop_duplicates('user_id')
    nn.fit(nn_df[['user_total_buy', 'user_dif_cat']].values)
    
    _, nearest_indexes = nn.kneighbors(np.reshape([user_total_buy, user_dif_cat], (1, 2)))
    
    closest_users_arr = nn_df.iloc[nearest_indexes[0], 0].values
    closest_users = temp_df[temp_df['user_id'].isin(closest_users_arr)]
    popular_products = closest_users.groupby('product_id')['product_id'].count().to_frame('popular_products').reset_index()
    recommended_prods = popular_products.sort_values('popular_products', ascending=False)['product_id'].values[:10]
    
    mean_av_prec += average_precision(bought_products, recommended_prods, 10)
    total += 1

print('\n')
print(mean_av_prec/total)

100%|██████████| 12398/12398 [21:36<00:00,  9.57it/s]

0.0020881942415210313





In [None]:
mean_average_precision_nn = 0.0020881942415210313

### Рекомендуем 10 случайных продуктов

In [82]:
np.random.seed(432)

mean_av_prec = 0
total = 0

for line in tqdm(test_df.values, position=0):
    recommended_prods = np.random.choice(all_prods['product_id'].values, 10, replace=False)
    user_id, bought_products, user_region, user_total_buy, user_dif_cat = line
    
    mean_av_prec += average_precision(bought_products, recommended_prods, 10)
    total += 1

print('\n')
print(mean_av_prec/total)

100%|██████████| 12398/12398 [00:05<00:00, 2155.42it/s]



0.0001328935947194941





In [None]:
mean_average_precision_random = 0.0001328935947194941

### Выводы

Обученная модель оказалась несостоятельной, причины могут быть разные, возможно недостаточно большой датасет, либо сгенерирпованные признаки получились абсолютно незначимыми.