### Финальный проект

Мы уже прошли всю необходимуб теорию для финального проекта. Проект осуществляется на данных из вебинара (данные считаны в начале ДЗ).
Рекомендуем вам **начать делать проект сразу после этого домашнего задания**  
- Целевая метрика - precision@5. Порог для уcпешной сдачи проекта precision@5 > 25%  
- Будет public тестовый датасет, на котором вы сможете измерять метрику  
- Также будет private тестовый датасет для измерения финального качества  
- НЕ обязательно, но крайне желательно использовать 2-ух уровневые рекоммендательные системы в проекте  
- Вы сдаете код проекта в виде github репозитория и csv файл с рекомендациями    


Код для src, utils, metrics вы можете скачать из [этого](https://github.com/geangohn/recsys-tutorial) github репозитория

https://towardsdatascience.com/lightautoml-preset-usage-tutorial-2cce7da6f936

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als
from sklearn.model_selection import train_test_split
# Модель второго уровня
from lightgbm import LGBMClassifier
import lightgbm  as lgb 
import catboost as catb

import os, sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

# Написанные нами функции
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items, postfilter_items, popularity_recommendation, perpare_lvl2_1, perpare_lvl2, category_to_digit
from src.recommenders import MainRecommender
from tqdm import tqdm

tqdm.pandas()
# from random import random

from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.tasks.common_metric import mean_quantile_error

In [2]:
data = pd.read_csv('../raw_data/retail_train.csv')
item_features = pd.read_csv('../raw_data/product.csv')
user_features = pd.read_csv('../raw_data/hh_demographic.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [3]:
N = 150 # к-во товаров получаемых из модели 1-го уровня.
final_predict_count = 30 # К-во рекомендаций выдаваемых
val_count = 5 # финальное к-во репомендаций товаров. На них будет осуществляться подсчет к-ва.
top_items_count = 5000 #

Обзор датасета.

In [4]:
data.head(3)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0


In [5]:
item_features.head(3)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,
2,26093,69,PASTRY,Private,BREAD,BREAD:ITALIAN/FRENCH,


In [6]:
user_features.head(3)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7
2,25-34,U,25-34K,Unknown,2 Adults Kids,3,1,8


Добавим номера дней недели.

In [7]:
week_day = {'week_day': []}
# считаем номер недели. поле чего вычисляем записи с номеро дня. 
# определяем номер дня соответствуующий номеру последнему дню недели и после этого начинаем вычетать из него.
# номера номера дней. 
max_week_no = data['week_no'].max()
min_week_no = data['week_no'].min()

week_days=[]
for week_no in range(min_week_no,max_week_no + 1):
    max_day_in_week = data.loc[(data['week_no']==week_no),'day'].max()
    days = data.loc[(data['week_no']==week_no),'day']
    for day in days:
        week_days.append(day-max_day_in_week+7)

data['week_day'] = week_days

### Разделение датасета на тренировочную, тестовую и валидационную выборки.

In [8]:
# Cхема обучения и валидации!
# -- давние покупки -- | -- 6 недель -- | -- 3 недель -- 
# подобрать размер 2-ого датасета (6 недель) --> learning curve (зависимость метрики recall@k от размера датасета)

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy()  # Для наглядности. Далее мы добавим изменения, и они будут отличаться
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,week_day
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0,3
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0,3


### Предварительная фильтрация данных.

In [9]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, item_features=item_features, take_n_popular=top_items_count)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 83685 to 5001


In [10]:
def get_new_values(old_df, new_df,feature):
    old_values = old_df[feature].unique()
    new_values = new_df[feature].unique()
    appended_values = []

    for value  in new_values: 
        if value not in old_values:
            appended_values.append(value)
        
    appended_values = np.unique(appended_values)
    return appended_values

In [11]:
first_users_count = len(data_train_lvl_1['user_id'].unique()) 
first_items_count = len(data_train_lvl_1['item_id'].unique()) 

new_user_lvl_1 = get_new_values(data_train_lvl_1, data_train_lvl_2 ,'user_id')
new_items_lvl_1 = get_new_values(data_train_lvl_1, data_train_lvl_2 ,'item_id')

new_user_lvl_2 = get_new_values(data_train_lvl_1, data_val_lvl_2 ,'user_id')
new_items_lvl_2 = get_new_values(data_train_lvl_1, data_val_lvl_2 ,'item_id')

print(f'Изначальное к-во: users: {first_users_count}, items: {first_items_count}')
print(f'1-й уровень  users: +{len(new_user_lvl_1)}, items: +{len(new_items_lvl_1)}')
print(f'2-й уровень  users: +{len(new_user_lvl_2)}, items: +{len(new_items_lvl_2)}')

Изначальное к-во: users: 2299, items: 5001
1-й уровень  users: +70, items: +22772
2-й уровень  users: +74, items: +19567


### Добавление фитчей User-ов.

In [12]:
data_gr = data.groupby('basket_id').mean()

In [13]:
# Среднее к-во покупаемых товаров.
user_features['median_quantity'] = user_features['user_id'].apply(lambda x: 
                                data_gr.loc[(data_gr['user_id']==x),'quantity'].median())

# Средний чек.                                
user_features['mean_sales_value'] = user_features['user_id'].apply(lambda x: 
                                data_gr.loc[(data_gr['user_id']==x),'sales_value'].mean())

In [14]:
#  Среднее к-во раз в неделю, которое user ходит в магазин.

for i in [1,2,3,4,5,6,7]:
    data_gr[f"day_{i}"] = np.where((data_gr['week_day'] == i),1,0)
    
week_count = data['week_no'].max()
#-----------------------------------------------
def mean_quantity_in_week(user_id):
    days = 0
    for i in [1,2,3,4,5,6,7]:
       days += data_gr.loc[(data_gr['user_id']==user_id),f'day_{i}'].sum()
    days /=week_count
    return int(np.round(days))

#-----------------------------------------------    
user_features['mean_quantity_in_week'] = user_features['user_id'].apply(lambda x: mean_quantity_in_week(x))

In [15]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value,mean_quantity_in_week
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1,1.1,2.726818,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7,1.181818,2.989986,1


### Приведение категориальных фитчей к числовому типу.

In [16]:
### Список категориальных фитчей ктороые мы будем разбирать.
features=['income_desc','age_desc','homeowner_desc','kid_category_desc','household_size_desc','hh_comp_desc']
for feature_name in features:
    print(feature_name)
    print(user_features[feature_name].unique())
    print('-'*20)

income_desc
['35-49K' '50-74K' '25-34K' '75-99K' 'Under 15K' '100-124K' '15-24K'
 '125-149K' '150-174K' '250K+' '175-199K' '200-249K']
--------------------
age_desc
['65+' '45-54' '25-34' '35-44' '19-24' '55-64']
--------------------
homeowner_desc
['Homeowner' 'Unknown' 'Renter' 'Probable Renter' 'Probable Owner']
--------------------
kid_category_desc
['None/Unknown' '1' '2' '3+']
--------------------
household_size_desc
['2' '3' '4' '1' '5+']
--------------------
hh_comp_desc
['2 Adults No Kids' '2 Adults Kids' 'Single Female' 'Unknown'
 'Single Male' '1 Adult Kids']
--------------------


In [17]:
income_desc = {'35-49K':42, '50-74K':62, '25-34K':30, '75-99K':87, 'Under 15K':15, '100-124K':112,
       '15-24K':20, '125-149K':137, '150-174K':162, '250K+':250, '175-199K':187, '200-249K':225}
       
user_features['income_desc'] = user_features['income_desc'].apply(lambda x: income_desc[x]) 

age_desc = {'65+':65, '45-54':50, '25-34':30, '35-44':40, '19-24':21, '55-64':60}	

user_features['age_desc'] = user_features['age_desc'].apply(lambda x: age_desc[x])

In [18]:
household_size_desc = {np.nan: 0, '1':1, '2':2, '3':3, '4':4, '5+':5 }

user_features['household_size_desc'] = user_features['household_size_desc'].apply(lambda x: household_size_desc[x])

In [19]:
kid_category_desc = {'None/Unknown':0, np.nan: 0, '1':1, '2':2, '3+':3 }

user_features['kid_category_desc'] = user_features['kid_category_desc'].apply(lambda x: kid_category_desc[x])

In [20]:
user_features[['hh_comp_desc_female', 'hh_comp_desc_male', 'hh_comp_desc_Adults_Kids']] = 0
user_features['hh_comp_desc_female'] = np.where((user_features['hh_comp_desc'] !='Single Male'), 1, 0)
user_features['hh_comp_desc_male'] = np.where((user_features['hh_comp_desc'] !='Single Female'), 1, 0)
user_features.loc[(user_features['hh_comp_desc']=='2 Adults Kids'), 'hh_comp_desc_Adults_Kids'] = 2
user_features.loc[(user_features['hh_comp_desc']=='1 Adult Kids'), 'hh_comp_desc_Adults_Kids'] = 1
user_features.loc[(user_features['hh_comp_desc'].isna()), ['hh_comp_desc_female','hh_comp_desc_male']] = 0
user_features.loc[(user_features['hh_comp_desc']=='Unknown'), ['hh_comp_desc_female','hh_comp_desc_male']] = 0
user_features.drop('hh_comp_desc', axis=1, inplace=True)

In [21]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,household_size_desc,kid_category_desc,user_id,median_quantity,mean_sales_value,mean_quantity_in_week,hh_comp_desc_female,hh_comp_desc_male,hh_comp_desc_Adults_Kids
0,65,A,42,Homeowner,2,0,1,1.1,2.726818,1,1,1,0
1,50,A,62,Homeowner,2,0,7,1.181818,2.989986,1,1,1,0


In [22]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


### Добавление фитчей Item-ов.

In [23]:
item_features_temp = item_features.merge(data, on='item_id', how='left')

In [24]:
# Средняя стоимость товара в категории.

item_price = item_features_temp.groupby(['item_id','commodity_desc'])['sales_value'].mean().reset_index()
item_price.columns= ['item_id','commodity_desc','sales_value']
commoditys_desc = item_price['commodity_desc'].unique()

item_price['commodity_desc_mean_sale']=np.NaN

for commodity_desc in commoditys_desc:
    mean_value = item_price.loc[(item_price['commodity_desc']==commodity_desc),'sales_value'].mean()
    item_price.loc[(item_price['commodity_desc']==commodity_desc),'commodity_desc_mean_sale'] = mean_value

item_price.loc[(item_price['commodity_desc']=='NO COMMODITY DESCRIPTION'),'sales_value']

item_features = item_features.merge(item_price[['item_id','commodity_desc_mean_sale']], on='item_id',how='left')

In [25]:
# К-во покупок в неделю.
quantity_count = item_features_temp.groupby(['item_id'])['quantity'].sum().reset_index()

quantity_count.columns = ['item_id','quantity']

quantity_in_week = item_features_temp.groupby(['item_id'])['week_no'].unique().reset_index()

quantity_in_week.columns = ['item_id','weeks']

quantity_in_week['weeks_count'] = quantity_in_week['weeks'].apply(lambda x: len(x))

quantity_in_week['sale_in_week'] = quantity_count['quantity']/quantity_in_week['weeks_count']  

item_features = item_features.merge(quantity_in_week[['item_id','sale_in_week']], on='item_id',how='left')

In [26]:
recommender = MainRecommender(data_train_lvl_1)

100%|██████████| 15/15 [00:02<00:00,  5.27it/s]
100%|██████████| 5001/5001 [00:00<00:00, 151553.15it/s]


In [27]:
# def perpare_lvl2_1(val_data, train_data, recommender, item_features, user_features, N=50):
#     # val_data = data_train_lvl_2.copy()
#     # train_data = data_train_lvl_1.copy()

#     users_warm = pd.DataFrame(val_data['user_id'].unique()) # Добавим туда еще фитчи user-ов и item-ов.
#     users_warm.columns = ['user_id']
#     # Пока только warm start
#     users_warm = users_warm[users_warm['user_id'].isin(train_data['user_id'].unique())]

#     users_cold = pd.DataFrame(val_data['user_id'].unique()) # Добавим туда еще фитчи user-ов и item-ов.
#     users_cold.columns = ['user_id']
#     # cold_start
#     users_cold = users_cold[~users_cold['user_id'].isin(users_warm['user_id'].unique())]

#     # Заполняем кандидатов, на основе предсказания модели 1-го уровня.
#     users_cold['candidates'] = users_cold['user_id'].apply(lambda x: recommender.get_top_popular(N=N))
#     s = users_cold.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
#     s.name = 'item_id'

#     # Это кандидаты. (т.е. предпологаемые покупки совершенные на основе предсказаний.)
#     users_cold = users_cold.drop('candidates', axis=1).join(s)
#     users_cold['drop'] = 1  # фиктивная переменная
#     # Заполняем кандидатов, на основе предсказания модели 1-го уровня.
#     users_warm['candidates'] = users_warm['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=N))
#     # test_users = data
#     s = users_warm.apply(lambda x: pd.Series(x['candidates']), axis=1).stack().reset_index(level=1, drop=True)
#     s.name = 'item_id'

#     # Это кандидаты. (т.е. предпологаемые покупки совершенные на основе предсказаний.)
#     users_warm = users_warm.drop('candidates', axis=1).join(s)
#     users_warm['drop'] = 1  # фиктивная переменная

#     # Создадим таблицу с реальными покупками user-ов. 
#     targets = val_data[['user_id', 'item_id']].copy() # свойства 
#     targets['target'] = 1  # тут только покупки

#     # Объединим предпологаемые покупки с реальными, совершенными user-ами.
#     targets_cold = users_cold.merge(targets, on=['user_id', 'item_id'], how='left')

#     # В результате, напротив товаров, в редсказании которых мы ошиблись, 
#     # будет стоять Nan. Заполним их  нулями.  
#     targets_cold['target'].fillna(0, inplace= True)
#     targets_cold.drop('drop', axis=1, inplace=True)
#     # Добавим к нашему датасету фичи user-ов и item-ов.
#     targets_cold = targets_cold.merge(item_features, on='item_id', how='left')
#     targets_cold = targets_cold.merge(user_features, on='user_id', how='left')

#     # Объединим предпологаемые покупки с реальными, совершенными user-ами.
#     targets_warm = users_warm.merge(targets, on=['user_id', 'item_id'], how='left')

#     # В результате, напротив товаров, в редсказании которых мы ошиблись, 
#     # будет стоять Nan. Заполним их  нулями.  
#     targets_warm['target'].fillna(0, inplace= True)
#     targets_warm.drop('drop', axis=1, inplace=True)
#     # targets_warm['target'].mean() #Угадали примерно 17% покупок.

#     # Добавим к нашему датасету фичи user-ов и item-ов.
#     targets_warm = targets_warm.merge(item_features, on='item_id', how='left')
#     targets_warm = targets_warm.merge(user_features, on='user_id', how='left')

#     targets_lvl_2 = pd.concat([targets_warm, targets_cold], ignore_index=True)

#     # X_ = targets_lvl_2.drop('target', axis=1)
#     # y_ = targets_lvl_2[['target']]

#     return targets_lvl_2

In [28]:
train_data = perpare_lvl2_1(data_train_lvl_2, data_train_lvl_1, recommender,item_features, user_features, N=N)

In [29]:
test_data = perpare_lvl2_1(data_val_lvl_2, data_train_lvl_1, recommender, item_features, user_features, N=N)

In [30]:
train_data.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,commodity_desc_mean_sale,...,income_desc,homeowner_desc,household_size_desc,kid_category_desc,median_quantity,mean_sales_value,mean_quantity_in_week,hh_comp_desc_female,hh_comp_desc_male,hh_comp_desc_Adults_Kids
0,2070,5569471.0,0.0,1208.0,GROCERY,National,SOFT DRINKS,SOFT DRINKS 12/18&15PK CAN CAR,12 OZ,2.705525,...,62.0,Unknown,1.0,0.0,1.0,2.413486,6.0,0.0,0.0,0.0
1,2070,1022003.0,0.0,1251.0,GROCERY,National,SOUP,CONDENSED SOUP,10.5OZ,2.202742,...,62.0,Unknown,1.0,0.0,1.0,2.413486,6.0,0.0,0.0,0.0


In [31]:
# Получим список катероиальных и числовых признаков.
categorical = []
numerical = []
for col, value in train_data.iteritems():
    if value.dtype == 'object':
        categorical.append(col)
    else:
        numerical.append(col)

In [32]:
print(categorical)

['department', 'brand', 'commodity_desc', 'sub_commodity_desc', 'curr_size_of_product', 'marital_status_code', 'homeowner_desc']


In [33]:
print(numerical)

['user_id', 'item_id', 'target', 'manufacturer', 'commodity_desc_mean_sale', 'sale_in_week', 'age_desc', 'income_desc', 'household_size_desc', 'kid_category_desc', 'median_quantity', 'mean_sales_value', 'mean_quantity_in_week', 'hh_comp_desc_female', 'hh_comp_desc_male', 'hh_comp_desc_Adults_Kids']


### Уберем признаки содержащие большое к-во категорий

In [34]:
for feature in categorical:
 print(f'{feature}: {len(train_data[feature].unique())}')

department: 21
brand: 3
commodity_desc: 200
sub_commodity_desc: 746
curr_size_of_product: 657
marital_status_code: 4
homeowner_desc: 6


In [35]:
features = ['commodity_desc', 'sub_commodity_desc', 'curr_size_of_product']
train_data = train_data.drop(features, axis=1)
test_data = test_data.drop(features, axis=1)

In [36]:
print(train_data['department'].unique())

['GROCERY' 'MISC. TRANS.' 'PRODUCE' 'PASTRY' 'MEAT-PCKGD' 'MEAT'
 'KIOSK-GAS' 'NUTRITION' 'SALAD BAR' 'DRUG GM' 'DELI' 'FLORAL' nan
 'MISC SALES TRAN' 'GARDEN CENTER' 'SEAFOOD' 'CHEF SHOPPE' 'SEAFOOD-PCKGD'
 'TRAVEL & LEISUR' 'COUP/STR & MFG' 'FROZEN GROCERY']


In [37]:
features = [ 'department',
            'brand',
            #'commodity_desc',
            #'sub_commodity_desc',
            #'curr_size_of_product',
            'marital_status_code',
            'homeowner_desc',
            # 'hh_comp_desc',
            # 'household_size_desc',
            # 'kid_category_desc'
           ]

In [38]:
train_data = category_to_digit(train_data, features)

In [39]:
test_data = category_to_digit(test_data, features)

In [40]:
# y_train.mean()

In [41]:
# y_test.mean()

### Обучение модели. 

In [42]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
#                                                   test_size=0.2,
#                                                   random_state=27,
#                                                  )

In [43]:
TASK = Task('reg', loss='mse', metric='mse', greater_is_better=True)
TIMEOUT = 300000
N_THREADS = 4
N_FOLDS = 5
RANDOM_STATE = 27
TARGET_NAME = 'target'
TEST_SIZE=0.2

In [44]:
roles = {'target': TARGET_NAME, 'drop': ['user_id', 'item_id']}

In [45]:
automl_model = TabularAutoML(task=TASK,
                            timeout=TIMEOUT,
                            cpu_limit = N_THREADS,
                            # gpu_ids='all',
                            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
                             
                            general_params={'use_algos': [ ['lgb_tuned', 'cb_tuned', 'cb', 'lgb'] ]},
                             
                            tuning_params={'max_tuning_iter': 10},
                      )

In [46]:
train_preds = automl_model.fit_predict(train_data, roles = roles)

INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-ba8c31e9-ebca-4e69-9eee-b14eaaebe31e
INFO:optuna.study.study:Trial 0 finished with value: 0.13824886748602253 and parameters: {'feature_fraction': 0.6872700594236812, 'num_leaves': 244}. Best is trial 0 with value: 0.13824886748602253.
INFO:optuna.study.study:Trial 1 finished with value: 0.13810733202433903 and parameters: {'feature_fraction': 0.8659969709057025, 'num_leaves': 159}. Best is trial 0 with value: 0.13824886748602253.
INFO:optuna.study.study:Trial 2 finished with value: 0.13908203828552484 and parameters: {'feature_fraction': 0.5780093202212182, 'num_leaves': 53}. Best is trial 2 with value: 0.13908203828552484.
INFO:optuna.study.study:Trial 3 finished with value: 0.1382597491855744 and parameters: {'feature_fraction': 0.5290418060840998, 'num_leaves': 223}. Best is trial 2 with value: 0.13908203828552484.
INFO:optuna.study.study:Trial 4 finished with value: 0.13748892965173334 and parameters:

In [47]:
train_preds = train_preds.data[:, 0]

In [48]:
# model_catb = catb.CatBoostClassifier(silent=True, 
#                                     random_state=27,
#                                     # eval_metric='F1',
#                                      early_stopping_rounds=20,
#                                      use_best_model=True,
#                                      num_boost_round=10000
#                                     )

In [49]:
# model_catb.fit(X_train, y_train, eval_set=(X_val, y_val))

###  Предсказание 

In [50]:
def get_items(x_data, items, user_id, item_name, N=5, overall_top_purchases=None):
    items_list = []
 
    for item in items:
        flag = (x_data.loc[((x_data['user_id']==user_id) & (x_data['item_id']==item)),item_name].mean())
        
        if (flag > 0.3):
            items_list.append(item)

    if not(overall_top_purchases is None):

        if len(items_list) < N:
            items_list.extend(overall_top_purchases[:N])
        items_list = items_list[:N]
    return items_list

In [51]:
def get_final_recomendations(x_data, y_data, preds):
    x_data = x_data.copy()
    x_data['predict'] = preds
    x_data['actual'] = y_data['target'].values

    result = x_data.sort_values('predict', ascending=False).groupby('user_id')['item_id'].unique().reset_index()

    overall_top_purchases = x_data.groupby('item_id')['item_id'].count()
    overall_top_purchases = overall_top_purchases.sort_values(ascending=False).index.values

    result_df= {'user_id':[], 'actual':[], 'predict':[]}

    for res in tqdm(result.iterrows()):
        user_id = res[1]['user_id']
        item_ids = res[1]['item_id']
        actual = get_items(x_data, item_ids, user_id, 'actual', N=final_predict_count)
        if len(actual)>0:
            result_df['user_id'].append(user_id)
            predict_items= get_items(x_data, item_ids, user_id, 'predict', N=final_predict_count, overall_top_purchases = overall_top_purchases)
            result_df['predict'].append(postfilter_items(predict_items, item_features, N=val_count)) # Бизнес-ограничения. ^_^
            result_df['actual'].append(actual)
    return pd.DataFrame(result_df) 

In [52]:
X_train = train_data.drop('target', axis=1)
y_train = train_data[['target']]

In [53]:
# train_preds = model_catb.predict_proba(X_train)[:,1]

In [54]:
result_train = get_final_recomendations(X_train, y_train, train_preds)

result_train.head(3)

2154it [02:15, 15.91it/s]


Unnamed: 0,user_id,actual,predict
0,1,"[1082185.0, 995242.0, 820165.0, 840361.0, 9655...","[1082185.0, 820165.0, 866227.0, 961554.0, 7025..."
1,2,"[1106523.0, 1133018.0, 899624.0, 916122.0, 838...","[1106523.0, 1108094.0, 900072.0, 6534178.0, 98..."
2,4,"[962229.0, 6773204.0]","[962229.0, 1082185.0, 995242.0, 981760.0, 9237..."


In [55]:
precision_train = result_train.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Train precision: {precision_train:.03}')

Train precision: 0.314


In [56]:
X_test = test_data.drop('target', axis=1)
y_test = test_data[['target']]

In [57]:
test_preds = automl_model.predict(test_data).data[:,0]

In [58]:
result_test = get_final_recomendations(X_test, y_test, test_preds)

result_test.head(3)

2042it [02:11, 15.55it/s]


Unnamed: 0,user_id,actual,predict
0,1,"[1082185.0, 995242.0, 961554.0, 8293439.0, 940...","[1082185.0, 820165.0, 840361.0, 898121.0, 9615..."
1,3,"[6463658.0, 1053690.0, 9526563.0]","[951590.0, 1053690.0, 938700.0, 6534178.0, 995..."
2,6,"[995242.0, 1119051.0, 5569230.0, 840361.0, 558...","[1082185.0, 1029743.0, 878715.0, 1119051.0, 10..."


In [59]:
precision_test = result_test.apply(lambda row: precision_at_k(row['predict'], row['actual']), axis=1).mean()
print(f'Test precision: {precision_test:.03}')

Test precision: 0.268


### Сохранение результатов. 

In [60]:
result_test.to_csv('finally_prediction_lama_regression.csv', index=False)

In [61]:
import pickle

with open('automl_model_regression.pickle', 'wb') as f:
    pickle.dump(automl_model, f, protocol=pickle.HIGHEST_PROTOCOL)