**Imports**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from scipy.sparse import csr_matrix

In [3]:
from implicit import als

In [4]:
from src.metrics import precision_at_k, recall_at_k
from src.utils import prefilter_items
from src.recommenders import MainRecommender

In [5]:
from src.recommenders_alt import alt_recommender



In [6]:
%load_ext autoreload

In [7]:
%autoreload

In [35]:
from lightgbm import LGBMClassifier

**Reading of data**

In [8]:
data = pd.read_csv('retail_train.csv')
data_test = pd.read_csv('retail_test.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')

**Precessing of train data**

In [9]:
# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

In [10]:
item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

Т.к. есть отдельный валидационный датасет валидацию будем проводить на нем.

In [11]:
# val_lvl_1_size_weeks = 6
# val_lvl_2_size_weeks = 3

# data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
# data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
#                       (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]
data_train_lvl_1 = data.copy()
data_val_lvl_1 = data_test.copy()

In [12]:
data_test.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,1340,41652823310,664,912987,1,8.49,446,0.0,52,96,0.0,0.0
1,588,41652838477,664,1024426,1,6.29,388,0.0,8,96,0.0,0.0
2,2070,41652857291,664,995242,5,9.1,311,-0.6,46,96,0.0,0.0
3,1602,41665647035,664,827939,1,7.99,334,0.0,1741,96,0.0,0.0
4,1602,41665647035,664,927712,1,0.59,334,-0.4,1741,96,0.0,0.0


In [13]:
data.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


Датасеты для модели второго уровня.

In [15]:
data_train_lvl_1.shape

(2396804, 12)

In [16]:
data_val_lvl_1.shape

(88734, 12)

Применим фильтры к датасетам

In [17]:
n_items_before = data_train_lvl_1['item_id'].nunique()

data_train_lvl_1 = prefilter_items(data_train_lvl_1, take_n_popular=5000, item_features=item_features)

n_items_after = data_train_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 5001


In [18]:
data_train_lvl_1.shape

(981281, 13)

In [19]:
n_items_before = data_val_lvl_1['item_id'].nunique()

data_val_lvl_1 = prefilter_items(data_val_lvl_1, take_n_popular=5000, item_features=item_features)

n_items_after = data_val_lvl_1['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 20497 to 5001


Обучим модель.

In [20]:
recommender = MainRecommender(data_train_lvl_1)



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

Готовим файл с результатами.  
По итогу выберем оптимальное значение с которым и будем работать в дальнейшем.

In [21]:
result_lvl_1 = data_val_lvl_1.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_1.columns=['user_id', 'actual']
result_lvl_1.head(2)

Unnamed: 0,user_id,actual
0,1,"[999999, 883616, 940947, 959219, 991024, 10049..."
1,2,"[999999, 866211, 879769, 885023, 899624, 94094..."


Recomendations for different types of recommenders

In [22]:
for k in [5]:
    print(f'Iteration k={k}')
    result_lvl_1[f'als_{k}'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_als_recommendations(x, N=k))
    print(f'...als finished')
    result_lvl_1[f'own_{k}'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=k))
    print(f'...own finished')
    result_lvl_1[f'sim_user_{k}'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_users_recommendation(x, N=k))
    print(f'...sim_user finished')
    result_lvl_1[f'sim_item_{k}'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_similar_items_recommendation(x, N=k))
    print(f'...sim_item finished')



Iteration k=5
...als finished
...own finished
...sim_user finished
...sim_item finished


In [23]:
result_lvl_1.head(2)

Unnamed: 0,user_id,actual,als_5,own_5,sim_user_5,sim_item_5
0,1,"[999999, 883616, 940947, 959219, 991024, 10049...","[920200, 15926844, 948670, 960732, 856942]","[856942, 9297615, 5577022, 8293439, 9655212]","[1028422, 1126786, 13115981, 5981267, 10342382]","[1022097, 5582712, 9297615, 5577022, 1132231]"
1,2,"[999999, 866211, 879769, 885023, 899624, 94094...","[1021324, 978332, 13158064, 826835, 5569230]","[1103898, 911974, 1076580, 5567582, 1007414]","[944172, 9677454, 847962, 941515, 931124]","[999999, 8090537, 5569845, 985999, 819978]"


In [24]:
result_lvl_1.isna().sum()

user_id       0
actual        0
als_5         1
own_5         1
sim_user_5    9
sim_item_5    0
dtype: int64

In [25]:
result_lvl_1 = result_lvl_1.dropna()

Подсчет значений по метрике precision@k

In [26]:
total_result_lvl_1 = pd.DataFrame([{}])
for k in [5]:
    total_result_lvl_1[f'als_{k}'] = result_lvl_1.apply(lambda row: precision_at_k(row[f'als_{k}'], row['actual'], k), axis=1).mean()
    total_result_lvl_1[f'own_{k}'] = result_lvl_1.apply(lambda row: precision_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()
    total_result_lvl_1[f'sim_user_{k}'] = result_lvl_1.apply(lambda row: precision_at_k(row[f'sim_user_{k}'], row['actual'], k), axis=1).mean()
    total_result_lvl_1[f'sim_item_{k}'] = result_lvl_1.apply(lambda row: precision_at_k(row[f'sim_item_{k}'], row['actual'], k), axis=1).mean()

In [27]:
total_result_lvl_1

Unnamed: 0,als_5,own_5,sim_user_5,sim_item_5
0,0.095638,0.120707,0.001215,0.151518


In [28]:
data_train_lvl_2 = data_val_lvl_1.copy()
data_val_lvl_2 = data_test.copy()

# data_train_lvl_1.head(2)

In [29]:
result_lvl_1.to_csv('result_lvl_1.csv', sep=';')

In [30]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

In [31]:
y_train = targets_lvl_2[['target']]
X_train = targets_lvl_2.drop('target', axis=1)


In [32]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')


In [33]:
for key in ['price', 'age', 'income']:
    cat_feats.remove(key) 

ValueError: list.remove(x): x not in list

In [36]:
lgb = LGBMClassifier(objective='binary', max_depth=7, categorical_column=cat_feats)

In [37]:
lgb.fit(X_train, y_train)

  return f(*args, **kwargs)


LGBMClassifier(categorical_column=[], max_depth=7, objective='binary')

In [39]:
train_preds = lgb.predict_proba(X_train)

In [40]:
targets_lvl_2["predict"] = 0
targets_lvl_2["predict"] = train_preds

In [41]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

# Пока только warm start
result_lvl_2 = result_lvl_2[result_lvl_2['user_id'].isin(targets_lvl_2.user_id.unique()) ]

result_lvl_2.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


In [42]:
#grouped.get_group(1).sort_values('predict', ascending=False)['item_id'].reset_index(drop=True)[:50].unique()

k = 5
grouped = targets_lvl_2.groupby('user_id')
for k in [5, 50]:
    result_lvl_2[f'own_{k}'] = None
    result_lvl_2[f'own_{k}'] = result_lvl_2['user_id'].apply(lambda x: grouped.get_group(x).sort_values('predict', ascending=False)['item_id'].reset_index(drop=True)[:k].unique())

In [43]:
total_result_lvl_2 = pd.DataFrame([{}])
# for k in [20, 50, 100, 200, 500]:
for k in [5, 50]:    
    total_result_lvl_2[f'own_precision_{k}'] = result_lvl_2.apply(lambda row: precision_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()
    total_result_lvl_2[f'own_recall_{k}'] = result_lvl_2.apply(lambda row: recall_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()
#     total_result_lvl_2[f'own_{k}'] = result_lvl_2.apply(lambda row: recall_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()
#     total_result_lvl_2[f'sim_user_{k}'] = result_lvl_2.apply(lambda row: recall_at_k(row[f'sim_user_{k}'], row['actual'], k), axis=1).mean()
#     total_result_lvl_2[f'sim_item_{k}'] = result_lvl_2.apply(lambda row: recall_at_k(row[f'sim_item_{k}'], row['actual'], k), axis=1).mean()

In [44]:
result_lvl_1[f'own_5'] = result_lvl_1['user_id'].apply(lambda x: recommender.get_own_recommendations(x, N=5))

In [45]:
# total_result_lvl_1 = pd.DataFrame([{}])
# for k in [20, 50, 100, 200, 500]:
for k in [5]:    
    total_result_lvl_1[f'own_precision_{k}'] = result_lvl_1.apply(lambda row: precision_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()
    total_result_lvl_1[f'own_recall_{k}'] = result_lvl_1.apply(lambda row: recall_at_k(row[f'own_{k}'], row['actual'], k), axis=1).mean()


In [46]:
total_result_lvl_1[['own_precision_5', 'own_recall_5']]

Unnamed: 0,own_precision_5,own_recall_5
0,0.120707,0.047803


In [47]:
total_result_lvl_2[['own_precision_5', 'own_recall_5']]

Unnamed: 0,own_precision_5,own_recall_5
0,0.817335,0.156452


In [48]:
result_lvl_2.to_csv('result_lvl_2.csv', sep=';')