### init env

In [5]:
!pip install implicit
!pip install lightfm

Collecting implicit
[?25l  Downloading https://files.pythonhosted.org/packages/bc/07/c0121884722d16e2c5beeb815f6b84b41cbf22e738e4075f1475be2791bc/implicit-0.4.4.tar.gz (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.2MB/s 
Building wheels for collected packages: implicit
  Building wheel for implicit (setup.py) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.4.4-cp36-cp36m-linux_x86_64.whl size=3419394 sha256=717597ca13abad8823b5729e1d1f3a0836e1aa0d33001b04a764dff6d3c1382b
  Stored in directory: /root/.cache/pip/wheels/bf/d4/ec/fd4f622fcbefb7521f149905295b2c26adecb23af38aa28217
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.4.4
Collecting lightfm
[?25l  Downloading https://files.pythonhosted.org/packages/5e/fe/8864d723daa8e5afc74080ce510c30f7ad52facf6a157d4b42dec83dfab4/lightfm-1.16.tar.gz (310kB)
[K     |████████████████████████████████| 317kB 7.8MB/s 
Building wheels for collected packages: li

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random

# Для работы с матрицами
from scipy.sparse import csr_matrix

# Матричная факторизация
from implicit import als

# Модель второго уровня
from lightgbm import LGBMClassifier

import os, sys

In [8]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [9]:
!cp -R  /content/gdrive/MyDrive/Recomendation/srcrec  /

In [10]:
path_to_src = "/content/gdrive/MyDrive/Recomendation/"

In [11]:
if path_to_src not in sys.path:
    sys.path.append(path_to_src)

# Написанные функции
from srcrec.metrics import precision_at_k, recall_at_k
from srcrec.utils import prefilter_items
from srcrec.recommenders import MainRecommender

### load data

In [12]:
path_to_files = "/content/gdrive/MyDrive/Recomendation"
data          = pd.read_csv(os.path.join(path_to_files, "df_data.csv"))
user_features = pd.read_csv(os.path.join(path_to_files, "user_features.csv"))
item_features = pd.read_csv(os.path.join(path_to_files, "item_features.csv"))

val_lvl_1_size_weeks = 6
val_lvl_2_size_weeks = 3

### prepare

In [13]:
data_train_lvl_1 = data[data['week_no'] < data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)]
data_val_lvl_1 = data[(data['week_no'] >= data['week_no'].max() - (val_lvl_1_size_weeks + val_lvl_2_size_weeks)) &
                      (data['week_no'] < data['week_no'].max() - (val_lvl_2_size_weeks))]

data_train_lvl_2 = data_val_lvl_1.copy() 
data_val_lvl_2 = data[data['week_no'] >= data['week_no'].max() - val_lvl_2_size_weeks]

data_train_lvl_1.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,999999,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,999999,1,0.82,364,0.0,1631,1,0.0,0.0


In [14]:
recommender = MainRecommender(data_train_lvl_1, item_features, user_features)

user_item_matrix


GPU training requires factor size to be a multiple of 32. Increasing factors from 50 to 64.


user_feat
item_feat
ALS model




ALS TF-IDF model




ALS BM25 model
LightFM model
Own


In [15]:
users_lvl_2_pred_als = pd.DataFrame(data_val_lvl_2['user_id'].unique())
users_lvl_2_pred_als.columns = ['user_id']

In [16]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

In [17]:
k_n = 5
num = 40

In [18]:
popularity = data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
item_list = popularity.sort_values('n_sold', ascending=False).head(num+1).item_id.tolist()
item_list = item_list[1:num+1]

In [19]:
user_train_list = data_train_lvl_1["user_id"].unique().tolist()

### define fit function

In [20]:
def get_pred_als(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [21]:
def get_pred_als_tfidf(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_tfidf_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [22]:
def get_pred_als_bm25(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_als_bm25_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [23]:
def get_pred_own(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_own_recommendations(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

In [24]:
def get_pred_similar_items(recommender, user_train_list, item_list, x, num=5):
    res = []
    if x in user_train_list:
        res.extend(recommender.get_similar_items_recommendation(x, rec_num=num))
    else:
        res.extend(item_list)    # res.extend(random.choices(item_list, k=num))
    return res

### fit

In [25]:
users_lvl_2_pred_als['predictions'] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: get_pred_als(recommender, user_train_list, item_list, x, num=num))
    
users_lvl_2_pred_als["prec"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: precision_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als["prec"].mean()

0.20538687561214147

In [26]:
users_lvl_2_pred_als["rec"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: recall_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions"].values[0], 
                              list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                              k=k_n))

users_lvl_2_pred_als["rec"].mean()

0.039768345911801055

In [27]:
users_lvl_2_pred_als['predictions_tfidf'] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: get_pred_als_tfidf(recommender, user_train_list, item_list, x, num=num))
    
users_lvl_2_pred_als["prec_tfidf"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: precision_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions_tfidf"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als["prec_tfidf"].mean()

0.1466209598432889

In [28]:
users_lvl_2_pred_als["rec_tfidf"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: recall_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions_tfidf"].values[0], 
                              list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                              k=k_n))

users_lvl_2_pred_als["rec_tfidf"].mean()

0.030443526365587312

In [29]:
users_lvl_2_pred_als['predictions_bm25'] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: get_pred_als_bm25(recommender, user_train_list, item_list, x, num=num))
    
users_lvl_2_pred_als["prec_bm25"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: precision_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions_bm25"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=k_n))

users_lvl_2_pred_als["prec_bm25"].mean()

0.12291870714985213

In [30]:
users_lvl_2_pred_als["rec_bm25"] = users_lvl_2_pred_als['user_id'].apply(\
        lambda x: recall_at_k(users_lvl_2_pred_als.loc[users_lvl_2_pred_als["user_id"] == x, "predictions_bm25"].values[0], 
                              list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                              k=k_n))

users_lvl_2_pred_als["rec_bm25"].mean()

0.023855369944511227

In [31]:
users_lvl_2_pred_als.head()

Unnamed: 0,user_id,predictions,prec,rec,predictions_tfidf,prec_tfidf,rec_tfidf,predictions_bm25,prec_bm25,rec_bm25
0,338,"[845208, 1037863, 981760, 1026118, 1133018, 11...",0.6,0.042857,"[845208, 1037863, 1026118, 1044078, 923746, 89...",0.4,0.028571,"[819840, 1084036, 1025650, 848071, 871611, 102...",0.0,0.0
1,2120,"[981760, 1106523, 1029743, 1126899, 995242, 11...",0.0,0.0,"[1029743, 995242, 981760, 1127831, 1106523, 11...",0.0,0.0,"[981760, 1005186, 1106523, 995055, 1133018, 11...",0.0,0.0
2,2324,"[995242, 1029743, 5978656, 981760, 916122, 961...",0.4,0.051282,"[5978648, 5978656, 1039156, 1058997, 1053690, ...",0.2,0.025641,"[966684, 878611, 5978656, 907392, 1047923, 967...",0.4,0.051282
3,514,"[878996, 866211, 1127831, 1005186, 1062002, 10...",0.6,0.096774,"[1062002, 878996, 866211, 1024306, 1127831, 11...",0.8,0.129032,"[878996, 923169, 1126786, 6391534, 930118, 990...",0.4,0.064516
4,1762,"[1136257, 1068719, 1084331, 986912, 1005186, 8...",0.0,0.0,"[1110572, 1096036, 863802, 1003188, 899624, 55...",0.2,0.012346,"[12670071, 1091809, 1057365, 1003188, 1017201,...",0.4,0.024691


### Make clasification

In [32]:
users_lvl_2 = pd.DataFrame(data_train_lvl_2['user_id'].unique())
users_lvl_2.columns = ['user_id']

users_lvl_2['candidates'] = users_lvl_2['user_id'].apply(lambda x: get_pred_als(recommender, user_train_list, item_list, x, num = num))

In [33]:
users_lvl_2.head()

Unnamed: 0,user_id,candidates
0,2070,"[1038217, 923746, 1096036, 833025, 908531, 962..."
1,2021,"[951590, 981760, 1044078, 1081177, 844179, 100..."
2,1753,"[1085604, 879755, 986912, 1106523, 1133018, 10..."
3,2120,"[981760, 1106523, 1029743, 1126899, 995242, 11..."
4,1346,"[1127831, 866211, 1058997, 1126899, 878996, 98..."


In [34]:
users_lvl_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2154 entries, 0 to 2153
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   user_id     2154 non-null   int64 
 1   candidates  2154 non-null   object
dtypes: int64(1), object(1)
memory usage: 33.8+ KB


In [35]:
df=pd.DataFrame({'user_id':users_lvl_2.user_id.values.repeat(len(users_lvl_2.candidates[0])),
                 'item_id':np.concatenate(users_lvl_2.candidates.values)})

In [36]:
targets_lvl_2 = data_train_lvl_2[['user_id', 'item_id']].copy()
targets_lvl_2['target'] = 1  # тут только покупки 

targets_lvl_2 = df.merge(targets_lvl_2, on=['user_id', 'item_id'], how='left')

targets_lvl_2['target'].fillna(0, inplace= True)

In [37]:
targets_lvl_2.shape

(93037, 3)

In [38]:
targets_lvl_2 = targets_lvl_2.merge(item_features, on='item_id', how='left') #
targets_lvl_2 = targets_lvl_2.merge(user_features, on='user_id', how='left') #

targets_lvl_2.head(2)

Unnamed: 0,user_id,item_id,target,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,bill_avg,week_last_bill,items_per_week
0,2070,1038217,0.0,69,GROCERY,Private,CRACKERS/MISC BKD FD,SOUP CRACKERS (SALTINE/OYSTER),16 OZ,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815
1,2070,923746,0.0,69,GROCERY,Private,EGGS,EGGS - LARGE,18 CT,45-54,U,50-74K,Unknown,Unknown,1,None/Unknown,2.891039,95.0,29.814815


In [39]:
targets_lvl_2.shape

(93037, 19)

In [40]:
targets_lvl_2.fillna(0, inplace=True)

In [41]:
X_train = targets_lvl_2.drop('target', axis=1)
y_train = targets_lvl_2[['target']]

In [42]:
cat_feats = X_train.columns[2:].tolist()
X_train[cat_feats] = X_train[cat_feats].astype('category')

cat_feats

['manufacturer',
 'department',
 'brand',
 'commodity_desc',
 'sub_commodity_desc',
 'curr_size_of_product',
 'age_desc',
 'marital_status_code',
 'income_desc',
 'homeowner_desc',
 'hh_comp_desc',
 'household_size_desc',
 'kid_category_desc',
 'bill_avg',
 'week_last_bill',
 'items_per_week']

In [43]:
lgb = LGBMClassifier(objective='binary', 
                     n_estimators=1500,
                     learning_rate=0.008,
                     max_depth=7, 
                     verbose=1,
                     categorical_column=cat_feats)

hist = lgb.fit(X_train, y_train, 
               eval_set=[(X_train, y_train)],
               eval_metric=['logloss'],
               early_stopping_rounds=50,
               verbose=0)

train_preds = lgb.predict_proba(X_train)

In [44]:
hist.evals_result_['training']['binary_logloss'][-10:]

[0.36252011476394097,
 0.36249066991089807,
 0.3624724431351163,
 0.3624578008623545,
 0.36241702503599427,
 0.36239596563186965,
 0.362358352802727,
 0.3623320121866377,
 0.362311137709414,
 0.36228058230923027]

In [45]:
classifer_prediction = X_train[['user_id', 'item_id']]
classifer_prediction["pred"] = train_preds[:, 1]
classifer_prediction.tail()

Unnamed: 0,user_id,item_id,pred
93032,1745,845208,0.02633
93033,1745,878996,0.015467
93034,1745,1043590,0.020944
93035,1745,1127831,0.016622
93036,1745,866211,0.022204


In [46]:
res_val = classifer_prediction.loc[classifer_prediction["pred"]>0.5, ["user_id", "item_id"]]
res_val.shape

(8216, 2)

In [47]:
pred_lvl_2 = res_val.groupby('user_id')['item_id'].unique().reset_index()
pred_lvl_2.columns=['user_id', 'pred']
pred_lvl_2.head()

Unnamed: 0,user_id,pred
0,1,"[995242, 1029743, 940947]"
1,7,"[1126899, 1106523, 5591154]"
2,8,[1029743]
3,13,"[1029743, 1126899, 951590, 5978656, 5978648, 9..."
4,17,"[1029743, 995242]"


In [48]:
result_lvl_2 = result_lvl_2.merge(pred_lvl_2, on='user_id', how='left')
result_lvl_2.head()

Unnamed: 0,user_id,actual,pred
0,1,"[999999, 856942, 865456, 907957, 914190, 94331...","[995242, 1029743, 940947]"
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096...",
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8...",
3,7,"[840386, 999999, 898068, 909714, 993838, 10031...","[1126899, 1106523, 5591154]"
4,8,"[835098, 872137, 910439, 924610, 992977, 99999...",[1029743]


In [49]:
user_pred_nan = result_lvl_2.loc[result_lvl_2["pred"] != result_lvl_2["pred"], 'user_id'].values
len(user_pred_nan)

1417

In [50]:
set_tr_1 = set(data_train_lvl_1.user_id.unique())
len(set_tr_1)

2498

In [51]:
user_list = set(user_pred_nan).intersection(set_tr_1)
len(user_list)

1416

In [52]:
result_lvl_2.loc[result_lvl_2["user_id"].isin(user_list), "pred"] =\
result_lvl_2.loc[result_lvl_2["user_id"].isin(user_list), "user_id"].apply(lambda x: recommender.get_als_recommendations(x, rec_num=k_n))
result_lvl_2.head()

Unnamed: 0,user_id,actual,pred
0,1,"[999999, 856942, 865456, 907957, 914190, 94331...","[995242, 1029743, 940947]"
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096...","[1092026, 1053690, 1106523, 5568378, 951590]"
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8...","[854852, 866211, 878996, 1024306, 1127831]"
3,7,"[840386, 999999, 898068, 909714, 993838, 10031...","[1126899, 1106523, 5591154]"
4,8,"[835098, 872137, 910439, 924610, 992977, 99999...",[1029743]


In [53]:
result_lvl_2.loc[result_lvl_2["user_id"] == 1984]

Unnamed: 0,user_id,actual,pred
1622,1984,"[999999, 847344, 859237, 1064380, 1074516, 122...",


In [54]:
popularity = recommender.prep_data.groupby('item_id')['quantity'].sum().reset_index()
popularity.rename(columns={'quantity': 'n_sold'}, inplace=True)
top_5 = popularity.sort_values('n_sold', ascending=False).head(6).item_id.tolist()[1:6]
top_5

[995242, 1029743, 1133018, 981760, 1106523]

In [55]:
result_lvl_2.loc[result_lvl_2["user_id"] == 1984, "pred"] =\
        result_lvl_2.loc[result_lvl_2["user_id"] == 1984, "user_id"].apply(lambda x: top_5)

In [56]:
result_lvl_2.loc[result_lvl_2["user_id"] == 1984]

Unnamed: 0,user_id,actual,pred
1622,1984,"[999999, 847344, 859237, 1064380, 1074516, 122...","[995242, 1029743, 1133018, 981760, 1106523]"


In [57]:
len(result_lvl_2.loc[result_lvl_2["user_id"] == 1984, "pred"].values[0])

5

In [58]:
res = []
res.extend(result_lvl_2.loc[result_lvl_2["user_id"] == 1984, "pred"].values[0])
res

[995242, 1029743, 1133018, 981760, 1106523]

In [59]:
def fill_if_not_5(recommender, x):
    res = []
    res.extend(result_lvl_2.loc[result_lvl_2["user_id"] == x, "pred"].values[0])
    pred_num = len(result_lvl_2.loc[result_lvl_2["user_id"] == x, "pred"].values[0])
    if pred_num < 5:
        res.extend(recommender.get_als_recommendations(x, rec_num=5-pred_num))
    return res

In [60]:
result_lvl_2["pred"] = result_lvl_2["user_id"].apply(lambda x: fill_if_not_5(recommender, x))

In [61]:
result_lvl_2.head()

Unnamed: 0,user_id,actual,pred
0,1,"[999999, 856942, 865456, 907957, 914190, 94331...","[995242, 1029743, 940947, 885290, 995242]"
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096...","[1092026, 1053690, 1106523, 5568378, 951590]"
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8...","[854852, 866211, 878996, 1024306, 1127831]"
3,7,"[840386, 999999, 898068, 909714, 993838, 10031...","[1126899, 1106523, 5591154, 1058997, 1126899]"
4,8,"[835098, 872137, 910439, 924610, 992977, 99999...","[1029743, 981760, 5569230, 862349, 1005186]"


In [62]:
result_lvl_2["rec"] = result_lvl_2['user_id'].apply(\
        lambda x: precision_at_k(result_lvl_2.loc[result_lvl_2["user_id"] == x, "pred"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=5))

result_lvl_2['rec'].mean()

0.2357492654260498

### LightGBM Ranker

In [63]:
X_train.shape

(93037, 18)

In [64]:
X_train['num'] = np.arange(len(X_train))

In [65]:
gr = X_train[['user_id', 'num']].groupby('user_id', sort=False).max()['num'].values

In [66]:
grs = np.concatenate((np.array([gr[0]+1]), gr[1:] - gr[:-1]))

In [67]:
X_train.shape, y_train.shape, np.sum(grs)

((93037, 19), (93037, 1), 93037)

In [68]:
from lightgbm import LGBMRanker

lgb_params = { 
    'objective':'lambdarank',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'learning_rate': 0.08,
    'max_depth': 5,
    'categorical_column': cat_feats,
    'random_state': 27,
    'verbose': 0,
    'is_unbalance': True
}

In [69]:
lgb_rn = LGBMRanker(**lgb_params, silent=False)

eval_h = lgb_rn.fit(X_train, y_train, group=grs,
                    eval_set=[(X_train, y_train)], eval_group=[grs], 
                    eval_metric=['ndcg'],
                    eval_at=[5, 10 ], 
                    early_stopping_rounds=50, 
                    verbose=0)

In [70]:
eval_h.evals_result_['training']['ndcg@5'][:3], eval_h.evals_result_['training']['ndcg@5'][-3:]

([0.5515595441402816, 0.5515595441402816, 0.5582866256058272],
 [0.7530083826580618, 0.7530212156206803, 0.7530212156206803])

In [71]:
rank_preds = lgb_rn.predict(X_train)

In [72]:
ranker_prediction = X_train[['user_id', 'item_id']]
ranker_prediction["pred"] = rank_preds
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
0,2070,1038217,-1.085295
1,2070,923746,-0.042882
2,2070,1096036,-1.094958
3,2070,833025,-0.069632
4,2070,908531,0.129991


In [73]:
ranker_prediction = ranker_prediction.drop_duplicates()

In [74]:
ranker_prediction.sort_values(by="pred", inplace=True, ascending=False)
ranker_prediction.head()

Unnamed: 0,user_id,item_id,pred
2721,957,14019790,2.436825
50684,1187,6979753,2.145829
50898,1914,894439,1.956276
50666,1187,1038462,1.86577
7145,1228,983795,1.84322


In [75]:
ranked_res = ranker_prediction.groupby('user_id')['item_id'].unique().reset_index()   
ranked_res

Unnamed: 0,user_id,item_id
0,1,"[856942, 995242, 1029743, 940947, 5978648, 107..."
1,2,"[1029743, 1106523, 995242, 1133018, 899624, 97..."
2,4,"[883932, 1029743, 1106523, 1126899, 995242, 10..."
3,6,"[1029743, 995242, 900802, 962568, 923746, 9817..."
4,7,"[1126899, 1106523, 5591154, 1122358, 1003188, ..."
...,...,...
2149,2496,"[1106523, 979707, 981760, 1056509, 1133018, 89..."
2150,2497,"[1029743, 995242, 900802, 1040807, 845208, 104..."
2151,2498,"[1070820, 1126899, 1053690, 1092026, 1058997, ..."
2152,2499,"[1070820, 5568378, 5569327, 1106523, 1133018, ..."


In [76]:
result_lvl_2 = data_val_lvl_2.groupby('user_id')['item_id'].unique().reset_index()
result_lvl_2.columns=['user_id', 'actual']

result_lvl_2 = result_lvl_2.merge(ranked_res, on='user_id', how='left')
result_lvl_2

Unnamed: 0,user_id,actual,item_id
0,1,"[999999, 856942, 865456, 907957, 914190, 94331...","[856942, 995242, 1029743, 940947, 5978648, 107..."
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096...",
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8...","[1029743, 995242, 900802, 962568, 923746, 9817..."
3,7,"[840386, 999999, 898068, 909714, 993838, 10031...","[1126899, 1106523, 5591154, 1122358, 1003188, ..."
4,8,"[835098, 872137, 910439, 924610, 992977, 99999...","[1029743, 1044078, 995242, 1005186, 923746, 96..."
...,...,...,...
2037,2496,[999999],"[1106523, 979707, 981760, 1056509, 1133018, 89..."
2038,2497,"[1016709, 9835695, 999999, 845294, 871756, 873...","[1029743, 995242, 900802, 1040807, 845208, 104..."
2039,2498,"[999999, 901776, 914190, 958382, 972437, 10398...","[1070820, 1126899, 1053690, 1092026, 1058997, ..."
2040,2499,"[867188, 877580, 902396, 914190, 951590, 95813...","[1070820, 5568378, 5569327, 1106523, 1133018, ..."


In [77]:
user_pred_nan = result_lvl_2.loc[result_lvl_2["item_id"] != result_lvl_2["item_id"], 'user_id'].values
len(user_pred_nan)

126

In [78]:
set_tr_1 = set(data_train_lvl_1.user_id.unique())
len(set_tr_1)

2498

In [79]:
user_list = set(user_pred_nan).intersection(set_tr_1)
len(user_list)

126

In [80]:
result_lvl_2.loc[result_lvl_2["user_id"].isin(user_list), "item_id"] =\
result_lvl_2.loc[result_lvl_2["user_id"].isin(user_list), "user_id"].apply(lambda x: recommender.get_als_recommendations(x, rec_num=k_n))
result_lvl_2.head()

Unnamed: 0,user_id,actual,item_id
0,1,"[999999, 856942, 865456, 907957, 914190, 94331...","[856942, 995242, 1029743, 940947, 5978648, 107..."
1,3,"[835476, 999999, 920626, 958154, 1053690, 1096...","[1092026, 1053690, 1106523, 5568378, 951590]"
2,6,"[999999, 1006718, 1104227, 1108624, 1110392, 8...","[1029743, 995242, 900802, 962568, 923746, 9817..."
3,7,"[840386, 999999, 898068, 909714, 993838, 10031...","[1126899, 1106523, 5591154, 1122358, 1003188, ..."
4,8,"[835098, 872137, 910439, 924610, 992977, 99999...","[1029743, 1044078, 995242, 1005186, 923746, 96..."


In [81]:
result_lvl_2.loc[result_lvl_2["user_id"] == 1984]

Unnamed: 0,user_id,actual,item_id
1622,1984,"[999999, 847344, 859237, 1064380, 1074516, 122...","[1029743, 1106523, 1126899, 1070820, 995242, 1..."


In [82]:
result_lvl_2["rec"] = result_lvl_2['user_id'].apply(\
        lambda x: precision_at_k(result_lvl_2.loc[result_lvl_2["user_id"] == x, "item_id"].values[0], 
                                 list(result_lvl_2.loc[result_lvl_2["user_id"] == x, "actual"].values[0]), 
                                 k=5))

result_lvl_2['rec'].mean()

0.25680705190988984