# Ranking

In [1]:
%matplotlib inline
from time import time
import numpy as np
import pandas as pd
import math
import gc

In [2]:
def hit_rate_one(ground_truth, recommendation, k):
    """Return 1 if any of the first ``k`` recommendations is relevant, else 0.

    Args:
        ground_truth: iterable of hashable relevant items.
        recommendation: sliceable sequence of hashable recommended items,
            best first.
        k: cut-off; only ``recommendation[:k]`` is inspected.

    Returns:
        int: 1 on at least one hit, 0 otherwise.
    """
    relevant = set(ground_truth)
    top_k = set(recommendation[:k])
    return 0 if relevant.isdisjoint(top_k) else 1

def hit_rate(ground_truth, recommendation, k):
    """Average HitRate@k over all users.

    Args:
        ground_truth: indexable collection; ``ground_truth[i]`` holds the
            relevant items of user ``i``.
        recommendation: indexable collection; ``recommendation[i]`` holds the
            ranked recommendations of user ``i``.
        k: cut-off passed through to ``hit_rate_one``.

    Returns:
        float: share of users with at least one hit in their top ``k``.

    Raises:
        ZeroDivisionError: if ``recommendation`` is empty.
    """
    n_users = len(recommendation)
    hits = 0
    for user_pos in range(n_users):
        hits += hit_rate_one(ground_truth[user_pos], recommendation[user_pos], k)
    return hits / n_users

def get_binary_relevance(ground_truth, recomendation):
    """Map each recommended item to 1 (relevant) or 0 (not relevant).

    Args:
        ground_truth: container supporting ``in``; the relevant items.
        recomendation: indexable sequence of recommended items.

    Returns:
        list[int]: one flag per position of ``recomendation``, in order.
    """
    return [
        1 if recomendation[pos] in ground_truth else 0
        for pos in range(len(recomendation))
    ]

def precision(recommendation, k):
    """Precision@k of a 0/1 relevance list.

    Args:
        recommendation: sequence of relevance flags (1 = hit, 0 = miss).
        k: cut-off; also the denominator, even when fewer than ``k`` flags exist.

    Returns:
        float: number of hits among the first ``k`` flags divided by ``k``.

    Raises:
        ZeroDivisionError: if ``k`` is 0.
    """
    head = recommendation[:k]
    hits = sum(head)
    return hits / k

def AP(ground_truth, recommendation, k):
    """Average precision at ``k`` for a single user.

    The cut-off is clamped to the number of recommendations. Precision is
    accumulated at every relevant rank and normalised by the number of hits
    inside the cut-off (not by the size of ``ground_truth``).

    Args:
        ground_truth: container of relevant items.
        recommendation: ranked sequence of recommended items.
        k: requested cut-off.

    Returns:
        int 0 when there is no hit in the top ``k``, otherwise a float.
    """
    k = min(k, len(recommendation))
    relevance = get_binary_relevance(ground_truth, recommendation)
    n_hits = sum(relevance[:k])
    if n_hits == 0:
        return 0
    total = 0
    for rank in range(1, k + 1):
        # Non-relevant ranks contribute 0 because the flag multiplies the term.
        total += relevance[rank - 1] * precision(relevance, rank)
    return total / n_hits

def MAP(ground_truth, recommendation, k):
    """Mean of ``AP`` at ``k`` over all users.

    Args:
        ground_truth: indexable collection of per-user relevant items.
        recommendation: indexable collection of per-user ranked recommendations.
        k: cut-off passed through to ``AP``.

    Returns:
        float: the mean average precision.

    Raises:
        ZeroDivisionError: if ``recommendation`` is empty.
    """
    n_users = len(recommendation)
    total = 0
    for user_pos in range(n_users):
        total += AP(ground_truth[user_pos], recommendation[user_pos], k)
    return total / n_users

def DCG_one(ground_truth, recommendation, k):
    """Discounted cumulative gain at ``k`` for one user with binary relevance.

    Args:
        ground_truth: container of relevant items.
        recommendation: ranked sequence of recommended items.
        k: requested cut-off, clamped to ``len(recommendation)``.

    Returns:
        Sum over the first ``k`` positions of ``(2**rel - 1) / log2(pos + 2)``,
        with ``pos`` counted from 0.
    """
    relevance = get_binary_relevance(ground_truth, recommendation)
    cutoff = min(k, len(recommendation))
    dcg = 0
    for pos in range(cutoff):
        gain = 2 ** relevance[pos] - 1
        dcg += gain / math.log2(pos + 2)
    return dcg

def best_DCG_one(ground_truth, recommendation, k):
    """Ideal DCG at ``k``: all relevant items ranked first.

    Args:
        ground_truth: sized collection of relevant items; only its length is used.
        recommendation: unused; kept for a signature parallel to ``DCG_one``.
        k: cut-off.

    Returns:
        Sum of ``1 / log2(pos + 2)`` over the first ``min(k, len(ground_truth))``
        positions; the remaining positions up to ``k`` each add ``0.0``.
    """
    n_relevant = len(ground_truth)
    return sum(
        (1 if pos < n_relevant else 0) / math.log2(pos + 2)
        for pos in range(k)
    )

def NDCG_one(ground_truth, recommendation, k):
    """Normalised DCG at ``k`` for a single user.

    Args:
        ground_truth: sized container of relevant items.
        recommendation: ranked sequence of recommended items.
        k: cut-off.

    Returns:
        ``DCG_one / best_DCG_one``. When the ideal DCG is zero (no relevant
        items at all, or ``k`` is 0) the score is defined as ``0.0`` instead of
        raising ``ZeroDivisionError``.
    """
    ideal = best_DCG_one(ground_truth, recommendation, k)
    if ideal == 0:
        # Nothing could have been ranked correctly, so no gain is achievable.
        return 0.0
    return DCG_one(ground_truth, recommendation, k) / ideal

def NDCG(ground_truth, recommendation, k):
    """Mean of ``NDCG_one`` at ``k`` over all users.

    Args:
        ground_truth: indexable collection of per-user relevant items.
        recommendation: indexable collection of per-user ranked recommendations.
        k: cut-off passed through to ``NDCG_one``.

    Returns:
        float: the mean NDCG.

    Raises:
        ZeroDivisionError: if ``recommendation`` is empty.
    """
    n_users = len(recommendation)
    total = 0
    for user_pos in range(n_users):
        total += NDCG_one(ground_truth[user_pos], recommendation[user_pos], k)
    return total / n_users

def print_metrics(ground_truth, recommendation, k):
    """Print HitRate@k, MAP@k and NDCG@k as a two-column table.

    Args:
        ground_truth: indexable collection of per-user relevant items.
        recommendation: indexable collection of per-user ranked recommendations.
        k: cut-off shared by the three metrics.

    Returns:
        None. The table is written to stdout.
    """
    names = ['HitRate@k', 'MAP@k', 'NDCG@k']
    scorers = (hit_rate, MAP, NDCG)
    values = [scorer(ground_truth, recommendation, k) for scorer in scorers]
    df = pd.DataFrame({'metric': names, 'value': values})
    print(df)

## Read data

In [3]:
# Load the training frame from a local pickle. Only unpickle files you produced
# yourself: loading a pickle can execute arbitrary code.
query_train = pd.read_pickle('data/df_train.pkl')

In [4]:
# Preview the first rows to check the column layout.
query_train.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,9,50,0.18,196:0.18,196.0,-0.002918,0.001673,-0.00801,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
1,1,10258,8,50,0.16,10258:0.16,10258.0,0.007623,-0.000338,-0.002633,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
2,1,10326,1,50,0.02,10326:0.02,10326.0,0.004164,-0.003791,-0.003216,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,0.0
3,1,12427,9,50,0.18,12427:0.18,12427.0,7.6e-05,0.002956,-0.009676,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
4,1,13032,2,50,0.04,13032:0.04,13032.0,0.005587,0.005941,-0.006445,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0


In [5]:
# (rows, columns) of the training frame.
query_train.shape

(12084910, 136)

In [6]:
# Frame that is scored and evaluated later in the notebook. Note that it is read
# from 'df_train_val.pkl' despite the `query_test` name.
query_test = pd.read_pickle('data/df_train_val.pkl')

In [7]:
# Preview the first rows of the evaluation frame.
query_test.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0


In [8]:
# (rows, columns) of the evaluation frame.
query_test.shape

(13057263, 136)

In [9]:
# Load the per-user lists of products used as ground truth.
df_test_reordered_pivot = pd.read_pickle('data/df_test_reordered_pivot.pkl')
# Rename 'product_id' to 'true_label', the column name passed to print_metrics later.
df_test_reordered_pivot = df_test_reordered_pivot.rename({'product_id':'true_label'}, axis=1)
df_test_reordered_pivot.head()

Unnamed: 0,user_id,true_label
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,3,"[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[26576, 25623, 21573]"
4,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."


## LambdaRankNN

In [10]:
from LambdaRankNN import LambdaRankNN # https://pypi.org/project/LambdaRankNN/

### Train in batches of 20,000 user IDs

In [11]:
# One ranker instance shared by every batch below. input_size=128 matches the 128
# feature columns selected with iloc[:, 7:135]; hidden sizes and activations are
# given per layer (16 and 8, both 'relu').
ranker = LambdaRankNN(input_size=128, hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam')

### #1

In [12]:
# Batch 1: rows of users with user_id below 20000.
query_train_lrnn_1 = query_train[query_train['user_id'] < 20000]
query_train_lrnn_1.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,9,50,0.18,196:0.18,196.0,-0.002918,0.001673,-0.00801,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
1,1,10258,8,50,0.16,10258:0.16,10258.0,0.007623,-0.000338,-0.002633,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
2,1,10326,1,50,0.02,10326:0.02,10326.0,0.004164,-0.003791,-0.003216,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,0.0
3,1,12427,9,50,0.18,12427:0.18,12427.0,7.6e-05,0.002956,-0.009676,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
4,1,13032,2,50,0.04,13032:0.04,13032.0,0.005587,0.005941,-0.006445,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0


In [13]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_1 = np.array(query_train_lrnn_1.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_1 = np.array(query_train_lrnn_1.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_1 = np.array(query_train_lrnn_1.loc[:, 'user_id'].values)

In [14]:
%%time
# Train for 5 epochs on batch 1 with the shared `ranker`.
# NOTE(review): batch-wise training presumably relies on fit() resuming from the
# current weights rather than re-initialising them — confirm against LambdaRankNN.
ranker.fit(X_train_lrnn_1, y_train_lrnn_1, qid_train_lrnn_1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5439999448568997
Wall time: 31min 49s


In [15]:
# Drop batch 1's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_1
del X_train_lrnn_1
del y_train_lrnn_1
del qid_train_lrnn_1
gc.collect()

2670

### #2

In [16]:
# Batch 2: rows of users with 20000 <= user_id < 40000.
query_train_lrnn_2 = query_train[(query_train['user_id'] >= 20000) & (query_train['user_id'] < 40000)]
query_train_lrnn_2.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
37295,20057,196,33,426,0.077465,196:0.07746478873239436,196.0,-0.00291835,0.00167324,-0.00800956,...,-0.003194,0.000848,-0.003545,-0.000321,-0.002445,0.001645,0.004606,0.000802,0.000939,1.0
37296,20057,13176,1,426,0.002347,13176:0.002347417840375587,13176.0,8.8323e-08,-1.64463e-07,-3.61833e-07,...,-0.003194,0.000848,-0.003545,-0.000321,-0.002445,0.001645,0.004606,0.000802,0.000939,0.0
37297,20057,17122,21,426,0.049296,17122:0.04929577464788732,17122.0,-0.00308709,0.00402263,-0.00632875,...,-0.003194,0.000848,-0.003545,-0.000321,-0.002445,0.001645,0.004606,0.000802,0.000939,0.0
37298,20057,49235,1,426,0.002347,49235:0.002347417840375587,49235.0,-0.0226245,-0.0191872,0.0472085,...,-0.003194,0.000848,-0.003545,-0.000321,-0.002445,0.001645,0.004606,0.000802,0.000939,0.0
37299,20057,42265,4,426,0.00939,42265:0.009389671361502348,42265.0,-0.00445524,0.00164461,0.000774248,...,-0.003194,0.000848,-0.003545,-0.000321,-0.002445,0.001645,0.004606,0.000802,0.000939,0.0


In [17]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_2 = np.array(query_train_lrnn_2.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_2 = np.array(query_train_lrnn_2.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_2 = np.array(query_train_lrnn_2.loc[:, 'user_id'].values)

In [18]:
%%time
# Train for 5 epochs on batch 2 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_2, y_train_lrnn_2, qid_train_lrnn_2, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5472211484461429
Wall time: 33min 18s


In [19]:
# Drop batch 2's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_2
del X_train_lrnn_2
del y_train_lrnn_2
del qid_train_lrnn_2
gc.collect()

2660

### #3

In [20]:
# Batch 3: rows of users with 40000 <= user_id < 60000.
query_train_lrnn_3 = query_train[(query_train['user_id'] >= 40000) & (query_train['user_id'] < 60000)]
query_train_lrnn_3.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
73654,40039,196,1,16,0.0625,196:0.0625,196.0,-0.002918,0.001673,-0.00801,...,-0.004978,0.009756,0.014309,0.000228,0.008424,-0.020917,0.004051,0.007186,0.012614,0.0
73655,40039,21150,1,16,0.0625,21150:0.0625,21150.0,0.003102,-0.003429,-0.00457,...,-0.004978,0.009756,0.014309,0.000228,0.008424,-0.020917,0.004051,0.007186,0.012614,0.0
73656,40039,1511,1,16,0.0625,1511:0.0625,1511.0,0.005748,0.001878,-0.014344,...,-0.004978,0.009756,0.014309,0.000228,0.008424,-0.020917,0.004051,0.007186,0.012614,0.0
73657,40039,4660,1,16,0.0625,4660:0.0625,4660.0,0.027959,-0.001276,-0.068719,...,-0.004978,0.009756,0.014309,0.000228,0.008424,-0.020917,0.004051,0.007186,0.012614,0.0
73658,40039,39177,1,16,0.0625,39177:0.0625,39177.0,0.020762,-0.000871,0.089201,...,-0.004978,0.009756,0.014309,0.000228,0.008424,-0.020917,0.004051,0.007186,0.012614,1.0


In [21]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_3 = np.array(query_train_lrnn_3.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_3 = np.array(query_train_lrnn_3.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_3 = np.array(query_train_lrnn_3.loc[:, 'user_id'].values)

In [22]:
%%time
# Train for 5 epochs on batch 3 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_3, y_train_lrnn_3, qid_train_lrnn_3, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5449936745373245
Wall time: 30min 25s


In [23]:
# Drop batch 3's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_3
del X_train_lrnn_3
del y_train_lrnn_3
del qid_train_lrnn_3
gc.collect()

2660

In [37]:
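# Attempt to persist the trained ranker with pickle. Kept commented out because it
# failed with "TypeError: cannot pickle 'weakref' object"; a retry would also need
# the file opened in binary mode ('wb').
# import pickle
# pickle.dump(ranker, open('ranker_3.pkl', 'w'))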

TypeError: cannot pickle 'weakref' object

In [29]:
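# Attempt to export the underlying model to PMML. Kept commented out because
# rankerNN2pmml rejected it with "TypeError: Provided model is not supported."
# from rankerNN2pmml import rankerNN2pmml
# params = {
#     'feature_names': list(query_train.columns)[7:135],
#     'target_name': 'reordered'
# }

# rankerNN2pmml(estimator=ranker.model, file='Model_example.xml', **params)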

TypeError: Provided model is not supported.

### #4

In [24]:
# Batch 4: rows of users with 60000 <= user_id < 80000.
query_train_lrnn_4 = query_train[(query_train['user_id'] >= 60000) & (query_train['user_id'] < 80000)]
query_train_lrnn_4.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
114081,60010,196,1,27,0.037037,196:0.037037037037037035,196.0,-0.002918,0.001673,-0.00801,...,-0.002441,0.0042,0.007815,-0.004061,0.010253,0.002459,0.001121,0.007863,0.007912,0.0
114082,60010,5782,1,27,0.037037,5782:0.037037037037037035,5782.0,0.003811,0.000288,-0.010749,...,-0.002441,0.0042,0.007815,-0.004061,0.010253,0.002459,0.001121,0.007863,0.007912,0.0
114083,60010,19468,1,27,0.037037,19468:0.037037037037037035,19468.0,-0.005127,-0.001757,0.00988,...,-0.002441,0.0042,0.007815,-0.004061,0.010253,0.002459,0.001121,0.007863,0.007912,0.0
114084,60010,44234,1,27,0.037037,44234:0.037037037037037035,44234.0,-0.014767,-0.008047,0.022281,...,-0.002441,0.0042,0.007815,-0.004061,0.010253,0.002459,0.001121,0.007863,0.007912,0.0
114085,60010,30696,1,27,0.037037,30696:0.037037037037037035,30696.0,0.004183,-0.009553,0.013456,...,-0.002441,0.0042,0.007815,-0.004061,0.010253,0.002459,0.001121,0.007863,0.007912,0.0


In [25]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_4 = np.array(query_train_lrnn_4.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_4 = np.array(query_train_lrnn_4.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_4 = np.array(query_train_lrnn_4.loc[:, 'user_id'].values)

In [26]:
%%time
# Train for 5 epochs on batch 4 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_4, y_train_lrnn_4, qid_train_lrnn_4, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5449990678247852
Wall time: 31min 53s


In [27]:
# Drop batch 4's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_4
del X_train_lrnn_4
del y_train_lrnn_4
del qid_train_lrnn_4
gc.collect()

2660

### #5

In [28]:
# Batch 5: rows of users with 80000 <= user_id < 100000.
query_train_lrnn_5 = query_train[(query_train['user_id'] >= 80000) & (query_train['user_id'] < 100000)]
query_train_lrnn_5.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
150012,80002,196,2,58,0.034483,196:0.034482758620689655,196.0,-0.00291835,0.00167324,-0.00800956,...,0.000386,-0.000346,0.015087,-0.004149,0.007575,-0.012935,0.003442,0.008489,0.005086,0.0
150013,80002,13176,1,58,0.017241,13176:0.017241379310344827,13176.0,8.8323e-08,-1.64463e-07,-3.61833e-07,...,0.000386,-0.000346,0.015087,-0.004149,0.007575,-0.012935,0.003442,0.008489,0.005086,0.0
150014,80002,27966,1,58,0.017241,27966:0.017241379310344827,27966.0,-0.0015504,-0.000451568,0.00716011,...,0.000386,-0.000346,0.015087,-0.004149,0.007575,-0.012935,0.003442,0.008489,0.005086,0.0
150015,80002,45664,1,58,0.017241,45664:0.017241379310344827,45664.0,-0.0213048,0.0147644,0.0169766,...,0.000386,-0.000346,0.015087,-0.004149,0.007575,-0.012935,0.003442,0.008489,0.005086,0.0
150016,80002,14146,2,58,0.034483,14146:0.034482758620689655,14146.0,-0.00272537,-0.0114391,-0.0106444,...,0.000386,-0.000346,0.015087,-0.004149,0.007575,-0.012935,0.003442,0.008489,0.005086,0.0


In [29]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_5 = np.array(query_train_lrnn_5.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_5 = np.array(query_train_lrnn_5.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_5 = np.array(query_train_lrnn_5.loc[:, 'user_id'].values)

In [30]:
%%time
# Train for 5 epochs on batch 5 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_5, y_train_lrnn_5, qid_train_lrnn_5, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5442092044362319
Wall time: 29min 36s


In [31]:
# Drop batch 5's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_5
del X_train_lrnn_5
del y_train_lrnn_5
del qid_train_lrnn_5
gc.collect()

2660

### #6

In [32]:
# Batch 6: rows of users with 100000 <= user_id < 120000.
query_train_lrnn_6 = query_train[(query_train['user_id'] >= 100000) & (query_train['user_id'] < 120000)]
query_train_lrnn_6.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
188586,100002,196,1,35,0.028571,196:0.02857142857142857,196.0,-0.002918,0.001673,-0.00801,...,-0.002616,0.015334,0.010004,-0.01032,0.009768,-0.00452,-0.00351,0.012061,-0.001351,0.0
188587,100002,42450,2,35,0.057143,42450:0.05714285714285714,42450.0,-0.012292,0.015248,0.004069,...,-0.002616,0.015334,0.010004,-0.01032,0.009768,-0.00452,-0.00351,0.012061,-0.001351,1.0
188588,100002,41351,1,35,0.028571,41351:0.02857142857142857,41351.0,-0.02033,-0.020794,0.04765,...,-0.002616,0.015334,0.010004,-0.01032,0.009768,-0.00452,-0.00351,0.012061,-0.001351,0.0
188589,100002,38049,3,35,0.085714,38049:0.08571428571428572,38049.0,0.004313,-0.010814,0.046671,...,-0.002616,0.015334,0.010004,-0.01032,0.009768,-0.00452,-0.00351,0.012061,-0.001351,1.0
188590,100002,26172,9,35,0.257143,26172:0.2571428571428571,26172.0,-0.012826,-0.03508,0.014069,...,-0.002616,0.015334,0.010004,-0.01032,0.009768,-0.00452,-0.00351,0.012061,-0.001351,1.0


In [33]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_6 = np.array(query_train_lrnn_6.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_6 = np.array(query_train_lrnn_6.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_6 = np.array(query_train_lrnn_6.loc[:, 'user_id'].values)

In [34]:
%%time
# Train for 5 epochs on batch 6 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_6, y_train_lrnn_6, qid_train_lrnn_6, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5423239151172106
Wall time: 30min 27s


In [35]:
# Drop batch 6's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_6
del X_train_lrnn_6
del y_train_lrnn_6
del qid_train_lrnn_6
gc.collect()

2660

### #7

In [36]:
# Batch 7: rows of users with 120000 <= user_id < 140000.
query_train_lrnn_7 = query_train[(query_train['user_id'] >= 120000) & (query_train['user_id'] < 140000)]
query_train_lrnn_7.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
229051,120020,196,2,98,0.020408,196:0.02040816326530612,196.0,-0.002918,0.001673,-0.00801,...,-0.001647,-0.006543,0.008296,0.002089,0.004976,-0.004198,0.008767,0.007795,0.005214,0.0
229052,120020,49235,2,98,0.020408,49235:0.02040816326530612,49235.0,-0.022624,-0.019187,0.047209,...,-0.001647,-0.006543,0.008296,0.002089,0.004976,-0.004198,0.008767,0.007795,0.005214,0.0
229053,120020,1559,1,98,0.010204,1559:0.01020408163265306,1559.0,0.006028,-0.005067,-0.010621,...,-0.001647,-0.006543,0.008296,0.002089,0.004976,-0.004198,0.008767,0.007795,0.005214,1.0
229054,120020,21709,1,98,0.010204,21709:0.01020408163265306,21709.0,-0.003349,-0.004136,0.005443,...,-0.001647,-0.006543,0.008296,0.002089,0.004976,-0.004198,0.008767,0.007795,0.005214,1.0
229055,120020,24954,1,98,0.010204,24954:0.01020408163265306,24954.0,-0.00778,-0.002499,0.014989,...,-0.001647,-0.006543,0.008296,0.002089,0.004976,-0.004198,0.008767,0.007795,0.005214,0.0


In [37]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_7 = np.array(query_train_lrnn_7.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_7 = np.array(query_train_lrnn_7.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_7 = np.array(query_train_lrnn_7.loc[:, 'user_id'].values)

In [38]:
%%time
# Train for 5 epochs on batch 7 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_7, y_train_lrnn_7, qid_train_lrnn_7, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5468817128324764
Wall time: 30min 15s


In [39]:
# Drop batch 7's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_7
del X_train_lrnn_7
del y_train_lrnn_7
del qid_train_lrnn_7
gc.collect()

2660

### #8

In [40]:
# Batch 8: rows of users with 140000 <= user_id < 160000.
query_train_lrnn_8 = query_train[(query_train['user_id'] >= 140000) & (query_train['user_id'] < 160000)]
query_train_lrnn_8.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
266189,140024,196,2,81,0.024691,196:0.024691358024691357,196.0,-0.002918,0.001673,-0.00801,...,6.9e-05,0.001226,0.004509,-0.008941,-0.00118,-0.000932,0.006444,0.003015,0.000556,0.0
266190,140024,49235,20,81,0.246914,49235:0.24691358024691357,49235.0,-0.022624,-0.019187,0.047209,...,6.9e-05,0.001226,0.004509,-0.008941,-0.00118,-0.000932,0.006444,0.003015,0.000556,0.0
266191,140024,32478,11,81,0.135802,32478:0.13580246913580246,32478.0,0.002389,-0.001738,0.011383,...,6.9e-05,0.001226,0.004509,-0.008941,-0.00118,-0.000932,0.006444,0.003015,0.000556,1.0
266192,140024,1940,4,81,0.049383,1940:0.04938271604938271,1940.0,0.000798,-0.001186,-0.002575,...,6.9e-05,0.001226,0.004509,-0.008941,-0.00118,-0.000932,0.006444,0.003015,0.000556,0.0
266193,140024,32691,9,81,0.111111,32691:0.1111111111111111,32691.0,0.003745,0.00506,0.015559,...,6.9e-05,0.001226,0.004509,-0.008941,-0.00118,-0.000932,0.006444,0.003015,0.000556,0.0


In [41]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_8 = np.array(query_train_lrnn_8.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_8 = np.array(query_train_lrnn_8.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_8 = np.array(query_train_lrnn_8.loc[:, 'user_id'].values)

In [42]:
%%time
# Train for 5 epochs on batch 8 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_8, y_train_lrnn_8, qid_train_lrnn_8, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5417676489076133
Wall time: 29min 49s


In [43]:
# Drop batch 8's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_8
del X_train_lrnn_8
del y_train_lrnn_8
del qid_train_lrnn_8
gc.collect()

2660

### #9

In [44]:
# Batch 9: rows of users with 160000 <= user_id < 180000.
query_train_lrnn_9 = query_train[(query_train['user_id'] >= 160000) & (query_train['user_id'] < 180000)]
query_train_lrnn_9.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
304542,160060,196,8,436,0.018349,196:0.01834862385321101,196.0,-0.002918,0.001673,-0.00801,...,-0.00053,0.000487,0.006548,-0.000229,0.004948,-0.006907,-0.002909,0.007676,0.000387,1.0
304543,160060,16797,10,436,0.022936,16797:0.022935779816513763,16797.0,0.000135,0.000303,-0.000265,...,-0.00053,0.000487,0.006548,-0.000229,0.004948,-0.006907,-0.002909,0.007676,0.000387,1.0
304544,160060,24852,15,436,0.034404,24852:0.034403669724770644,24852.0,-0.002117,-0.001905,0.001426,...,-0.00053,0.000487,0.006548,-0.000229,0.004948,-0.006907,-0.002909,0.007676,0.000387,0.0
304545,160060,47526,1,436,0.002294,47526:0.0022935779816513763,47526.0,0.000451,0.020605,0.016447,...,-0.00053,0.000487,0.006548,-0.000229,0.004948,-0.006907,-0.002909,0.007676,0.000387,0.0
304546,160060,14992,5,436,0.011468,14992:0.011467889908256881,14992.0,0.005333,0.001499,-0.00483,...,-0.00053,0.000487,0.006548,-0.000229,0.004948,-0.006907,-0.002909,0.007676,0.000387,0.0


In [45]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_9 = np.array(query_train_lrnn_9.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_9 = np.array(query_train_lrnn_9.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_9 = np.array(query_train_lrnn_9.loc[:, 'user_id'].values)

In [46]:
%%time
# Train for 5 epochs on batch 9 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_9, y_train_lrnn_9, qid_train_lrnn_9, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5491575671456033
Wall time: 30min 34s


In [47]:
# Drop batch 9's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_9
del X_train_lrnn_9
del y_train_lrnn_9
del qid_train_lrnn_9
gc.collect()

2660

### #10

In [48]:
# Batch 10: rows of users with 180000 <= user_id < 200000.
query_train_lrnn_10 = query_train[(query_train['user_id'] >= 180000) & (query_train['user_id'] < 200000)]
query_train_lrnn_10.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
339055,180010,196,1,34,0.029412,196:0.029411764705882353,196.0,-0.002918,0.001673,-0.00801,...,-0.011921,0.005411,0.011084,-0.00341,0.010523,0.001448,-0.006525,0.011287,-0.001111,0.0
339056,180010,9387,1,34,0.029412,9387:0.029411764705882353,9387.0,0.006627,-0.000578,-0.00323,...,-0.011921,0.005411,0.011084,-0.00341,0.010523,0.001448,-0.006525,0.011287,-0.001111,0.0
339057,180010,47788,1,34,0.029412,47788:0.029411764705882353,47788.0,-0.028459,0.003021,0.051305,...,-0.011921,0.005411,0.011084,-0.00341,0.010523,0.001448,-0.006525,0.011287,-0.001111,0.0
339058,180010,29791,1,34,0.029412,29791:0.029411764705882353,29791.0,0.003771,-0.03006,0.023516,...,-0.011921,0.005411,0.011084,-0.00341,0.010523,0.001448,-0.006525,0.011287,-0.001111,0.0
339059,180010,16464,1,34,0.029412,16464:0.029411764705882353,16464.0,0.009046,-0.005595,-0.032329,...,-0.011921,0.005411,0.011084,-0.00341,0.010523,0.001448,-0.006525,0.011287,-0.001111,0.0


In [49]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_10 = np.array(query_train_lrnn_10.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_10 = np.array(query_train_lrnn_10.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_10 = np.array(query_train_lrnn_10.loc[:, 'user_id'].values)

In [50]:
%%time
# Train for 5 epochs on batch 10 with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_10, y_train_lrnn_10, qid_train_lrnn_10, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.547122833208431
Wall time: 30min 4s


In [51]:
# Drop batch 10's frame and arrays before the next slice is built, then run a
# collection pass; the cell's value is gc.collect()'s return value.
del query_train_lrnn_10
del X_train_lrnn_10
del y_train_lrnn_10
del qid_train_lrnn_10
gc.collect()

20

### #11

In [52]:
# Batch 11 (last): every remaining row, i.e. users with user_id >= 200000.
query_train_lrnn_11 = query_train[query_train['user_id'] >= 200000]
query_train_lrnn_11.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
376107,200005,196,4,109,0.036697,196:0.03669724770642202,196.0,-0.00291835,0.00167324,-0.00800956,...,-0.011604,0.005131,0.001757,-0.001579,-0.006589,0.0002,0.00214,0.003478,0.005386,0.0
376108,200005,13176,2,109,0.018349,13176:0.01834862385321101,13176.0,8.8323e-08,-1.64463e-07,-3.61833e-07,...,-0.011604,0.005131,0.001757,-0.001579,-0.006589,0.0002,0.00214,0.003478,0.005386,0.0
376109,200005,41787,1,109,0.009174,41787:0.009174311926605505,41787.0,-0.0061618,0.00407993,0.0112879,...,-0.011604,0.005131,0.001757,-0.001579,-0.006589,0.0002,0.00214,0.003478,0.005386,0.0
376110,200005,9387,1,109,0.009174,9387:0.009174311926605505,9387.0,0.0066267,-0.000577813,-0.00322966,...,-0.011604,0.005131,0.001757,-0.001579,-0.006589,0.0002,0.00214,0.003478,0.005386,0.0
376111,200005,39275,3,109,0.027523,39275:0.027522935779816515,39275.0,-0.00168595,0.00103109,0.00515084,...,-0.011604,0.005131,0.001757,-0.001579,-0.006589,0.0002,0.00214,0.003478,0.005386,0.0


In [53]:
# Features: the 128 columns at positions 7..134 (the ranker was built with input_size=128).
X_train_lrnn_11 = np.array(query_train_lrnn_11.iloc[:, 7:135].values)
# Labels: the 'reordered' column cast to int64.
y_train_lrnn_11 = np.array(query_train_lrnn_11.loc[:, 'reordered'].values, dtype=np.int64)
# Query ids: user_id, passed to fit() as the third argument.
qid_train_lrnn_11 = np.array(query_train_lrnn_11.loc[:, 'user_id'].values)

In [54]:
%%time
# Train for 5 epochs on the final batch with the shared `ranker`.
# NOTE(review): presumably continues from the weights left by the previous batch — confirm.
ranker.fit(X_train_lrnn_11, y_train_lrnn_11, qid_train_lrnn_11, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5436368936624765
Wall time: 8min 19s


In [61]:
# Drop the final batch's frame and arrays, then run a collection pass;
# the cell's value is gc.collect()'s return value.
del query_train_lrnn_11
del X_train_lrnn_11
del y_train_lrnn_11
del qid_train_lrnn_11
gc.collect()

In [63]:
# Training is done: drop the full training frame before scoring.
# NOTE(review): the "Train all" and "Train 1000 users" cells further down still
# reference query_train, so a top-to-bottom run raises NameError there unless the
# frame is reloaded first.
del query_train
gc.collect()

279

### Test

In [109]:
# Extra collection pass before the prediction input is materialised.
gc.collect()

20

In [114]:
# X_test_lrnn = np.array(query_test.iloc[:, 7:135])

In [83]:
# Score every row of query_test. The feature array (same 128 columns, positions
# 7..134, as in training) is built inline instead of being kept in a named variable.
# y_test_lrnn_pred = ranker.predict(X_test_lrnn)
y_test_lrnn_pred = ranker.predict(np.array(query_test.iloc[:, 7:135]))
y_test_lrnn_pred

array([ 6.3035746,  1.7198427,  3.723648 , ...,  1.2115352, -3.202033 ,
       -1.3889401], dtype=float32)

In [95]:
# Attach the predicted scores as a new column. This mutates query_test in place,
# so the frame now has one more column than when it was loaded.
query_test['LambdaRankNN_weights'] = y_test_lrnn_pred
query_test.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LambdaRankNN_weights
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,6.303575
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,1.719843
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,3.723648
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,4.016441
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-0.945061


In [96]:
# Checkpoint the scored frame so the predictions can be reloaded without the model.
query_test.to_pickle('data/query_test.pkl')

In [106]:
# Order rows by user_id and, within each user, by predicted score (both
# descending), then renumber the index from 0.
query_test = (
    query_test
    .sort_values(by=['user_id', 'LambdaRankNN_weights'], ascending=False)
    .reset_index(drop=True)
)
query_test.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LambdaRankNN_weights
0,206209,6825,1,129,0.007752,6825:0.007751937984496124,6825.0,-0.015088,0.026871,-0.0209,...,-0.002122,0.00271,-0.002614,-0.00113,-0.00546,0.004608,0.003469,-0.000821,0.0,4.969576
1,206209,6567,1,129,0.007752,6567:0.007751937984496124,6567.0,0.004388,0.016149,-0.001198,...,-0.002122,0.00271,-0.002614,-0.00113,-0.00546,0.004608,0.003469,-0.000821,0.0,4.432597
2,206209,26634,1,129,0.007752,26634:0.007751937984496124,26634.0,-0.013107,-0.010004,0.010183,...,-0.002122,0.00271,-0.002614,-0.00113,-0.00546,0.004608,0.003469,-0.000821,0.0,2.191908
3,206209,1979,1,129,0.007752,1979:0.007751937984496124,1979.0,-0.017001,0.012701,-0.003068,...,-0.002122,0.00271,-0.002614,-0.00113,-0.00546,0.004608,0.003469,-0.000821,0.0,2.003428
4,206209,6846,10,129,0.077519,6846:0.07751937984496124,6846.0,0.000177,0.019345,-0.021264,...,-0.002122,0.00271,-0.002614,-0.00113,-0.00546,0.004608,0.003469,-0.000821,1.0,0.861858


In [110]:
# Collapse to one row per user holding the first 10 product_ids of that user.
# NOTE(review): this is the top-10 by score only if each group reaches the lambda
# in the row order produced by the preceding sort — confirm for pivot_table.
query_test_pivot = query_test.pivot_table(index='user_id', values=['product_id'], 
                                                            aggfunc={'product_id': lambda x: list(x)[:10]}).reset_index()
# Rename to the column name used when computing the metrics.
query_test_pivot = query_test_pivot.rename({'product_id':'LambdaRankNN_recommend'}, axis=1)
query_test_pivot.head()

Unnamed: 0,user_id,LambdaRankNN_recommend
0,1,"[196, 12427, 10326, 17122, 10258, 13176, 35951..."
1,2,"[13742, 1559, 5322, 4957, 21709, 35917, 5450, ..."
2,3,"[1819, 1005, 9387, 39190, 16797, 248, 15143, 3..."
3,4,"[2707, 36606, 7350, 42329, 35469, 43704, 1200,..."
4,5,"[31717, 6808, 13988, 3376, 13870, 22475, 26604..."


In [111]:
# Attach the ground-truth list of each user to their recommendations.
# NOTE(review): with how='left', a user missing from df_test_reordered_pivot gets
# NaN in 'true_label', and the metric helpers (set(...), len(...)) would then raise
# TypeError — confirm every user has a ground-truth row or filter such users out.
query_test_pivot = query_test_pivot.merge(df_test_reordered_pivot, on='user_id', how='left')
query_test_pivot.head()

Unnamed: 0,user_id,LambdaRankNN_recommend,true_label
0,1,"[196, 12427, 10326, 17122, 10258, 13176, 35951...","[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[13742, 1559, 5322, 4957, 21709, 35917, 5450, ...","[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,3,"[1819, 1005, 9387, 39190, 16797, 248, 15143, 3...","[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[2707, 36606, 7350, 42329, 35469, 43704, 1200,...","[26576, 25623, 21573]"
4,5,"[31717, 6808, 13988, 3376, 13870, 22475, 26604...","[15349, 19057, 16185, 21413, 20843, 20114, 482..."


In [112]:
# Persist recommendations together with the ground truth for later comparison.
query_test_pivot.to_pickle('data/query_test_pivot_LambdaRankNN.pkl')

In [113]:
# HitRate@10, MAP@10 and NDCG@10 of the LambdaRankNN recommendations.
print_metrics(query_test_pivot['true_label'], query_test_pivot['LambdaRankNN_recommend'], 10)

      metric     value
0  HitRate@k  0.653410
1      MAP@k  0.247062
2     NDCG@k  0.174280


### Train all (MemoryError)

In [7]:
# Show the 128 feature columns (positions 7..134) for the first five rows.
# NOTE(review): needs query_train, which the batch section above deletes.
query_train.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,-0.002918,0.001673,-0.00801,-0.001002,-0.000706,0.000638,0.001654,0.003558,-0.002566,0.000831,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
1,0.007623,-0.000338,-0.002633,-0.011347,-0.005478,0.007257,0.00176,0.015209,-0.004028,0.01014,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
2,0.004164,-0.003791,-0.003216,-0.002906,-0.00572,0.000338,0.001366,-0.001528,0.002001,0.002501,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
3,7.6e-05,0.002956,-0.009676,-0.003613,-0.007548,-0.003277,0.000586,0.005006,0.000649,0.011093,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
4,0.005587,0.005941,-0.006445,-0.010694,-0.00851,-0.000476,-0.000557,0.005673,7.9e-05,0.013272,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745


In [8]:
# Whole-dataset variant: features (still a DataFrame here), labels and query ids
# taken from the full training frame.
X_train_lrnn = query_train.iloc[:, 7:135]
y_train_lrnn = query_train.loc[:, 'reordered'].values
qid_train_lrnn = query_train.loc[:, 'user_id'].values
qid_train_lrnn

array([     1,      1,      1, ..., 196943, 201262,  66343], dtype=int64)

In [9]:
# Convert to NumPy arrays. X_train_lrnn is rebound from a DataFrame to an ndarray,
# so re-running this cell fails (an ndarray has no `.values`).
X_train_lrnn = np.array(X_train_lrnn.values)
# Labels are cast to int64.
y_train_lrnn = np.array(y_train_lrnn, dtype=np.int64)
qid_train_lrnn = np.array(qid_train_lrnn)

In [10]:
# Fresh ranker sized from the feature matrix. Rebinding `ranker` discards any
# model previously held under that name.
ranker = LambdaRankNN(input_size=X_train_lrnn.shape[1], hidden_layer_sizes=(16,8,), activation=('relu', 'relu',), solver='adam')

In [11]:
%%time
# Fit for 5 epochs on the full training arrays; the recorded output of this
# cell is a MemoryError.
ranker.fit(X_train_lrnn, y_train_lrnn, qid_train_lrnn, epochs=5)

MemoryError: Unable to allocate 87.6 GiB for an array with shape (91855793, 128) and data type float64

In [18]:
# Left commented out — presumably because the full-data fit above ended in a
# MemoryError, so there is no fitted ranker to predict with.
# y_train_lrnn_pred = ranker.predict(X_train_lrnn)
# y_train_lrnn_pred

In [16]:
# Left commented out — presumably for the same reason as the prediction cell
# (the full-data fit did not complete).
# ranker.evaluate(X_train_lrnn, y_train_lrnn, qid_train_lrnn, eval_at=10)

### Train 1000 users

In [9]:
# Keep only the training rows whose user_id is below 1000.
query_train_lrnn = query_train.loc[query_train['user_id'] < 1000]

In [10]:
# Inspect the first rows of the reduced training frame.
query_train_lrnn.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,9,50,0.18,196:0.18,196.0,-0.002918,0.001673,-0.00801,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
1,1,10258,8,50,0.16,10258:0.16,10258.0,0.007623,-0.000338,-0.002633,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
2,1,10326,1,50,0.02,10326:0.02,10326.0,0.004164,-0.003791,-0.003216,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,0.0
3,1,12427,9,50,0.18,12427:0.18,12427.0,7.6e-05,0.002956,-0.009676,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
4,1,13032,2,50,0.04,13032:0.04,13032.0,0.005587,0.005941,-0.006445,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0


In [11]:
# Preview the column slice 7:135 that the next cell uses as the feature matrix.
query_train_lrnn.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,-0.002918,0.001673,-0.00801,-0.001002,-0.000706,0.000638,0.001654,0.003558,-0.002566,0.000831,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
1,0.007623,-0.000338,-0.002633,-0.011347,-0.005478,0.007257,0.00176,0.015209,-0.004028,0.01014,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
2,0.004164,-0.003791,-0.003216,-0.002906,-0.00572,0.000338,0.001366,-0.001528,0.002001,0.002501,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
3,7.6e-05,0.002956,-0.009676,-0.003613,-0.007548,-0.003277,0.000586,0.005006,0.000649,0.011093,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
4,0.005587,0.005941,-0.006445,-0.010694,-0.00851,-0.000476,-0.000557,0.005673,7.9e-05,0.013272,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745


In [13]:
# Inputs for LambdaRankNN on the reduced training frame:
# features by column position, labels and query ids by column name.
X_train_lrnn = query_train_lrnn.iloc[:, 7:135]
y_train_lrnn = query_train_lrnn['reordered'].values
qid_train_lrnn = query_train_lrnn['user_id'].values
qid_train_lrnn

array([  1,   1,   1, ..., 619, 686, 986], dtype=int64)

In [14]:
# Convert the inputs to plain ndarrays for the ranker.
# np.asarray accepts both the DataFrame produced by the previous cell and an
# ndarray, so this cell can be re-run safely (the old `.values` access failed
# on a second run), and it does not force an extra copy of the feature matrix.
X_train_lrnn = np.asarray(X_train_lrnn)
# Labels are stored as floats (1.0 / 0.0); they are cast to int64 here.
y_train_lrnn = np.asarray(y_train_lrnn, dtype=np.int64)
qid_train_lrnn = np.asarray(qid_train_lrnn)

In [16]:
# Two hidden layers (16 and 8 units, both ReLU), optimised with Adam;
# the input width is taken from the feature matrix.
ranker = LambdaRankNN(
    input_size=X_train_lrnn.shape[1],
    hidden_layer_sizes=(16, 8),
    activation=('relu', 'relu'),
    solver='adam',
)

In [17]:
%%time
# Fit for 5 epochs on the reduced training arrays (user_id < 1000).
ranker.fit(X_train_lrnn, y_train_lrnn, qid_train_lrnn, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
ndcg: 0.5775741327337354
Wall time: 1min 30s


In [18]:
# Score the same rows the ranker was fitted on.
y_train_lrnn_pred = ranker.predict(X_train_lrnn)

In [19]:
# Display the raw training scores.
y_train_lrnn_pred

array([-1.1487375, -2.3165817, -1.7565346, ..., -3.0545864, -1.5383464,
       -5.802136 ], dtype=float32)

In [20]:
# NDCG@10 on the training arrays themselves (the data the ranker was fitted on).
ranker.evaluate(X_train_lrnn, y_train_lrnn, qid_train_lrnn, eval_at=10)

ndcg@10: 0.4190110123130567


### Test 1000 users

In [21]:
# Keep only the test rows whose user_id is below 1000 (same cut as for training).
query_test_lrnn = query_test.loc[query_test['user_id'] < 1000]

In [22]:
# Inspect the first rows of the reduced test frame.
query_test_lrnn.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0


In [28]:
# Preview the column slice 7:135 that the next cell uses as the test feature matrix.
query_test_lrnn.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,0.001409,0.005207,0.002497,0.001743,0.000704,0.007053,0.000961,0.003151,-0.004303,0.007784,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
1,0.005821,0.007413,6.1e-05,-0.001078,0.001821,0.00228,0.002057,-0.001182,-0.010053,0.0032,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
2,0.002304,0.003594,0.000376,0.0003,0.002372,0.001176,0.000238,-0.001016,-0.001593,0.000499,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
3,0.005686,0.012542,-0.002346,0.008426,0.005197,0.004188,-0.001592,-0.004076,-0.007299,-0.00169,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
4,0.004938,0.007218,0.002687,0.007423,0.005073,0.001644,0.001098,-0.004222,-0.011638,-0.003541,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842


In [29]:
# Test inputs, extracted the same way as the training inputs:
# features by column position, labels and query ids by column name.
X_test_lrnn = query_test_lrnn.iloc[:, 7:135]
y_test_lrnn = query_test_lrnn['reordered'].values
qid_test_lrnn = query_test_lrnn['user_id'].values
qid_test_lrnn

array([  1,   1,   1, ..., 663, 619, 986], dtype=int64)

In [30]:
# Convert the test inputs to plain ndarrays for the ranker.
# np.asarray accepts both the DataFrame produced by the previous cell and an
# ndarray, so this cell can be re-run safely (the old `.values` access failed
# on a second run), and it does not force an extra copy of the feature matrix.
X_test_lrnn = np.asarray(X_test_lrnn)
# Labels are stored as floats (1.0 / 0.0); they are cast to int64 here.
y_test_lrnn = np.asarray(y_test_lrnn, dtype=np.int64)
qid_test_lrnn = np.asarray(qid_test_lrnn)

In [31]:
# Score the test rows with the ranker fitted on the reduced training set.
y_test_lrnn_pred = ranker.predict(X_test_lrnn)

In [32]:
# Display the raw test scores.
y_test_lrnn_pred

array([ 0.537487  , -1.0570554 , -0.52575386, ..., -1.8597049 ,
       -1.8250198 ,  0.12280083], dtype=float32)

In [33]:
# NDCG@10 on the test arrays, for comparison with the training-set value.
ranker.evaluate(X_test_lrnn, y_test_lrnn, qid_test_lrnn, eval_at=10)

ndcg@10: 0.20935587507894388


In [34]:
# Attach the model scores as a new column.
# assign() returns a new DataFrame, so nothing is written into the filtered
# slice of query_test — the in-place column assignment used here before
# raised SettingWithCopyWarning.
query_test_lrnn = query_test_lrnn.assign(LambdaRankNN_proba=y_test_lrnn_pred)
query_test_lrnn.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_test_lrnn['LambdaRankNN_proba'] = y_test_lrnn_pred


Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LambdaRankNN_proba
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,0.537487
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-1.057055
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,-0.525754
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,-0.748508
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-1.618577


In [35]:
# Order rows by user_id and then by score, both descending, and renumber the index.
query_test_lrnn = (
    query_test_lrnn
    .sort_values(by=['user_id', 'LambdaRankNN_proba'], ascending=False)
    .reset_index(drop=True)
)
query_test_lrnn.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LambdaRankNN_proba
0,999,1476,2,988,0.002024,1476:0.0020242914979757085,1476.0,-0.046046,0.026688,-0.017409,...,0.000772,0.000749,-0.001319,-0.001663,0.002852,0.005463,0.003597,0.002072,0.0,1.247136
1,999,5392,1,988,0.001012,5392:0.0010121457489878543,5392.0,-0.032877,0.03348,0.004977,...,0.000772,0.000749,-0.001319,-0.001663,0.002852,0.005463,0.003597,0.002072,0.0,1.047388
2,999,39921,2,988,0.002024,39921:0.0020242914979757085,39921.0,0.037254,-0.016318,0.038172,...,0.000772,0.000749,-0.001319,-0.001663,0.002852,0.005463,0.003597,0.002072,0.0,0.7155
3,999,34055,1,988,0.001012,34055:0.0010121457489878543,34055.0,0.000891,-0.008681,0.015536,...,0.000772,0.000749,-0.001319,-0.001663,0.002852,0.005463,0.003597,0.002072,0.0,0.686936
4,999,2164,1,988,0.001012,2164:0.0010121457489878543,2164.0,-0.012827,-0.008801,-0.005632,...,0.000772,0.000749,-0.001319,-0.001663,0.002852,0.005463,0.003597,0.002072,0.0,0.585415


In [37]:
# One row per user holding the first ten product_ids in the frame's current
# row order, stored under the column name 'LambdaRankNN_recommend'.
query_test_lrnn_pivot = (
    query_test_lrnn
    .pivot_table(
        index='user_id',
        values=['product_id'],
        aggfunc={'product_id': lambda ids: list(ids)[:10]},
    )
    .reset_index()
)
query_test_lrnn_pivot = query_test_lrnn_pivot.rename(
    columns={'product_id': 'LambdaRankNN_recommend'}
)
query_test_lrnn_pivot.head()

Unnamed: 0,user_id,LambdaRankNN_recommend
0,1,"[26088, 196, 17122, 10326, 13176, 12427, 14084..."
1,2,"[9124, 27413, 16589, 24990, 13742, 5322, 5869,..."
2,3,"[39922, 15143, 38596, 14992, 1819, 1005, 23650..."
3,4,"[7160, 17769, 42329, 43704, 2707, 11865, 7350,..."
4,5,"[15349, 43693, 6808, 21616, 13988, 40706, 3376..."


In [38]:
# Left-join the per-user ground truth (true_label) onto the recommendations.
query_test_lrnn_pivot = query_test_lrnn_pivot.merge(
    df_test_reordered_pivot,
    how='left',
    on='user_id',
)
query_test_lrnn_pivot.head()

Unnamed: 0,user_id,LambdaRankNN_recommend,true_label
0,1,"[26088, 196, 17122, 10326, 13176, 12427, 14084...","[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[9124, 27413, 16589, 24990, 13742, 5322, 5869,...","[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,3,"[39922, 15143, 38596, 14992, 1819, 1005, 23650...","[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[7160, 17769, 42329, 43704, 2707, 11865, 7350,...","[26576, 25623, 21573]"
4,5,"[15349, 43693, 6808, 21616, 13988, 40706, 3376...","[15349, 19057, 16185, 21413, 20843, 20114, 482..."


In [39]:
# Score the LambdaRankNN recommendations for the reduced test set at k=10.
print_metrics(query_test_lrnn_pivot['true_label'], query_test_lrnn_pivot['LambdaRankNN_recommend'], 10)

      metric     value
0  HitRate@k  0.632633
1      MAP@k  0.244464
2     NDCG@k  0.167357


## LambdaMART

In [44]:
import pyltr

### Train 1000 users (sklearn error)

In [None]:
# Keep only the training rows whose user_id is below 1000.
query_train_lm = query_train.loc[query_train['user_id'] < 1000]

In [41]:
# Inspect the first rows of the LambdaMART training frame.
query_train_lm.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,9,50,0.18,196:0.18,196.0,-0.002918,0.001673,-0.00801,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
1,1,10258,8,50,0.16,10258:0.16,10258.0,0.007623,-0.000338,-0.002633,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
2,1,10326,1,50,0.02,10326:0.02,10326.0,0.004164,-0.003791,-0.003216,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,0.0
3,1,12427,9,50,0.18,12427:0.18,12427.0,7.6e-05,0.002956,-0.009676,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
4,1,13032,2,50,0.04,13032:0.04,13032.0,0.005587,0.005941,-0.006445,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0


In [42]:
# Preview the column slice 7:135 that the next cell uses as the feature matrix.
query_train_lm.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,-0.002918,0.001673,-0.00801,-0.001002,-0.000706,0.000638,0.001654,0.003558,-0.002566,0.000831,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
1,0.007623,-0.000338,-0.002633,-0.011347,-0.005478,0.007257,0.00176,0.015209,-0.004028,0.01014,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
2,0.004164,-0.003791,-0.003216,-0.002906,-0.00572,0.000338,0.001366,-0.001528,0.002001,0.002501,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
3,7.6e-05,0.002956,-0.009676,-0.003613,-0.007548,-0.003277,0.000586,0.005006,0.000649,0.011093,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
4,0.005587,0.005941,-0.006445,-0.010694,-0.00851,-0.000476,-0.000557,0.005673,7.9e-05,0.013272,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745


In [43]:
# Inputs for LambdaMART: features by column position,
# labels and query ids by column name.
X_train_lm = query_train_lm.iloc[:, 7:135]
y_train_lm = query_train_lm['reordered'].values
qid_train_lm = query_train_lm['user_id'].values
qid_train_lm

array([  1,   1,   1, ..., 619, 686, 986], dtype=int64)

In [45]:
# LambdaMART configured with NDCG@10 as its metric.
metric = pyltr.metrics.NDCG(k=10)
model = pyltr.models.LambdaMART(
    metric=metric,
    # boosting schedule
    n_estimators=10, learning_rate=0.02,
    # per-iteration subsampling of features and queries
    max_features=0.5, query_subsample=0.5,
    # tree size limits
    max_leaf_nodes=10, min_samples_leaf=64,
    verbose=1,
)

In [46]:
# The recorded run of this cell fails with
# "TypeError: __init__() got an unexpected keyword argument 'presort'";
# the following cells look into the installed scikit-learn version.
model.fit(X_train_lm, y_train_lm, qid_train_lm)

 Iter  Train score  OOB Improve    Remaining                           Monitor Output 


TypeError: __init__() got an unexpected keyword argument 'presort'

In [47]:
# Check which scikit-learn version is installed, following the 'presort'
# TypeError raised by the fit above.
import sklearn
sklearn.__version__

'0.24.1'

In [48]:
!pip install -U scikit-learn==0.21.3 --force-reinstall

Collecting scikit-learn==0.21.3
  Using cached scikit-learn-0.21.3.tar.gz (12.2 MB)
Collecting numpy>=1.11.0
  Using cached numpy-1.20.2-cp38-cp38-win_amd64.whl (13.7 MB)
Collecting scipy>=0.17.0
  Using cached scipy-1.6.2-cp38-cp38-win_amd64.whl (32.7 MB)
Collecting joblib>=0.11
  Using cached joblib-1.0.1-py3-none-any.whl (303 kB)
Building wheels for collected packages: scikit-learn
  Building wheel for scikit-learn (setup.py): started
  Building wheel for scikit-learn (setup.py): finished with status 'error'
  Running setup.py clean for scikit-learn
Failed to build scikit-learn
Installing collected packages: numpy, scipy, joblib, scikit-learn
  Attempting uninstall: numpy
    Found existing installation: numpy 1.20.2
    Uninstalling numpy-1.20.2:
      Successfully uninstalled numpy-1.20.2


  ERROR: Command errored out with exit status 1:
   command: 'C:\Users\Elizaveta_Masharina\Anaconda3\python.exe' -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\Elizaveta_Masharina\\AppData\\Local\\Temp\\pip-install-jig0ekhw\\scikit-learn\\setup.py'"'"'; __file__='"'"'C:\\Users\\Elizaveta_Masharina\\AppData\\Local\\Temp\\pip-install-jig0ekhw\\scikit-learn\\setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\Elizaveta_Masharina\AppData\Local\Temp\pip-wheel-0nsr61cm'
       cwd: C:\Users\Elizaveta_Masharina\AppData\Local\Temp\pip-install-jig0ekhw\scikit-learn\
  Complete output (33 lines):
  Partial import of sklearn during the build process.
  No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  Traceback (most recent call last):
    File "<string>", line 1, in <module>
    File "

In [None]:
# Left commented out — presumably because model.fit above raised a TypeError,
# so there is no fitted model to predict with.
# y_train_lm_pred = model.predict(X_train_lm)
# y_train_lm_pred

## LGBMRanker

In [23]:
import lightgbm as lgb

### Train all (MemoryError)

In [18]:
# Preview the column slice 7:135 that the next cell uses as the feature matrix.
query_train.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,-0.002918,0.001673,-0.00801,-0.001002,-0.000706,0.000638,0.001654,0.003558,-0.002566,0.000831,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
1,0.007623,-0.000338,-0.002633,-0.011347,-0.005478,0.007257,0.00176,0.015209,-0.004028,0.01014,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
2,0.004164,-0.003791,-0.003216,-0.002906,-0.00572,0.000338,0.001366,-0.001528,0.002001,0.002501,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
3,7.6e-05,0.002956,-0.009676,-0.003613,-0.007548,-0.003277,0.000586,0.005006,0.000649,0.011093,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
4,0.005587,0.005941,-0.006445,-0.010694,-0.00851,-0.000476,-0.000557,0.005673,7.9e-05,0.013272,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745


In [19]:
# Features (by column position) and labels (by column name) for LGBMRanker,
# taken from the full training frame. Query ids are not extracted here.
X_lgbm = query_train.iloc[:, 7:135]
y_lgbm = query_train['reordered'].values
# qid_lgbm = query_train_lgbm.loc[:, 'user_id'].values
# qid_lgbm

In [20]:
from sklearn.model_selection import train_test_split

# 80/20 row-wise split with a fixed seed.
# NOTE(review): rows are shuffled individually, so rows of the same user can
# end up on both sides of the split — confirm this is intended for ranking.
(X_train_lgbm, X_val_lgbm,
 y_train_lgbm, y_val_lgbm) = train_test_split(
    X_lgbm,
    y_lgbm,
    test_size=0.2,
    random_state=1,
)

In [21]:
query_train = [X_train_lgbm.shape[0]]
query_val = [X_val_lgbm.shape[0]]

In [22]:
X_train_lgbm.shape, y_train_lgbm.shape

((9667928, 128), (9667928,))

In [24]:
gbm = lgb.LGBMRanker()

In [25]:
gbm.fit(X_train_lgbm, y_train_lgbm, group=query_train, 
        eval_set=[(X_val_lgbm, y_val_lgbm)], eval_group = [query_val], 
        eval_at=[5, 10, 20], early_stopping_rounds = 50)

MemoryError: Unable to allocate 9.22 GiB for an array with shape (9667928, 128) and data type float64

In [59]:
# Left commented out — presumably because the full-data gbm.fit above ended in
# a MemoryError.
# y_train_val_lgbm_pred = gbm.predict(X_lgbm)

### Train 200 users

In [49]:
# Keep only the training rows whose user_id is below 200, then show the
# resulting (rows, columns) shape.
query_train_lgbm = query_train.loc[query_train['user_id'] < 200]
query_train_lgbm.shape

(10862, 136)

In [50]:
# Inspect the first rows of the reduced training frame.
query_train_lgbm.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,9,50,0.18,196:0.18,196.0,-0.002918,0.001673,-0.00801,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
1,1,10258,8,50,0.16,10258:0.16,10258.0,0.007623,-0.000338,-0.002633,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
2,1,10326,1,50,0.02,10326:0.02,10326.0,0.004164,-0.003791,-0.003216,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,0.0
3,1,12427,9,50,0.18,12427:0.18,12427.0,7.6e-05,0.002956,-0.009676,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0
4,1,13032,2,50,0.04,13032:0.04,13032.0,0.005587,0.005941,-0.006445,...,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745,1.0


In [51]:
# Preview the column slice 7:135 that the next cell uses as the feature matrix.
query_train_lgbm.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,-0.002918,0.001673,-0.00801,-0.001002,-0.000706,0.000638,0.001654,0.003558,-0.002566,0.000831,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
1,0.007623,-0.000338,-0.002633,-0.011347,-0.005478,0.007257,0.00176,0.015209,-0.004028,0.01014,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
2,0.004164,-0.003791,-0.003216,-0.002906,-0.00572,0.000338,0.001366,-0.001528,0.002001,0.002501,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
3,7.6e-05,0.002956,-0.009676,-0.003613,-0.007548,-0.003277,0.000586,0.005006,0.000649,0.011093,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745
4,0.005587,0.005941,-0.006445,-0.010694,-0.00851,-0.000476,-0.000557,0.005673,7.9e-05,0.013272,...,-0.000857,-0.000432,-0.001412,-0.005363,-0.00308,-0.007207,-0.000772,0.011067,0.002274,0.001745


In [52]:
# Features (by column position) and labels (by column name) for LGBMRanker,
# taken from the reduced training frame. Query ids are not extracted here.
X_lgbm = query_train_lgbm.iloc[:, 7:135]
y_lgbm = query_train_lgbm['reordered'].values
# qid_lgbm = query_train_lgbm.loc[:, 'user_id'].values
# qid_lgbm

In [53]:
from sklearn.model_selection import train_test_split

# 80/20 row-wise split with a fixed seed.
# NOTE(review): rows are shuffled individually, so rows of the same user can
# end up on both sides of the split — confirm this is intended for ranking.
(X_train_lgbm, X_val_lgbm,
 y_train_lgbm, y_val_lgbm) = train_test_split(
    X_lgbm,
    y_lgbm,
    test_size=0.2,
    random_state=1,
)

In [54]:
query_train = [X_train_lgbm.shape[0]]
query_val = [X_val_lgbm.shape[0]]

In [55]:
X_train_lgbm.shape, y_train_lgbm.shape

((8689, 128), (8689,))

In [56]:
import lightgbm as lgb

In [57]:
gbm = lgb.LGBMRanker()

In [58]:
gbm.fit(X_train_lgbm, y_train_lgbm, group=query_train, 
        eval_set=[(X_val_lgbm, y_val_lgbm)], eval_group = [query_val], 
        eval_at=[5, 10, 20], early_stopping_rounds = 50)

[1]	valid_0's ndcg@5: 0.131205	valid_0's ndcg@10: 0.0851431	valid_0's ndcg@20: 0.163505
Training until validation scores don't improve for 50 rounds
[2]	valid_0's ndcg@5: 0	valid_0's ndcg@10: 0	valid_0's ndcg@20: 0.107101
[3]	valid_0's ndcg@5: 0	valid_0's ndcg@10: 0	valid_0's ndcg@20: 0.110251
[4]	valid_0's ndcg@5: 0	valid_0's ndcg@10: 0	valid_0's ndcg@20: 0.13875
[5]	valid_0's ndcg@5: 0	valid_0's ndcg@10: 0.0662542	valid_0's ndcg@20: 0.152024
[6]	valid_0's ndcg@5: 0.300785	valid_0's ndcg@10: 0.195189	valid_0's ndcg@20: 0.198416
[7]	valid_0's ndcg@5: 0.146068	valid_0's ndcg@10: 0.230474	valid_0's ndcg@20: 0.218534
[8]	valid_0's ndcg@5: 0.213986	valid_0's ndcg@10: 0.280882	valid_0's ndcg@20: 0.253094
[9]	valid_0's ndcg@5: 0.345191	valid_0's ndcg@10: 0.293437	valid_0's ndcg@20: 0.25699
[10]	valid_0's ndcg@5: 0.485229	valid_0's ndcg@10: 0.456899	valid_0's ndcg@20: 0.327206
[11]	valid_0's ndcg@5: 0.639945	valid_0's ndcg@10: 0.415281	valid_0's ndcg@20: 0.304365
[12]	valid_0's ndcg@5: 0.6992

LGBMRanker()

In [59]:
# Score every row of X_lgbm, i.e. the training and validation rows together.
y_train_val_lgbm_pred = gbm.predict(X_lgbm)

### Test 200 users

In [60]:
# Keep only the test rows whose user_id is below 200 (same cut as for training).
query_test_lgbm = query_test.loc[query_test['user_id'] < 200]

In [61]:
# Inspect the first rows of the reduced test frame.
query_test_lgbm.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0


In [62]:
# Preview the column slice 7:135 that the next cell uses as the test feature matrix.
query_test_lgbm.iloc[:5, 7:135]

Unnamed: 0,1_product,2_product,3_product,4_product,5_product,6_product,7_product,8_product,9_product,10_product,...,55_user,56_user,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user
0,0.001409,0.005207,0.002497,0.001743,0.000704,0.007053,0.000961,0.003151,-0.004303,0.007784,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
1,0.005821,0.007413,6.1e-05,-0.001078,0.001821,0.00228,0.002057,-0.001182,-0.010053,0.0032,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
2,0.002304,0.003594,0.000376,0.0003,0.002372,0.001176,0.000238,-0.001016,-0.001593,0.000499,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
3,0.005686,0.012542,-0.002346,0.008426,0.005197,0.004188,-0.001592,-0.004076,-0.007299,-0.00169,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842
4,0.004938,0.007218,0.002687,0.007423,0.005073,0.001644,0.001098,-0.004222,-0.011638,-0.003541,...,-0.005122,-0.003182,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842


In [63]:
# Test features (by column position) and labels (by column name).
# Query ids are not extracted here.
X_test_lgbm = query_test_lgbm.iloc[:, 7:135]
y_test_lgbm = query_test_lgbm['reordered'].values
# qid_test_lgbm = query_test_lgbm.loc[:, 'user_id'].values
# qid_test_lgbm

In [65]:
# Score the test rows with the fitted LGBMRanker.
y_test_lgbm_pred = gbm.predict(X_test_lgbm)

In [66]:
# Display the raw test scores.
y_test_lgbm_pred

array([-1.00763234, -1.19188156, -1.06481065, ..., -1.32054768,
       -1.291208  , -0.76003953])

In [67]:
# Attach the model scores as a new column.
# assign() returns a new DataFrame, so nothing is written into the filtered
# slice of query_test — the in-place column assignment used here before
# raised SettingWithCopyWarning.
query_test_lgbm = query_test_lgbm.assign(LGBMRanker_proba=y_test_lgbm_pred)
query_test_lgbm.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  query_test_lgbm['LGBMRanker_proba'] = y_test_lgbm_pred


Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LGBMRanker_proba
0,1,196,10,59,0.169492,196:0.1694915254237288,196.0,0.001409,0.005207,0.002497,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-1.007632
1,1,10258,9,59,0.152542,10258:0.15254237288135594,10258.0,0.005821,0.007413,6.1e-05,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-1.191882
2,1,10326,1,59,0.016949,10326:0.01694915254237288,10326.0,0.002304,0.003594,0.000376,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,-1.064811
3,1,12427,10,59,0.169492,12427:0.1694915254237288,12427.0,0.005686,0.012542,-0.002346,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,0.0,-0.944576
4,1,13032,3,59,0.050847,13032:0.05084745762711865,13032.0,0.004938,0.007218,0.002687,...,-0.001425,-0.000811,-0.001972,-0.005607,0.000983,0.000224,0.004254,-0.004842,1.0,-1.049666


In [68]:
# Order rows by user_id and then by score, both descending, and renumber the index.
query_test_lgbm = (
    query_test_lgbm
    .sort_values(by=['user_id', 'LGBMRanker_proba'], ascending=False)
    .reset_index(drop=True)
)
query_test_lgbm.head()

Unnamed: 0,user_id,product_id,rating,rating_sum,rating_norm,product_and_rating,0,1_product,2_product,3_product,...,57_user,58_user,59_user,60_user,61_user,62_user,63_user,64_user,reordered,LGBMRanker_proba
0,199,27966,2,16,0.125,27966:0.125,27966.0,0.000645,0.000505,-0.002225,...,0.005682,-0.002584,0.004355,0.002186,-0.001008,0.01276,-0.005409,-0.008028,0.0,-1.104573
1,199,27521,2,16,0.125,27521:0.125,27521.0,-0.005818,0.003314,-9.4e-05,...,0.005682,-0.002584,0.004355,0.002186,-0.001008,0.01276,-0.005409,-0.008028,1.0,-1.104573
2,199,28435,1,16,0.0625,28435:0.0625,28435.0,-0.014981,-0.023518,-0.046914,...,0.005682,-0.002584,0.004355,0.002186,-0.001008,0.01276,-0.005409,-0.008028,0.0,-1.104573
3,199,47560,2,16,0.125,47560:0.125,47560.0,0.027152,-0.009294,-0.01502,...,0.005682,-0.002584,0.004355,0.002186,-0.001008,0.01276,-0.005409,-0.008028,0.0,-1.127164
4,199,39275,2,16,0.125,39275:0.125,39275.0,0.002421,-0.004495,-0.000127,...,0.005682,-0.002584,0.004355,0.002186,-0.001008,0.01276,-0.005409,-0.008028,0.0,-1.129217


In [70]:
# One row per user holding the first ten product_ids in the frame's current
# row order, stored under the column name 'LGBMRanker_recommend'.
query_test_lgbm_pivot = (
    query_test_lgbm
    .pivot_table(
        index='user_id',
        values=['product_id'],
        aggfunc={'product_id': lambda ids: list(ids)[:10]},
    )
    .reset_index()
)
query_test_lgbm_pivot = query_test_lgbm_pivot.rename(
    columns={'product_id': 'LGBMRanker_recommend'}
)
query_test_lgbm_pivot.head()

Unnamed: 0,user_id,LGBMRanker_recommend
0,1,"[12427, 17122, 30450, 26088, 196, 13176, 38928..."
1,2,"[5869, 21709, 13176, 1559, 4957, 5212, 5322, 5..."
2,3,"[248, 40604, 1005, 7503, 8021, 9387, 43961, 24..."
3,4,"[2707, 17769, 1200, 25146, 7350, 11865, 7160, ..."
4,5,"[3376, 8518, 18761, 24231, 13870, 13988, 31717..."


In [71]:
# Left-join the per-user ground truth (true_label) onto the recommendations.
query_test_lgbm_pivot = query_test_lgbm_pivot.merge(
    df_test_reordered_pivot,
    how='left',
    on='user_id',
)
query_test_lgbm_pivot.head()

Unnamed: 0,user_id,LGBMRanker_recommend,true_label
0,1,"[12427, 17122, 30450, 26088, 196, 13176, 38928...","[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[5869, 21709, 13176, 1559, 4957, 5212, 5322, 5...","[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,3,"[248, 40604, 1005, 7503, 8021, 9387, 43961, 24...","[39190, 18599, 23650, 21903, 47766, 24810]"
3,4,"[2707, 17769, 1200, 25146, 7350, 11865, 7160, ...","[26576, 25623, 21573]"
4,5,"[3376, 8518, 18761, 24231, 13870, 13988, 31717...","[15349, 19057, 16185, 21413, 20843, 20114, 482..."


In [72]:
# Score the LGBMRanker recommendations for the reduced test set at k=10.
print_metrics(query_test_lgbm_pivot['true_label'], query_test_lgbm_pivot['LGBMRanker_recommend'], 10)

      metric     value
0  HitRate@k  0.653266
1      MAP@k  0.252049
2     NDCG@k  0.163699
