In [1]:
from src.data_reader import DataReader
from src.constant import *
import matplotlib.pyplot as plt
import numpy as np
from src.datasets.dataset import Dataset
from src.models.itemknn.itemknn import ItemKNN
import pandas as pd
from src.evaluation import compute_mrr

In [2]:
dataset = Dataset()
split_dict = dataset.get_split()
train, train_label = split_dict[TRAIN]
val, val_label = split_dict[VAL]
test, test_label = split_dict[TEST]

val_test = pd.concat([val, test])
val_test_label = pd.concat([val_label, test_label])


In [19]:
val_label

Unnamed: 0,session_id,item_id,date
0,115,21895,2021-05-27 10:24:05.043
1,526,9191,2021-05-28 08:35:35.820
2,782,8343,2021-05-14 17:48:41.203
3,934,2197,2021-05-02 20:56:45.032
4,937,8353,2021-05-26 14:32:16.215
...,...,...,...
40805,4439487,6771,2021-05-27 13:32:59.702
40806,4439489,973,2021-05-02 17:03:07.823
40807,4439576,16391,2021-05-19 15:44:53.635
40808,4439823,2993,2021-05-29 16:42:31.491


In [3]:
features = dataset.get_oh_item_features()

In [4]:
sess_features = dataset.get_sess_features()

In [24]:
recs = dataset.get_recs_df("EASE_tw", kind="train")

In [25]:
recs

Unnamed: 0,session_id,item_id,EASE_tw_score,EASE_tw_rank
0,115,7392,0.076908,1
1,115,10592,0.028550,2
2,115,10531,0.026369,3
3,115,12390,0.022898,4
4,115,4135,0.020975,5
...,...,...,...,...
8161995,4439949,4742,0.001568,96
8161996,4439949,21143,0.001563,97
8161997,4439949,6511,0.001560,98
8161998,4439949,23279,0.001559,99


In [7]:
val_test_label = val_test_label.rename(columns={ITEM_ID:"relevance"})

In [8]:
val_test_label = val_test_label.drop(DATE, axis=1)

In [20]:
print(f"GT len: {len(val_test_label)}")

GT len: 81620


In [15]:
val_test_label

Unnamed: 0,session_id,relevance
0,115,21895
1,526,9191
2,782,8343
3,934,2197
4,937,8353
...,...,...
40805,4439376,14295
40806,4439488,11403
40807,4439680,17813
40808,4439898,20251


In [9]:
merged=pd.merge(recs, val_test_label, left_on=[SESS_ID, ITEM_ID], 
                right_on=[SESS_ID, "relevance"], how="left")

In [10]:
merged

Unnamed: 0,session_id,item_id,score,rank,relevance
0,115,7392,0.076908,1,
1,115,10592,0.028550,2,
2,115,10531,0.026369,3,
3,115,12390,0.022898,4,
4,115,4135,0.020975,5,
...,...,...,...,...,...
8161995,4439949,4742,0.001568,96,
8161996,4439949,21143,0.001563,97,
8161997,4439949,6511,0.001560,98,
8161998,4439949,23279,0.001559,99,


In [13]:
merged.loc[merged["relevance"].notnull(), "relevance"] = 1
merged["hit_sum"] = merged.groupby(SESS_ID)[
    "relevance"
].transform("sum")

merged_filtered = merged[merged["hit_sum"] > 0]

# we can drop the hit sum column
merged_filtered = merged_filtered.drop("hit_sum", axis=1)

# fill with 0 the nan values, the nan are the one for which we do not do an hit
merged_filtered["relevance"] = merged_filtered["relevance"].fillna(0)

In [14]:
merged_filtered

Unnamed: 0,session_id,item_id,score,rank,relevance
100,526,10819,0.066461,1,0.0
101,526,2592,0.013120,2,0.0
102,526,13318,0.012728,3,0.0
103,526,5968,0.010128,4,0.0
104,526,15161,0.009393,5,0.0
...,...,...,...,...,...
8161995,4439949,4742,0.001568,96,0.0
8161996,4439949,21143,0.001563,97,0.0
8161997,4439949,6511,0.001560,98,0.0
8161998,4439949,23279,0.001559,99,0.0


In [21]:
print(f"Retained sessions: {merged_filtered[SESS_ID].nunique()}")

Retained sessions: 38415


In [26]:
item_features = dataset.get_oh_item_features()

In [27]:
item_features

Unnamed: 0_level_0,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8,cat_9,cat_10,...,val_896,val_897,val_898,val_899,val_900,val_901,val_902,val_903,val_904,val_905
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23686,0,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23687,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23688,1,0,1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
23689,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [28]:
sess_features = dataset.get_sess_features()

In [29]:
sess_features

Unnamed: 0,session_id,cat_1_sum,cat_2_sum,cat_3_sum,cat_4_sum,cat_5_sum,cat_6_sum,cat_7_sum,cat_8_sum,cat_9_sum,...,val_897_sum,val_898_sum,val_899_sum,val_900_sum,val_901_sum,val_902_sum,val_903_sum,val_904_sum,val_905_sum,session_length
0,3,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
1,13,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,18,3.0,0.0,3.0,3.0,3.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,3
3,19,0.0,0.0,17.0,18.0,17.0,0.0,17.0,0.0,0.0,...,0.0,2.0,0.0,0.0,1.0,14.0,0.0,0.0,0.0,17
4,24,0.0,1.0,0.0,4.0,0.0,0.0,8.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099995,4439986,0.0,0.0,0.0,6.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
1099996,4439990,0.0,1.0,8.0,12.0,8.0,0.0,10.0,0.0,0.0,...,0.0,1.0,0.0,0.0,4.0,2.0,0.0,0.0,0.0,11
1099997,4439994,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1099998,4439999,0.0,1.0,6.0,6.0,6.0,0.0,6.0,4.0,0.0,...,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,7


In [30]:
merged_filtered = merged_filtered.set_index(ITEM_ID)

In [33]:
# join item features
recs_item_f = merged_filtered.join(item_features).reset_index()

In [34]:
# join session features

recs_final = merged_filtered.set_index(SESS_ID).join(sess_features.set_index(SESS_ID)).reset_index()

In [35]:
recs_final

Unnamed: 0,session_id,score,rank,relevance,cat_1_sum,cat_2_sum,cat_3_sum,cat_4_sum,cat_5_sum,cat_6_sum,...,val_897_sum,val_898_sum,val_899_sum,val_900_sum,val_901_sum,val_902_sum,val_903_sum,val_904_sum,val_905_sum,session_length
0,113,0.065421,1,0.0,0.0,0.0,4.0,6.0,4.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
1,113,0.020965,2,0.0,0.0,0.0,4.0,6.0,4.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
2,113,0.020177,3,0.0,0.0,0.0,4.0,6.0,4.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
3,113,0.017046,4,0.0,0.0,0.0,4.0,6.0,4.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
4,113,0.014374,5,1.0,0.0,0.0,4.0,6.0,4.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3841495,4439986,0.001545,96,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
3841496,4439986,0.001527,97,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
3841497,4439986,0.001525,98,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
3841498,4439986,0.001513,99,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6


In [None]:
merged.loc[merged["bought_item"].notnull(), "bought_item"] = 1
merged["hit_sum"] = merged.groupby(SESS_ID)[
    "bought_item"
].transforam("sum")

merged_filtered = merged[merged["hit_sum"] > 0]

# we can drop the hit sum column
merged_filtered = merged_filtered.drop("hit_sum", axis=1)

#fill with 0 the nan values, the nan are the one for which we do not do an hit
merged_filtered["article_id"] = merged_filtered["article_id"].fillna(0)

In [None]:
import torch

In [None]:
torch.empty(10)

In [None]:
a = torch.zeros(128, 10, 64)
b = torch.rand(10, 64)

In [None]:
b

In [None]:
a+b

In [None]:
d=c.repeat(1,10)

In [None]:
d

In [None]:
a-d

In [None]:
a = torch.rand(10, 128)

In [None]:
torch.unsqueeze(a, 0).shape

In [None]:
t = torch.tensor([[1, 2], [3, 4]])
t[torch.tensor([[0,0,0], [1, 1, 1]])]

In [None]:
t[torch.tensor([0, 0, 0])]

In [None]:
torch.index_select(t, 0, torch.tensor([[0,1], [1, 1]]))

In [None]:
features_tensor = torch.Tensor(features.values)

In [None]:
features_tensor.shape

In [None]:
padding_feature_tensor = torch.zeros(1, features_tensor.shape[1])

In [None]:
padded_features = torch.cat([features_tensor, padding_feature_tensor])

In [None]:
padded_features[-1, :].sum()

In [None]:
pd.concat([val_label, test_label])[[ITEM_ID]].drop_duplicates()

In [None]:
dataset.get_candidate_items()

In [None]:
import os 
os.getcwd()

In [None]:
pd.read_csv("./../../submissions/avid_rain-5390.csv")

In [None]:
sess2items = dataset.get_sess2items()

In [None]:
sess2items

In [None]:
prova = np.array(range(100))

In [None]:
b=np.random.choice(prova, 5)

In [None]:
b

In [None]:
sampling = sess2items.apply(lambda x: np.random.choice(x, 5))

In [None]:
sampling

In [None]:
max(sess2items.apply(lambda x: len(x)))

In [None]:
padded_sess = sess2items.apply(lambda x: np.array(x[-5:]) if len(x) >= 5 else np.pad(x, (5 - len(x), 0), constant_values=0))

In [None]:
padded_sess

In [None]:
aa=padded_sess.loc[[3,13]]

In [None]:
aa.values

In [None]:
np.stack(aa.values)

In [None]:
aa.values[1].shape

In [None]:
np.stack(aa.values).shape

In [None]:
np.array(aa, dtype=np.float32)

In [None]:
mapping_dict = dict(zip(sess2items.index.values, np.arange(len(sess2items.index.values))))

In [None]:
unique_val_id = val[SESS_ID].unique()

In [None]:
session_indices = [mapping_dict[s] for s in unique_val_id]

In [None]:
padded_sess[session_indices]

In [None]:
sess2items

In [None]:
PADDING_LENGTH = 2
PADDING_IDX = 0

In [None]:
sess2items_array = sess2items.values

In [None]:
sample_1 = [1,2,3]
sample_2 = [4,5]


In [None]:
import numpy as np

In [None]:
dataset.get_train_sessions()

In [None]:
dataset.get_train_purchases()

In [None]:
len(dataset.get_sess2items().index.values)

In [None]:
sess2items_array

In [None]:
padded_sess = np.array([s[-PADDING_LENGTH:] if len(s)>=PADDING_LENGTH else np.pad(s, (PADDING_LENGTH-len(s), 0)) for s in sess2items_array])

In [None]:
padded_sess[0]

In [None]:
CONTEXT_SIZE = 5

In [None]:
np.array([np.random.choice(s, size=CONTEXT_SIZE, replace=True) for s in sess2items_array])

In [None]:
np.pad(sample_1, (0, PADDING_LENGTH-len(sample_1)))

In [None]:
[s[-PADDING_LENGTH:] if len(s)>=5 for s in sess2items_array]

In [None]:
train_sess = dataset.get_train_sessions()

In [None]:
sorted_train_sess = train_sess.sort_values([SESS_ID, DATE])

In [None]:
for sess_id, item_id in zip(sorted_train_sess[SESS_ID], sorted_train_sess[ITEM_ID]):
    pass

In [None]:
dataset.get_sess2items()

In [None]:
train_data = dataset.get_train_sessions()
lead_data = dataset.get_test_leaderboard_sessions()
test_data = dataset.get_test_final_sessions()
all_data = pd.concat([train_data, lead_data, test_data], axis=0)

In [None]:
import torch
import torch.nn.functional as F

In [None]:
a = torch.randn(10,256)

In [None]:
b = torch.randn(10)

In [None]:
b

In [None]:
a/(b.view(10,1))

In [None]:
a/b

In [None]:
emb = torch.nn.Embedding(num_embeddings = dataset._ITEMS_NUM+1, embedding_dim = 10, padding_idx=dataset._ITEMS_NUM)
emb(torch.LongTensor([dataset._ITEMS_NUM]))


In [None]:
rand1 = torch.rand((10, 2))
norm_rand1 = F.normalize(rand1)
rand2 = torch.rand((10, 2))
torch.einsum("bf,bf->b", rand1, rand2)

In [None]:
norm_rand1

In [None]:
rand1

In [None]:
rand.view(1024,-1,64).shape

In [None]:
prova = torch.Tensor(np.array([[1,2,3], [0,0,1]]))
prova

In [None]:
prova.mean(dim=-1)

In [None]:
den = (prova!=0).sum(dim=-1)
num = prova.sum(dim=-1)
num/den
# torch.nonzero(prova)
#mask = prova!=0
#prova*mask.sum(dim=-1)/mask.sum(dim=-1)

In [None]:
emb = torch.nn.Embedding(4, 10, padding_idx=0)

In [None]:
emb_tensor = emb.weight

In [None]:
emb_tensor

In [None]:
emb_tensor[torch.LongTensor([0,1])]

In [None]:
torch.mean(emb(torch.LongTensor([0,1,2])))

In [None]:
torch.mean(emb(torch.LongTensor([1,2])))

In [None]:
a, b = train_label[[SESS_ID, ITEM_ID]].values[0]

In [None]:
b

In [None]:
train[[SESS_ID, ITEM_ID]].to_dict(orient="records")

In [None]:
dataset = Dataset()

In [None]:
item_features = dataset.get_item_features()

In [None]:
item_features

In [None]:
oh_cat = pd.get_dummies(item_features[F_CAT], prefix = "cat")

In [None]:
oh_val = pd.get_dummies(item_features[F_VAL], prefix="val")

In [None]:
item_features_oh = item_features.join(oh_cat).join(oh_val)

In [None]:
item_features_oh = item_features_oh.groupby(ITEM_ID).sum()

In [None]:
item_features_oh = item_features_oh.drop([F_VAL, F_CAT], axis=1).reset_index()

In [None]:
item_features_oh