In [1]:
import pandas as pd

In [4]:
import sys
sys.path.insert(0, "../scripts/")

from dataset import *

In [2]:
# Column-name constants, defined once so later cells can index the behavior
# DataFrames without repeating string literals.
USER_ID = 'user_id'
NEWS_ID = 'news_id'
IMPRESSION_ID = 'impression_id'
TIMESTAMP = 'timestamp'
HISTORY = 'history'
IMPRESSIONS = 'impressions'

In [19]:
# Training split: read the raw behaviors file, then derive the labelled frame
# from it. The raw frame is kept because later cells read its impressions column.
env = 'MINDsmall_train'
train_behaviors_path = f'../data/{env}/behaviors.tsv'

train_df_raw = load_behavior_data(train_behaviors_path)
train_df = behavior_to_user_item_pair_w_label(train_df_raw)

# Last expression of the cell: show the shape of the derived frame.
train_df.shape

(5843444, 3)

In [20]:
# Dev split (named `test_*` in this notebook): read the raw behaviors file,
# then derive the labelled frame from it.
env = 'MINDsmall_dev'
test_behaviors_path = f'../data/{env}/behaviors.tsv'

test_df_raw = load_behavior_data(test_behaviors_path)
test_df = behavior_to_user_item_pair_w_label(test_df_raw)

# Last expression of the cell: show the shape of the derived frame.
test_df.shape

(2740998, 3)

#### Ratio of shared users and news

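If the ratio of users and news shared between the training and validation sets is low, embeddings built purely on user–news interactions won't work, because most validation users and news never appear in training.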

In [15]:
# Distinct user ids present in each split.
user_set_train = set(train_df[USER_ID].unique().tolist())
user_set_test = set(test_df[USER_ID].unique().tolist())

# Users seen in both splits vs. users seen in either split.
size_inter = len(user_set_train & user_set_test)
size_union = len(user_set_train | user_set_test)

# The reported percentage is intersection over union of the two user sets.
print(
    f'number of users in common between train and validation:{size_inter}',
    f'number of users in total among train and validation:{size_union}',
    f'shared users percentage {size_inter/size_union*100:.2f}%',
    sep='\n',
)

number of users in common between train and validation:5943
number of users in total among train and validation:94057
shared users percentage 6.32%


In [16]:
# Distinct news ids present in each split.
news_set_train = set(train_df[NEWS_ID].unique().tolist())
news_set_test = set(test_df[NEWS_ID].unique().tolist())

# News seen in both splits vs. news seen in either split.
size_inter = len(news_set_train & news_set_test)
size_union = len(news_set_train | news_set_test)

# The reported percentage is intersection over union of the two news sets.
print(
    f'number of news in common between train and validation:{size_inter}',
    f'number of news in total among train and validation:{size_union}',
    f'shared news percentage {size_inter/size_union*100:.2f}%',
    sep='\n',
)

number of news in common between train and validation:2886
number of news in total among train and validation:22771
shared news percentage 12.67%


The ratio is indeed pretty low, so it is very likely that models built purely on user–news interactions won't work.

As a sanity check, I trained a simple matrix factorization (MF) model. While its AUC is around 0.7 on the training data, the AUC on the test data is 0.5, i.e. no better than random guessing, which means the model has learned nothing that generalizes.

#### Number of clicks per impression

In [22]:
def count_click(impressions):
    """Return the number of clicked items in one impression string.

    Assumes the MIND behaviors format, where the string is a
    whitespace-separated list of ``<news_id>-<label>`` items and the label
    is ``1`` for a click and ``0`` otherwise (e.g. ``"N1-1 N2-0"``).

    Args:
        impressions: The impression string of a single behavior row.

    Returns:
        The sum of the integer labels. An empty or whitespace-only string
        yields 0.

    Raises:
        ValueError: If an item has no integer label after its last ``-``
            (e.g. an unlabeled item such as ``"N12345"``).
    """
    # split() without an argument drops empty tokens, so empty strings and
    # repeated/trailing spaces do not produce items that crash on indexing.
    # The label is taken from after the last '-' rather than from the last
    # character, so an unlabeled id fails loudly instead of having its final
    # digit counted as clicks.
    return sum(int(item.rpartition('-')[2]) for item in impressions.split())

# Per-row click count for the raw training behaviors (one value per impression row).
train_df_raw[IMPRESSIONS].map(count_click)

0         1
1         1
2         1
3         1
4         1
         ..
156960    1
156961    1
156962    1
156963    5
156964    1
Name: impressions, Length: 156965, dtype: int64

In [23]:
# Distribution of clicks per impression in the training split:
# index = number of clicks, value = number of impression rows with that count.
(
    train_df_raw[IMPRESSIONS]
    .map(count_click)
    .value_counts()
)

1     113888
2      25571
3       9263
4       3975
5       1957
6        942
7        515
8        296
9        198
10       117
11        81
12        46
13        38
14        22
15        17
16        10
18         9
17         6
19         2
21         2
26         2
31         1
20         1
22         1
23         1
24         1
25         1
27         1
35         1
Name: impressions, dtype: int64

In [24]:
# Distribution of clicks per impression in the dev split:
# index = number of clicks, value = number of impression rows with that count.
(
    test_df_raw[IMPRESSIONS]
    .map(count_click)
    .value_counts()
)

1     52067
2     12707
3      4443
4      1911
5       932
6       426
7       268
8       166
9        83
10       61
11       34
12       17
13       11
16        6
14        5
15        5
17        3
18        2
19        2
21        1
20        1
24        1
Name: impressions, dtype: int64