In [1]:
import os
import pandas as pd
from tqdm.auto import tqdm

import src.constants as constants
from src.common_utils import read_pickled_data
from src.data.preprocessing_news_utils import read_data_news, preprocess_data_news, concat_data_news
from src.data.preprocessing_behaviors_utils import read_data_behaviors, preprocess_data_behaviors, get_time_data, concat_data_behaviors

# Enable tqdm's pandas integration (progress-bar variants of apply, e.g. `progress_apply`).
# NOTE(review): presumably required by the preprocessing helpers imported above — confirm.
tqdm.pandas()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dvesely\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dvesely\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Behaviors

### Dimensions and Columns

We examine the behavior data dimensions and columns of each split. For a detailed explanation of how these splits were constructed, refer to the accompanying work. All splits have the same set of columns, but the test set does not include labels in the `impression` column.

In [44]:
# Report dimensions and column names of the raw behaviors data, one line per split
splits = ["train", "dev", "test"]
raw_split_paths = [constants.TRAIN_PATH, constants.DEV_PATH, constants.TEST_PATH]
for split_name, path in zip(splits, raw_split_paths):
    data_behaviors = read_data_behaviors(path)
    column_names = data_behaviors.columns.to_list()
    print(f"{split_name} data shape: {data_behaviors.shape} -- data columns: {column_names}")

train data shape: (2232748, 5) -- data columns: ['id', 'user_id', 'time', 'history', 'impression']
dev data shape: (376471, 5) -- data columns: ['id', 'user_id', 'time', 'history', 'impression']
test data shape: (2370727, 5) -- data columns: ['id', 'user_id', 'time', 'history', 'impression']


The following two cells show samples from the labeled data and unlabeled data.

In [3]:
# The dev split serves as the labeled example; preview its first two rows
labeled_sample_path = constants.DEV_PATH
data_behaviors_labeled = read_data_behaviors(labeled_sample_path)
data_behaviors_labeled.head(n=2)

Unnamed: 0,id,user_id,time,history,impression
0,1,U134050,11/15/2019 8:55:22 AM,N12246 N128820 N119226 N4065 N67770 N33446 N10...,N91737-0 N30206-0 N54368-0 N117802-0 N18190-0 ...
1,2,U254959,11/15/2019 11:42:35 AM,N34011 N9375 N67397 N7936 N118985 N109453 N103...,N119999-0 N24958-0 N104054-0 N33901-0 N9250-0 ...


In [4]:
# The test split serves as the unlabeled example; preview its first two rows
unlabeled_sample_path = constants.TEST_PATH
data_behaviors_unlabeled = read_data_behaviors(unlabeled_sample_path)
data_behaviors_unlabeled.head(n=2)

Unnamed: 0,id,user_id,time,history,impression
0,1,U64099,11/19/2019 11:37:45 AM,N121133 N104200 N43255 N55860 N128965 N38014 N...,N101071 N15647 N83400 N124838 N57092 N64623 N6...
1,2,U231077,11/19/2019 5:28:08 AM,N45124 N84730 N45128 N104312 N70022 N99111 N26...,N14657 N51253 N49521 N126571 N74286 N101071 N1...


### Concatenations

We create concatenated sets of the data for further use. Namely:
- A full training set (`trainfull`), containing training and dev data. The data is pickled to `CONCAT_TRAINFULL_PATH`.
- A full dataset (`all`), containing all splits. The data is pickled to `CONCAT_ALL_PATH`.

In [3]:
# Load the raw behaviors data of every split
data_behaviors_train = read_data_behaviors(constants.TRAIN_PATH)
data_behaviors_dev = read_data_behaviors(constants.DEV_PATH)
data_behaviors_test = read_data_behaviors(constants.TEST_PATH)

# trainfull = train + dev, stored under CONCAT_TRAINFULL_PATH
data_behaviors_trainfull_list = [data_behaviors_train, data_behaviors_dev]
data_behaviors_trainfull = concat_data_behaviors(
    data_behaviors_trainfull_list, save_dir=constants.CONCAT_TRAINFULL_PATH
)
trainfull_shape = data_behaviors_trainfull.shape
print(f"concatenated full training data shape: {trainfull_shape}")

# all = trainfull + test, stored under CONCAT_ALL_PATH
data_behaviors_list = [data_behaviors_trainfull, data_behaviors_test]
data_behaviors = concat_data_behaviors(
    data_behaviors_list, save_dir=constants.CONCAT_ALL_PATH
)
all_shape = data_behaviors.shape
print(f"concatenated full data shape: {all_shape}")

concatenated full training data shape: (2609219, 5)
concatenated full data shape: (4979946, 5)


### Preprocessing

The main preprocessing routine is started in the cells below. The first cell runs the preprocessing on the training data (`train`, `dev`, `trainfull`), the second cell on the `test` data. All preprocessed data is pickled into the same directory as the original data, in a subdirectory `./preprocessed`. Exploratory data has an `exp_` prefix in the filename.

In [3]:
training_data_paths = [constants.TRAIN_PATH, constants.DEV_PATH, constants.CONCAT_TRAINFULL_PATH]
split_names = ["TRAIN DATA", "DEV DATA", "TRAINFULL DATA"]
# Only the concatenated trainfull data (last entry) is stored as a pickle;
# train and dev are read from the raw behaviors files.
stored_as_pickle = [False, False, True]

for split_name, path, is_pickled in zip(split_names, training_data_paths, stored_as_pickle):
    print(f"==== PREPROCESSING {split_name} ====")
    data_behaviors = (
        read_pickled_data([path, "behaviors_concat.pkl"])
        if is_pickled
        else read_data_behaviors(path)
    )

    # Full run: also produce exploratory data, relevant news and user histories
    pp_data_behaviors = preprocess_data_behaviors(
        data_behaviors,
        save_dir=path,
        test_data=False,
        exploration=True,
        get_relevant_news=True,
        save_user_histories=True,
    )

==== PREPROCESSING TRAIN DATA ====
[INFO] preprocessed data will be saved in ../dataset_MIND\MINDlarge_train\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 2232748/2232748 [00:28<00:00, 78707.25it/s] 


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 2232748/2232748 [00:13<00:00, 167985.80it/s]


[INFO] saving preprocessed data
[INFO] obtaining exploration data
[INFO] saving exploratory data
[INFO] collecting set of relevant news IDs
[INFO] saving relevant news data
==== PREPROCESSING DEV DATA ====
[INFO] preprocessed data will be saved in ../dataset_MIND\MINDlarge_dev\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 376471/376471 [00:05<00:00, 63264.53it/s] 


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 376471/376471 [00:01<00:00, 371042.18it/s]


[INFO] saving preprocessed data
[INFO] obtaining exploration data
[INFO] saving exploratory data
[INFO] collecting set of relevant news IDs
[INFO] saving relevant news data
==== PREPROCESSING TRAINFULL DATA ====
[INFO] preprocessed data will be saved in ../dataset_MIND\MINDlarge_trainfull\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 2609219/2609219 [00:35<00:00, 73828.11it/s] 


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 2609219/2609219 [00:13<00:00, 194239.83it/s]


[INFO] saving preprocessed data
[INFO] obtaining exploration data
[INFO] saving exploratory data
[INFO] collecting set of relevant news IDs
[INFO] saving relevant news data


In [3]:
training_data_paths = [constants.TRAIN_PATH, constants.DEV_PATH, constants.CONCAT_TRAINFULL_PATH]
split_names = ["TRAIN DATA", "DEV DATA", "TRAINFULL DATA"]
# Only the concatenated trainfull data (last entry) is stored as a pickle;
# train and dev are read from the raw behaviors files.
stored_as_pickle = [False, False, True]

for split_name, path, is_pickled in zip(split_names, training_data_paths, stored_as_pickle):
    print(f"==== PREPROCESSING {split_name} ====")
    data_behaviors = (
        read_pickled_data([path, "behaviors_concat.pkl"])
        if is_pickled
        else read_data_behaviors(path)
    )

    # Variant run: only build the ignore history and save it under a separate
    # file name suffix; all other optional outputs are switched off.
    pp_data_behaviors = preprocess_data_behaviors(
        data_behaviors,
        save_dir=path,
        save_name_suffix="_ignore_history",
        test_data=False,
        exploration=False,
        get_relevant_news=False,
        save_user_histories=False,
        build_ignore_history=True,
    )

==== PREPROCESSING TRAIN DATA ====
[INFO] preprocessed data will be saved in ../../dataset_MIND\MINDlarge_train\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 2232748/2232748 [00:29<00:00, 75534.66it/s] 


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 2232748/2232748 [00:18<00:00, 122059.40it/s]


[INFO] saving preprocessed data
==== PREPROCESSING DEV DATA ====
[INFO] preprocessed data will be saved in ../../dataset_MIND\MINDlarge_dev\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 376471/376471 [00:02<00:00, 151578.69it/s]


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 376471/376471 [00:01<00:00, 320118.63it/s]


[INFO] saving preprocessed data
==== PREPROCESSING TRAINFULL DATA ====
[INFO] preprocessed data will be saved in ../../dataset_MIND\MINDlarge_trainfull\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] separating impression column


100%|██████████| 2609219/2609219 [00:34<00:00, 75997.50it/s] 


[INFO] sorting data
[INFO] processing user histories


100%|██████████| 2609219/2609219 [00:24<00:00, 107369.26it/s]


[INFO] saving preprocessed data


In [4]:
data_behaviors = read_data_behaviors(constants.TEST_PATH)

print("==== PREPROCESSING TEST DATA ====")
# Test data is unlabeled: no exploration and no user histories, only relevant news
test_preprocessing_options = {
    "save_dir": constants.TEST_PATH,
    "test_data": True,
    "exploration": False,
    "get_relevant_news": True,
    "save_user_histories": False,
}
pp_data_behaviors_test = preprocess_data_behaviors(data_behaviors, **test_preprocessing_options)

==== PREPROCESSING TEST DATA ====
[INFO] preprocessed data will be saved in ../dataset_MIND\MINDlarge_test\preprocessed
[INFO] converting timestamp data
[INFO] splitting reading history
[INFO] splitting impression column
[INFO] saving preprocessed data
[INFO] collecting set of relevant news IDs
[INFO] saving relevant news data


### Preprocessed Data NaN Check

Simple sanity check to confirm that the data contains no `NaN` values.

In [6]:
# Count missing values per column in the preprocessed behaviors data of each split.
# Every report is labeled with its split name so the outputs can be told apart.
splits = ["train", "dev", "test"]
split_paths = [constants.TRAIN_PATH, constants.DEV_PATH, constants.TEST_PATH]
for split_name, path in zip(splits, split_paths):
    data_behaviors = read_pickled_data([path, "preprocessed", "behaviors.pkl"])
    print(f"{split_name} NaN counts:\n{data_behaviors.isna().sum()}")

id              0
user_id         0
history         0
timestamp       0
clicked_news    0
ignored_news    0
shown_news      0
dtype: int64
id              0
user_id         0
history         0
timestamp       0
clicked_news    0
ignored_news    0
shown_news      0
dtype: int64
id            0
user_id       0
history       0
timestamp     0
shown_news    0
dtype: int64


### Concatenated Preprocessed Data

The following cell concatenates the preprocessed data over all splits. This will be needed for obtaining survival data for each news during the news preprocessing.

In [3]:
# Create the output folder for the concatenated, preprocessed data.
# exist_ok=True keeps the cell re-runnable and avoids a separate existence check.
save_dir = os.path.join(constants.CONCAT_ALL_PATH, "preprocessed")
os.makedirs(save_dir, exist_ok=True)

# Load all splits and only keep the shown_news and timestamp columns.
# Only these two columns are needed for obtaining survival data;
# the concatenated data is otherwise unused.
survival_columns = ["shown_news", "timestamp"]
data_behaviors_train = read_pickled_data([constants.TRAIN_PATH, "preprocessed", "behaviors.pkl"])[survival_columns]
data_behaviors_dev = read_pickled_data([constants.DEV_PATH, "preprocessed", "behaviors.pkl"])[survival_columns]
data_behaviors_test = read_pickled_data([constants.TEST_PATH, "preprocessed", "behaviors.pkl"])[survival_columns]
data_behaviors_list = [
    data_behaviors_train,
    data_behaviors_dev,
    data_behaviors_test
]

# Concatenate all splits and store the result in save_dir
data_behaviors = concat_data_behaviors(data_behaviors_list, save_dir=save_dir)
print(f"concatenated full data shape: {data_behaviors.shape}")

concatenated full data shape: (4979946, 2)


### Data Heads

The following cells show examples of the preprocessed data. For labeled data, the `dev` split is shown, because it is small and thus loads quickly.

Examples of the minimal preprocessed behaviors data (without additional columns with exploratory data). 

In [14]:
# Minimal preprocessed behaviors of the dev split (labeled)
labeled_pickle = [constants.DEV_PATH, "preprocessed", "behaviors.pkl"]
data_behaviors_labeled = read_pickled_data(labeled_pickle)
data_behaviors_labeled.head(n=2)

Unnamed: 0,id,user_id,history,timestamp,clicked_news,ignored_news,shown_news
2782,2783,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 08:13:43,[N69938],"[N19162, N83491, N121138, N94999, N44453, N807...","[N19162, N83491, N121138, N94999, N44453, N807..."
46868,46869,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 10:36:07,[N104644],"[N121138, N7728, N56565, N69938, N4331, N20802...","[N121138, N104644, N7728, N56565, N69938, N433..."


In [4]:
# Preprocessed dev behaviors that additionally carry the `ignore_history` column
ignore_history_pickle = [constants.DEV_PATH, "preprocessed", "behaviors_ignore_history.pkl"]
data_behaviors_labeled = read_pickled_data(ignore_history_pickle)
data_behaviors_labeled.head(n=2)

Unnamed: 0,id,user_id,history,timestamp,clicked_news,ignored_news,shown_news,ignore_history
2782,2783,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 08:13:43,[N69938],"[N19162, N83491, N121138, N94999, N44453, N807...","[N19162, N83491, N121138, N94999, N44453, N807...",[]
46868,46869,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 10:36:07,[N104644],"[N121138, N7728, N56565, N69938, N4331, N20802...","[N121138, N104644, N7728, N56565, N69938, N433...","[N19162, N83491, N121138, N94999, N44453, N807..."


The preprocessed `test` split of the behaviors data has no `clicked_news` and `ignored_news` columns (the labels are not available).

In [5]:
# Preprocessed behaviors of the (unlabeled) test split
test_pickle = [constants.TEST_PATH, "preprocessed", "behaviors.pkl"]
data_behaviors_test = read_pickled_data(test_pickle)
data_behaviors_test.head(n=2)

Unnamed: 0,id,user_id,history,timestamp,shown_news
0,1,U64099,"[N121133, N104200, N43255, N55860, N128965, N3...",2019-11-19 11:37:45,"[N101071, N15647, N83400, N124838, N57092, N64..."
1,2,U231077,"[N45124, N84730, N45128, N104312, N70022, N991...",2019-11-19 05:28:08,"[N14657, N51253, N49521, N126571, N74286, N101..."


Examples of the preprocessed labeled data with additional exploratory columns.

In [6]:
# Exploratory variant of the preprocessed dev behaviors (`exp_` prefix)
exp_behaviors_pickle = [constants.DEV_PATH, "preprocessed", "exp_behaviors.pkl"]
data_behaviors_exp = read_pickled_data(exp_behaviors_pickle)
data_behaviors_exp.head(n=2)

Unnamed: 0,id,user_id,history,timestamp,clicked_news,ignored_news,shown_news,clicked_news_length,ignored_news_length,shown_news_length,clicked_news_percent,ignored_news_percent,ts_date,ts_time,ts_hour,time_category
2782,2783,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 08:13:43,[N69938],"[N19162, N83491, N121138, N94999, N44453, N807...","[N19162, N83491, N121138, N94999, N44453, N807...",1,15,16,6.25,93.75,2019-11-15,08:13:43,8,morning
46868,46869,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",2019-11-15 10:36:07,[N104644],"[N121138, N7728, N56565, N69938, N4331, N20802...","[N121138, N104644, N7728, N56565, N69938, N433...",1,16,17,5.882353,94.117647,2019-11-15,10:36:07,10,morning


Data containing numeric value stats for certain exploratory columns.

In [12]:
# Numeric summary statistics of the exploratory columns for the dev split
exp_stats_pickle = [constants.DEV_PATH, "preprocessed", "exp_stats.pkl"]
data_stats = read_pickled_data(exp_stats_pickle)
data_stats

Unnamed: 0,clicked_news_length,ignored_news_length,shown_news_length,clicked_news_percent,ignored_news_percent
count,376471.0,376471.0,376471.0,376471.0,376471.0
mean,1.52693,35.887789,37.41472,10.014902,89.985098
std,1.169522,39.174353,39.62298,11.596071,11.596071
min,1.0,1.0,2.0,0.334448,20.0
25%,1.0,9.0,10.0,2.941176,87.804878
50%,1.0,21.0,23.0,5.882353,94.117647
75%,2.0,49.0,51.0,12.195122,97.058824
95%,4.0,117.0,119.0,50.0,98.837209
99%,6.0,182.0,185.0,50.0,99.270073
max,39.0,298.0,299.0,80.0,99.665552


Data containing initial user histories. The non-exploratory version of this dataframe just omits the `history_length` column.

In [3]:
# Initial user histories of the trainfull data, exploratory variant
exp_users_pickle = [constants.CONCAT_TRAINFULL_PATH, "preprocessed", "exp_users.pkl"]
data_users_exp = read_pickled_data(exp_users_pickle)
data_users_exp.head(n=5)

Unnamed: 0,user_id,history,history_length
0,U0,"[N39011, N112324, N78884, N111503, N63941, N68...",8
1,U1,"[N14639, N27258, N63237, N112729, N42180, N109...",72
2,U10,"[N85722, N111503, N104737]",3
3,U100,"[N99587, N61339, N129790, N12721, N100405, N10...",43
4,U1000,"[N33446, N20131, N65823, N65823, N111503, N399...",9


In [4]:
# Distribution of the initial history lengths, including the upper tail (90% / 99%)
history_length_percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
data_users_exp["history_length"].describe(percentiles=history_length_percentiles)

count    750434.000000
mean         18.313292
std          23.889606
min           0.000000
25%           5.000000
50%          10.000000
75%          22.000000
90%          42.000000
99%         117.000000
max         801.000000
Name: history_length, dtype: float64

### Relevant News

For each news ID that occurs in the corresponding behaviors data, the relevant news data contains two boolean values. `shown` is `True` if the news ID occurred in an impression. `history` is `True` if the news ID occurred in one of the initial user histories. Both can be `True`.

In [5]:
# Print the shape of the relevant news data of every split
splits = ["train", "dev", "trainfull", "test"]
relevant_news_dirs = [
    constants.TRAIN_PATH,
    constants.DEV_PATH,
    constants.CONCAT_TRAINFULL_PATH,
    constants.TEST_PATH,
]
for split_name, path in zip(splits, relevant_news_dirs):
    relevant_news = read_pickled_data([path, "preprocessed", "relevant_news.pkl"])
    print(f"{split_name} relevant news data shape: {relevant_news.shape}")

# `relevant_news` still holds the split loaded last (test)
relevant_news.head(n=5)

train relevant news data shape: (101527, 3)
dev relevant news data shape: (72023, 3)
trainfull relevant news data shape: (104151, 3)
test relevant news data shape: (120961, 3)


Unnamed: 0,news_id,shown,history
0,N129135,False,True
1,N108581,True,True
2,N90655,False,True
3,N114087,False,True
4,N111452,False,True


The next cell displays the number of unique news IDs in each split and over the entire dataset.

In [10]:
splits = ["train", "dev", "trainfull", "test"]
relevant_news_dirs = [
    constants.TRAIN_PATH,
    constants.DEV_PATH,
    constants.CONCAT_TRAINFULL_PATH,
    constants.TEST_PATH,
]
rel_news_sets = []
rel_news_data_list = []

# Collect, per split, the relevant news dataframe and its set of news IDs
for split_name, path in zip(splits, relevant_news_dirs):
    relevant_news = read_pickled_data([path, "preprocessed", "relevant_news.pkl"])
    rel_news_set = set(relevant_news["news_id"])

    rel_news_data_list.append(relevant_news)
    rel_news_sets.append(rel_news_set)
    print(f"There are {len(rel_news_set)} relevant news in the {split_name} set")

# Unique news IDs over all splits
union_rel_news = set().union(*rel_news_sets)
print(f"There are {len(union_rel_news)} relevant news in the union of all sets")

There are 101527 relevant news in the train set
There are 72023 relevant news in the dev set
There are 104151 relevant news in the trainfull set
There are 120961 relevant news in the test set
There are 130379 relevant news in the union of all sets


The next cell calculates the percentage of news IDs that occurred in the dataset only in an impression, only in an initial user history, or both.

In [11]:
# Merge the per-split relevant news dataframes into one row per news ID;
# a flag is True if it was True in any split.
# NOTE: relies on `rel_news_data_list` and `union_rel_news` from the previous cell.
data_relevant_news = pd.concat(rel_news_data_list)
data_relevant_news = data_relevant_news.groupby(by="news_id").any().reset_index()

was_shown = data_relevant_news["shown"]
in_history = data_relevant_news["history"]

# Count news that occurred only in an impression, only in a history, or in both
shown_rel_news = (was_shown & ~in_history).sum()
history_rel_news = (~was_shown & in_history).sum()
both_rel_news = (was_shown & in_history).sum()

# Report each count together with its share of all relevant news
n_relevant = len(union_rel_news)
print(f"{shown_rel_news / n_relevant * 100:.0f}% ({shown_rel_news}) of the relevant news were just shown.")
print(f"{history_rel_news / n_relevant * 100:.0f}% ({history_rel_news}) of the relevant news were just in a user history.")
print(f"{both_rel_news / n_relevant * 100:.0f}% ({both_rel_news}) of the relevant news were both shown and in a user history.")

25% (32871) of the relevant news were just shown.
58% (75798) of the relevant news were just in a user history.
17% (21710) of the relevant news were both shown and in a user history.


### Timestamp Information

The following cell shows the time spans of the individual splits. For a detailed explanation of how these splits were constructed, refer to the accompanying work.

In [18]:
# Print the time information returned by `get_time_data` for every split
splits = ["train", "dev", "trainfull", "test"]
behaviors_dirs = [
    constants.TRAIN_PATH,
    constants.DEV_PATH,
    constants.CONCAT_TRAINFULL_PATH,
    constants.TEST_PATH,
]
for split_name, path in zip(splits, behaviors_dirs):
    data_behaviors = read_pickled_data([path, "preprocessed", "behaviors.pkl"])
    time_data = get_time_data(data_behaviors)
    print(f"{split_name} time data:\n{time_data}")

train time data:
{'min': Timestamp('2019-11-09 00:00:00'), 'max': Timestamp('2019-11-14 23:59:59'), 'span': Timedelta('5 days 23:59:59')}
dev time data:
{'min': Timestamp('2019-11-15 00:00:00'), 'max': Timestamp('2019-11-15 23:59:43'), 'span': Timedelta('0 days 23:59:43')}
trainfull time data:
{'min': Timestamp('2019-11-09 00:00:00'), 'max': Timestamp('2019-11-15 23:59:43'), 'span': Timedelta('6 days 23:59:43')}
test time data:
{'min': Timestamp('2019-11-16 00:00:05'), 'max': Timestamp('2019-11-22 23:59:58'), 'span': Timedelta('6 days 23:59:53')}


### Users

We can count the number of unique users in each split using the users data, containing the initial histories for each unique user in the split. For the test data, we have to use the behaviors data and count the number of unique user IDs.

In [6]:
# train/dev: user IDs come from the users data (one row per user with an initial history)
train_users_data = read_pickled_data([constants.TRAIN_PATH, "preprocessed", "users.pkl"])
dev_users_data = read_pickled_data([constants.DEV_PATH, "preprocessed", "users.pkl"])
train_users = set(train_users_data["user_id"])
dev_users = set(dev_users_data["user_id"])

# test: user IDs are taken from the behaviors data instead
test_behaviors_data = read_pickled_data([constants.TEST_PATH, "preprocessed", "behaviors.pkl"])
test_users = set(test_behaviors_data["user_id"].unique())

for split_name, users in (("train", train_users), ("dev", dev_users), ("test", test_users)):
    print(f"There are {len(users)} users in the {split_name} set")

# Users seen in at least one split vs. users seen in every split
union = train_users | dev_users | test_users
intersection = train_users & dev_users & test_users
print(f"There are {len(union)} users in the union of all sets")
print(f"There are {len(intersection)} users in the intersection of all sets")

There are 711222 users in the train set
There are 255990 users in the dev set
There are 702005 users in the test set
There are 876956 users in the union of all sets
There are 188592 users in the intersection of all sets


# Data News

### Dimensions and Columns

We examine the news dataset split dimensions and columns. Each split contains data for the news that occur in the corresponding behaviors data.

In [8]:
# Report dimensions and column names of the raw news data, one line per split
splits = ["train", "dev", "test"]
raw_split_paths = [constants.TRAIN_PATH, constants.DEV_PATH, constants.TEST_PATH]
for split_name, path in zip(splits, raw_split_paths):
    data_news = read_data_news(path)
    column_names = data_news.columns.to_list()
    print(f"{split_name} data shape: {data_news.shape} -- data columns: {column_names}")

train data shape: (101527, 8) -- data columns: ['news_id', 'category', 'sub_category', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
dev data shape: (72023, 8) -- data columns: ['news_id', 'category', 'sub_category', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
test data shape: (120959, 8) -- data columns: ['news_id', 'category', 'sub_category', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']


### Concatenations

We create a concatenated set of the data for further use. The concatenated set will contain no duplicates.

In [9]:
# Load the raw news data of every split
data_news_train = read_data_news(constants.TRAIN_PATH)
data_news_dev = read_data_news(constants.DEV_PATH)
data_news_test = read_data_news(constants.TEST_PATH)

# Concatenate the splits; the result is stored under CONCAT_ALL_PATH
data_news_list = [data_news_train, data_news_dev, data_news_test]
data_news = concat_data_news(data_news_list, save_dir=constants.CONCAT_ALL_PATH)
concat_shape = data_news.shape
print(f"concatenated data shape: {concat_shape}")

concatenated data shape: (130379, 8)


### Preprocessing

The main preprocessing routine is started in the cells below. Only the concatenated news data is preprocessed. The preprocessed data is pickled into the same directory as the original data, in a subdirectory `./preprocessed`. Exploratory data has an `exp_` prefix in the filename. Data prepared for embedding has an `emb_` prefix in the filename. The preprocessing requires the preprocessed `trainfull` behaviors data and the concatenated, preprocessed behaviors data over all splits.

In [3]:
# Concatenated raw news over all splits
data_news = read_pickled_data([constants.CONCAT_ALL_PATH, "news_concat.pkl"])

# Directories holding the preprocessed behaviors data, keyed by purpose
survival_dir = os.path.join(constants.CONCAT_ALL_PATH, "preprocessed")
engagement_dir = os.path.join(constants.CONCAT_TRAINFULL_PATH, "preprocessed")
behaviors_paths = {"survival": survival_dir, "engagement": engagement_dir}

# Run without exploratory data, but prepare the data for embedding
news_preprocessing_options = {
    "save_dir": constants.CONCAT_ALL_PATH,
    "exploration": False,
    "behaviors_paths": behaviors_paths,
    "embedding": True,
}
data_news = preprocess_data_news(data_news, **news_preprocessing_options)

[INFO] preprocessed data will be saved in ../../dataset_MIND\MINDlarge_all\preprocessed
[INFO] replacing NA abstracts with empty string
[INFO] remapping categories
[INFO] saving
[INFO] dropping columns irrelevant for embedding
[INFO] concatenating title and abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] concatenating all


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] saving data for embedding


In [2]:
# Concatenated raw news over all splits
data_news = read_pickled_data([constants.CONCAT_ALL_PATH, "news_concat.pkl"])

# Directories holding the preprocessed behaviors data, keyed by purpose.
# The engagement directory depends on whether this run is for statistics or for
# training; the alternative choice is:
# engagement_dir = os.path.join(constants.CONCAT_TRAINFULL_PATH, "preprocessed")
survival_dir = os.path.join(constants.CONCAT_ALL_PATH, "preprocessed")
engagement_dir = os.path.join(constants.TRAIN_PATH, "preprocessed")
behaviors_paths = {"survival": survival_dir, "engagement": engagement_dir}

# Run with exploratory data and prepare the data for embedding
news_preprocessing_options = {
    "save_dir": constants.CONCAT_ALL_PATH,
    "exploration": True,
    "behaviors_paths": behaviors_paths,
    "embedding": True,
}
data_news = preprocess_data_news(data_news, **news_preprocessing_options)

[INFO] preprocessed data will be saved in c:\workbench\developer\drlnrs\dataset_MIND\MINDlarge_all\preprocessed
[INFO] replacing NA abstracts with empty string
[INFO] remapping categories
[INFO] saving
[INFO] dropping columns irrelevant for embedding
[INFO] concatenating title and abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] concatenating all


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] saving data for embedding
[INFO] preprocessing title and abstract
	[INFO] lowercasing title
	[INFO] tokenizing title


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] removing punctuation in title


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] cleaning contractions in title


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] removing stopwords in title


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] lowercasing abstract
	[INFO] tokenizing abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] removing punctuation in abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] cleaning contractions in abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

	[INFO] removing stopwords in abstract


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] obtaining length stats for title and abstract
[INFO] loading behaviors data
[INFO] obtaining survival data


  0%|          | 0/4979946 [00:00<?, ?it/s]

[INFO] processing survival data


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] loading behaviors data
[INFO] obtaining engagement data


  0%|          | 0/2232748 [00:00<?, ?it/s]

[INFO] processing engagement data


  0%|          | 0/130379 [00:00<?, ?it/s]

[INFO] saving exploratory data


### Preprocessed Data NaN Check

Simple sanity check for `NaN` values in the data. Only a handful of values are missing in the `url`, `title_entities` and `abstract_entities` columns. Note that there are also 75798 `NaT` values in the `first_read_timestamp` column, which corresponds to the number of news that are just in a user history, and not in an impression.

In [8]:
# Count missing values per column in the exploratory news data
exp_news_pickle = [constants.CONCAT_ALL_PATH, "preprocessed", "exp_news.pkl"]
data_news = read_pickled_data(exp_news_pickle)
nan_counts = data_news.isna().sum()
print(nan_counts)

news_id                                       0
category                                      0
sub_category                                  0
title                                         0
abstract                                      0
url                                           1
title_entities                                6
abstract_entities                             9
title_tokens                                  0
title_tokens_no_stopwords                     0
abstract_tokens                               0
abstract_tokens_no_stopwords                  0
title_length                                  0
title_no_stopwords_length                     0
abstract_length                               0
abstract_no_stopwords_length                  0
title_and_abstract_length                     0
title_and_abstract_no_stopwords_length        0
survival_time_hrs                             0
first_read_timestamp                      75798
clicked                                 

### Data Heads

The following cells show examples of the preprocessed data.

Examples of the minimal preprocessed news data (without additional columns with exploratory data).

In [4]:
# Minimal preprocessed news data over all splits
news_pickle = [constants.CONCAT_ALL_PATH, "preprocessed", "news.pkl"]
data_news = read_pickled_data(news_pickle)
data_news.head(n=2)

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."


Examples of the preprocessed data with additional exploratory columns.

In [14]:
# Exploratory variant of the preprocessed news data (`exp_` prefix)
exp_news_pickle = [constants.CONCAT_ALL_PATH, "preprocessed", "exp_news.pkl"]
data_news = read_pickled_data(exp_news_pickle)
data_news.head(n=2)

Unnamed: 0,news_id,category,sub_category,title,abstract,url,title_entities,abstract_entities,title_tokens,title_tokens_no_stopwords,...,abstract_length,abstract_no_stopwords_length,title_and_abstract_length,title_and_abstract_no_stopwords_length,survival_time_hrs,first_read_timestamp,clicked,ignored,shown,engagement_percentage
0,N88753,lifestyle,lifestyleroyals,"the brands queen elizabeth, prince charles, an...","shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],"[the, brands, queen, elizabeth, prince, charle...","[brands, queen, elizabeth, prince, charles, pr...",...,12,7,23,15,161.186389,2019-11-11 07:55:42,0,1,1,0.0
1,N45436,news,newsscienceandtechnology,walmart slashes prices on last-generation ipads,apple's new ipad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[walmart, slashes, prices, on, last-generation...","[walmart, slashes, prices, last-generation, ip...",...,11,10,17,15,0.0,NaT,0,0,0,0.0


Examples of the preprocessed data prepared for embedding tasks.

In [5]:
# News data prepared for embedding tasks (`emb_` prefix)
emb_news_pickle = [constants.CONCAT_ALL_PATH, "preprocessed", "emb_news.pkl"]
data_news = read_pickled_data(emb_news_pickle)
data_news.head(n=2)

Unnamed: 0,news_id,title,abstract,title_and_abstract
0,N88753,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","The Brands Queen Elizabeth, Prince Charles, an..."
1,N45436,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,Walmart Slashes Prices on Last-Generation iPad...


### All/Relevant News

We construct a set of unique news IDs (the IDs in the news data should already be unique). We can then check whether all news we have data for occur in the behaviors data and vice versa.

In [8]:
# NOTE: relies on `data_news` (loaded in an earlier cell) and on `union_rel_news`
# (built in the "Relevant News" section) still being present in the kernel.
unique_news_ids = data_news["news_id"].unique()
all_news = set(unique_news_ids)
print(f"There are {len(all_news)} news in the concatenated dataset (train+dev+test)")

# News we have data for that never occur in the behaviors data
irrelevant_news = all_news - union_rel_news
print(f"There are {len(irrelevant_news)} irrelevant news in the data")

sets_are_equal = all_news == union_rel_news
print(f"The sets of all news and all relevant news are equal: {sets_are_equal}")

There are 130379 news in the concatenated dataset (train+dev+test)
There are 0 irrelevant news in the data
The sets of all news and all relevant news are equal: True
