## Data creation Notebook for KRED: Knowledge-Aware Document Representation for News Recommendations

This notebook builds the pickle file that is needed to train and test the KRED architecture.

> **Note:** The generated file is named "data_dict_{timestamp}.pkl". It is written through gzip, so it must be opened with `gzip.open` before unpickling.


In [1]:
import time
import torch
import os
from utils import *
from train_test import *
import argparse
from parse_config import ConfigParser

In [3]:
# The download part has been removed since it is deprecated

# Dataset variant label and the root folder that holds the dataset sub-folders.
MIND_type = 'demo'
data_path = "/datasets/"


def _dataset_file(folder, filename):
    """Return the path of `filename` inside `folder` under `data_path`."""
    return os.path.join(data_path, folder, filename)


# Training split
train_news_file = _dataset_file('mind_train', 'news.tsv')
train_behaviors_file = _dataset_file('mind_train', 'behaviors.tsv')

# Validation split
valid_news_file = _dataset_file('mind_val', 'news.tsv')
valid_behaviors_file = _dataset_file('mind_val', 'behaviors.tsv')

# Knowledge-graph files
knowledge_graph_file = _dataset_file('mind_kg', 'wikidata-graph.tsv')
entity_embedding_file = _dataset_file('mind_kg', 'entity2vecd100.vec')
relation_embedding_file = _dataset_file('mind_kg', 'relation2vecd100.vec')

In [4]:
# Config Generation
# Build the command-line parser and hand it to ConfigParser to produce `config`.

parser = argparse.ArgumentParser(description='KRED')
# NOTE(review): '-f' is declared but its value is never read in this notebook;
# presumably it absorbs the `-f <connection file>` argument that Jupyter passes
# to the kernel so that argument parsing does not fail — confirm.
parser.add_argument('-f')
# The help text now states the real default instead of "None".
parser.add_argument('-c', '--config', default="./config.json", type=str,
                    help='config file path (default: ./config.json)')
parser.add_argument('-r', '--resume', default=None, type=str,
                    help='path to latest checkpoint (default: None)')
parser.add_argument('-d', '--device', default=None, type=str,
                    help='indices of GPUs to enable (default: all)')

config = ConfigParser.from_args(parser)

In [5]:
# NOTE(review): the saved output of this cell is
# "ModuleNotFoundError: No module named 'utils.util_new'", so the notebook did
# not get past this point in its last recorded run. Presumably the failing
# import is inside utils/cleaner.py — confirm that module exists before running.
from utils.cleaner import *
# Rebinds `config` to the result of cleaner(); re-running this cell passes the
# already-processed config through cleaner() again.
config = cleaner(config)

ModuleNotFoundError: No module named 'utils.util_new'

In [None]:
# Two steps: the result of entities_news(config) is the second argument of
# entity_to_id, whose return value becomes the entity -> id mapping.
news_entities = entities_news(config)
entity2id_dict = entity_to_id(config, news_entities)

In [None]:
# Start the embedding list with a single all-zero vector whose length is the
# configured entity embedding dimension (presumably the padding slot at
# index 0 — TODO confirm against construct_embedding_mind).
entity_embedding = [np.zeros(config['model']['entity_embedding_dim'])]
entity2embedding_dict = {}

# The helper receives the list and dict created above and returns the
# (possibly extended) versions together with the relation embeddings.
(
    entity2embedding_dict,
    entity_embedding,
    relation_embedding,
) = construct_embedding_mind(
    config,
    entity2id_dict,
    entity_embedding,
    entity2embedding_dict,
)

In [None]:
# Build the entity and relation adjacency structures from the config and the
# two entity mappings computed above.
entity_adj, relation_adj = construct_adj_mind(
    config,
    entity2id_dict,
    entity2embedding_dict,
)

In [None]:
# Every value that occurs anywhere in the adjacency rows ...
adjacency_ids = {neighbor for neighbors in entity_adj for neighbor in neighbors}
# ... minus the ids already present in entity2id_dict.
known_ids = set(entity2id_dict.values())
entities_not_embedded = adjacency_ids - known_ids

# Resolve the missing ids and run them through the embedding builder as well.
# Note that relation_embedding is reassigned by this second call.
entity2id_dict_not_embedded = id_to_entity(config, entities_not_embedded)
(
    entity2embedding_dict,
    entity_embedding,
    relation_embedding,
) = construct_embedding_mind(
    config,
    entity2id_dict_not_embedded,
    entity_embedding,
    entity2embedding_dict,
)

# Add the new entities to the dictionary
entity2id_dict.update(entity2id_dict_not_embedded)
# Invert the dictionary
id2entity_dict = {entity_id: entity for entity, entity_id in entity2id_dict.items()}

In [None]:
# Rewrite every adjacency entry in place: id -> entity (id2entity_dict) ->
# value stored in entity2embedding_dict. Row 0 is deliberately skipped.
# NOTE: this mutates entity_adj, so the cell is not safe to run twice.
for row_index in range(1, len(entity_adj)):
    row = entity_adj[row_index]
    for col_index, entity_id in enumerate(row):
        row[col_index] = entity2embedding_dict[id2entity_dict[entity_id]]

# Convert the accumulated embeddings to float tensors.
entity_embedding_array = np.array(entity_embedding)
relation_embedding_array = np.array(relation_embedding)
entity_embedding = torch.FloatTensor(entity_embedding_array)
relation_embedding = torch.FloatTensor(relation_embedding_array)

In [None]:
# No embedding folder is supplied to the feature builder.
embedding_folder = None
(
    news_feature,
    max_entity_freq,
    max_entity_pos,
    max_entity_type,
) = build_news_features_mind(config, entity2embedding_dict, embedding_folder)

# Load the user history
user_history = build_user_history(config)

In [None]:
# Each builder is called with the config and its result is unpacked into a
# pair of datasets; the names below are the ones stored in the output dict.
(train_data, dev_data) = get_user2item_data(config)
(vert_train, vert_test) = build_vert_data(config)
(pop_train, pop_test) = build_pop_data(config)
(item2item_train, item2item_test) = build_item2item_data(config)

In [None]:
# Gather every artefact produced above into the dictionary that gets pickled.
# Keys are inserted in the same order as before.
data_dict = {}

# User history, embeddings and adjacency structures
data_dict['user_history'] = user_history
data_dict['entity_embedding'] = entity_embedding
data_dict['relation_embedding'] = relation_embedding
data_dict['entity_adj'] = entity_adj
data_dict['relation_adj'] = relation_adj

# News features and their maxima
data_dict['news_feature'] = news_feature
data_dict['max_entity_freq'] = max_entity_freq
data_dict['max_entity_pos'] = max_entity_pos
data_dict['max_entity_type'] = max_entity_type

# Dataset pairs
data_dict['train_data'] = train_data
data_dict['dev_data'] = dev_data
data_dict['vert_train'] = vert_train
data_dict['vert_test'] = vert_test
data_dict['pop_train'] = pop_train
data_dict['pop_test'] = pop_test
data_dict['item2item_train'] = item2item_train
data_dict['item2item_test'] = item2item_test

In [None]:
import gzip
import pickle


def save_compressed_pickle(filename, obj):
    """Pickle `obj` into a gzip-compressed file at `filename`.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.
    """
    with gzip.open(filename, 'wb') as compressed_file:
        pickle.dump(obj, compressed_file)


# Current Unix time in whole milliseconds, used to tag the output file name.
timestamp = int(time.time()*1000)
save_compressed_pickle(f'data_dict_{timestamp}.pkl', data_dict)