In [1]:
!git pull

Username for 'https://github.com': ^C


In [1]:
#!conda install pytorch torchvision torchaudio -c pytorch --yes

import torch

In [2]:
!ls

DNLP_project.ipynb  config.yaml		  main.py	   requirements.txt
KG_part.ipynb	    embedding_part.ipynb  model		   train_test.py
README.md	    framework.PNG	  out		   trainer
__init__.py	    kred_example.ipynb	  parse_config.py  utils
base		    logger		  req.txt


In [3]:
import os

from utils.util import *
from train_test import *

In [4]:
!ls /datasets/mind_kg

description.txt     entity_adj.npy   relation2vecd100.vec  wikidata-graph.tsv
entity2id.txt	    label.txt	     relation_adj.npy
entity2vecd100.vec  relation2id.txt  triple2id.txt


In [5]:
!ls /datasets/mind_train

behaviors.tsv  entity_embedding.vec  news.tsv  relation_embedding.vec


In [6]:
!ls /datasets/mind_val

behaviors.tsv  entity_embedding.vec  news.tsv  relation_embedding.vec


In [7]:
# Dataset locations. Everything lives under `data_path`: one directory per
# MIND split plus one for the knowledge-graph resources.
MIND_type = 'small'
data_path = "/datasets/"

train_dir = os.path.join(data_path, 'mind_train')
valid_dir = os.path.join(data_path, 'mind_val')
kg_dir = os.path.join(data_path, 'mind_kg')

train_news_file = os.path.join(train_dir, 'news.tsv')
train_behaviors_file = os.path.join(train_dir, 'behaviors.tsv')
valid_news_file = os.path.join(valid_dir, 'news.tsv')
valid_behaviors_file = os.path.join(valid_dir, 'behaviors.tsv')
knowledge_graph_file = os.path.join(kg_dir, 'wikidata-graph.tsv')
entity_embedding_file = os.path.join(kg_dir, 'entity2vecd100.vec')
relation_embedding_file = os.path.join(kg_dir, 'relation2vecd100.vec')

# The helper returns four values for the chosen MIND size; the fourth is not
# needed here.
mind_url, mind_train_dataset, mind_dev_dataset, _ = get_mind_data_set(MIND_type)

# Download a split only when its news file is missing. The download target is
# the same directory the existence check inspects; previously the archives
# went to `train` / `valid`, so the check could never be satisfied and the
# files ended up where no path above points.
# NOTE(review): assumes download_deeprec_resources leaves the extracted files
# directly inside the directory it is given -- confirm against the helper.
if not os.path.exists(train_news_file):
    download_deeprec_resources(mind_url, train_dir, mind_train_dataset)

if not os.path.exists(valid_news_file):
    download_deeprec_resources(mind_url, valid_dir, mind_dev_dataset)

# Base URL of the knowledge-graph resources; not used by the visible cells.
kg_url = "https://kredkg.blob.core.windows.net/wikidatakg/"


In [18]:
import sys
import os
#sys.path.append('/content/KRED_LAJ')

import argparse
from parse_config import ConfigParser


# Build the KRED argument parser and hand it to the project's ConfigParser,
# which produces the `config` object used by every later cell.
parser = argparse.ArgumentParser(description='KRED')

# Presumably absorbs the `-f <connection file>` argument the notebook kernel
# is launched with, so that argument parsing does not abort inside Jupyter.
parser.add_argument('-f')

# NOTE(review): the default below is ./config.json, but the directory listing
# in this notebook shows config.yaml and no config.json -- confirm which file
# ConfigParser actually reads.
parser.add_argument('-c', '--config', default="./config.json", type=str,
                    help='config file path (default: ./config.json)')
parser.add_argument('-r', '--resume', default=None, type=str,
                    help='path to latest checkpoint (default: None)')
parser.add_argument('-d', '--device', default=None, type=str,
                    help='indices of GPUs to enable (default: all)')

config = ConfigParser.from_args(parser)

In [19]:
# Display the `data` section of the loaded configuration (the input file paths).
# NOTE(review): the displayed 'relation_index' path ends in 'realtion2id.txt',
# while the /datasets/mind_kg listing shows 'relation2id.txt' -- looks like a
# typo in the config file; confirm before relying on that entry.
config['data']

{'train_news': '/datasets/mind_train/news.tsv',
 'train_behavior': '/datasets/mind_train/behaviors.tsv',
 'valid_news': '/datasets/mind_val/news.tsv',
 'valid_behavior': '/datasets/mind_val/behaviors.tsv',
 'knowledge_graph': '/datasets/mind_kg/triple2id.txt',
 'entity_embedding': '/datasets/mind_kg/entity2vecd100.vec',
 'relation_embedding': '/datasets/mind_kg/relation2vecd100.vec',
 'entity_index': '/datasets/mind_kg/entity2id.txt',
 'relation_index': '/datasets/mind_kg/realtion2id.txt',
 'sparse_adj_entity': '/datasets/mind_kg/entity_adj.npy',
 'sparse_adj_relation': '/datasets/mind_kg/relation_adj.npy'}

In [10]:
# Run settings for this notebook session.
train_type = "single_task"
task = "user2item" # task should be within: user2item, item2item, vert_classify, pop_predict
epochs = 5
batch_size = 64

# Write the settings into the loaded configuration, one (section, key, value)
# entry at a time, in the order they should be applied.
for _section, _key, _value in (
    ('trainer', 'epochs', epochs),
    ('data_loader', 'batch_size', batch_size),
    ('trainer', 'training_type', train_type),
    ('trainer', 'task', task),
):
    config[_section][_key] = _value

# Echo the knowledge-graph path as the cell output.
config['data']['knowledge_graph']

'/datasets/mind_kg/triple2id.txt'

In [11]:
# Entity and relation adjacency data, produced from `config` by a helper that
# comes in through the star imports; both results are stored in `data_dict` later.
entity_adj, relation_adj = construct_adj_mind(config)

In [12]:
# News features plus three maxima (entity frequency / position / type, going by
# the variable names); the call prints "Feature encoding done" when it finishes.
news_feature, max_entity_freq, max_entity_pos, max_entity_type = build_news_features_mind(config)

Feature encoding done


In [20]:
# Per-user history derived from `config` (presumably read from the behaviors
# files listed in config['data'] -- confirm in the helper).
user_history = build_user_history(config)

In [21]:
# Entity and relation embeddings. The helper's name really is spelled
# "optmized" in the project code, so the call must keep that spelling.
entity_embedding, relation_embedding = construct_embedding_mind_optmized(config)

constructing embedding ... but optimized...


In [22]:
# Training and development sets for the user2item task.
train_data, dev_data = get_user2item_data(config)

In [23]:
# Train/test data for the vert_classify task (presumably news-category
# classification); the call prints the category-to-index mapping it uses.
vert_train, vert_test = build_vert_data(config)

{'lifestyle': 0, 'health': 1, 'news': 2, 'sports': 3, 'weather': 4, 'entertainment': 5, 'autos': 6, 'travel': 7, 'foodanddrink': 8, 'tv': 9, 'finance': 10, 'movies': 11, 'video': 12, 'music': 13, 'kids': 14, 'middleeast': 15, 'northamerica': 16}


In [24]:
# Train/test data, presumably for the pop_predict task named in the settings cell.
pop_train, pop_test = build_pop_data(config)

In [25]:
# Train/test data for the item2item task.
item2item_train, item2item_test = build_item2item_data(config)

In [27]:
# Collect every prepared input under a descriptive key so the whole bundle can
# be saved and restored as a single object. Key order follows the listing below.
data_dict = dict(
    user_history=user_history,
    entity_embedding=entity_embedding,
    relation_embedding=relation_embedding,
    entity_adj=entity_adj,
    relation_adj=relation_adj,
    news_feature=news_feature,
    max_entity_freq=max_entity_freq,
    max_entity_pos=max_entity_pos,
    max_entity_type=max_entity_type,
    train_data=train_data,
    dev_data=dev_data,
    vert_train=vert_train,
    vert_test=vert_test,
    pop_train=pop_train,
    pop_test=pop_test,
    item2item_train=item2item_train,
    item2item_test=item2item_test,
)


In [None]:
import gzip
import pickle

def save_compressed_pickle(obj, filename):
    """Pickle ``obj`` and write it, gzip-compressed, to ``filename``.

    Args:
        obj: Any picklable object.
        filename: Destination path; an existing file is overwritten.
    """
    gz_stream = gzip.open(filename, mode='wb')
    try:
        pickle.dump(obj, gz_stream)
    finally:
        # Always close so the gzip trailer is flushed to disk.
        gz_stream.close()
# Persist the assembled inputs so a later session can skip the preparation cells.
# The helper takes (object, path) in that order; the two were swapped before,
# which passed the dictionary to gzip.open as the file name. The file name is
# the one the reload cell further down opens.
save_compressed_pickle(data_dict, 'data_dict_compressed.pickle')

In [None]:
# Progress marker: printed once the preceding cells have run.
print('done')

In [None]:
def load_compressed_pickle(filename):
    """Read a gzip-compressed pickle from ``filename`` and return the object.

    Only use this on files you trust: unpickling can execute arbitrary code.

    Args:
        filename: Path of the gzip-compressed pickle file.

    Returns:
        The object stored in the file.
    """
    with gzip.open(filename, mode='rb') as gz_stream:
        return pickle.load(gz_stream)
# Reload the bundle from disk. The file name here has to match the one passed
# to save_compressed_pickle in the save cell.
restored_data = load_compressed_pickle('data_dict_compressed.pickle')

In [None]:
# Show the keys of the reloaded object as a quick check of what was restored.
restored_data.keys()

In [None]:
# Must adapt dict to do the task
# NOTE(review): `data` is not assigned in any visible cell of this notebook
# (the prepared inputs live in `data_dict` / `restored_data`), so unless one of
# the star imports happens to export it, this cell raises NameError on a fresh
# kernel. It has to be built in whatever form the training functions expect.
# Dispatch on the training mode chosen in the settings cell.
if train_type == "single_task":
    single_task_training(config, data)
else:
    multi_task_training(config, data)

In [None]:
# Evaluate using the last element of `data`.
# NOTE(review): `data` is not assigned in any visible cell, and `data[-1]`
# implies an ordered sequence rather than the keyed `data_dict` -- confirm
# which element is meant to be the test set before running this cell.
test_data = data[-1]
testing(test_data, config)