In [1]:
import argparse
from typing import Dict
import logging
import torch
from torch import optim
import pickle
from datasets import TemporalDataset
from optimizers import TKBCOptimizer, IKBCOptimizer
from models import ComplEx, TComplEx, TNTComplEx
from regularizers import N3, Lambda3

import os
# Pin GPU selection for this kernel via environment variables.
# NOTE(review): these presumably need to be set before the first CUDA call
# in the process to take effect — confirm against the CUDA documentation.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # see issue #152
    "CUDA_VISIBLE_DEVICES": "0",
})


In [2]:
# Copyright (c) Facebook, Inc. and its affiliates.
# create model
# this resets the model

import argparse
from typing import Dict
import logging
import torch
from torch import optim

from datasets import TemporalDataset
from optimizers import TKBCOptimizer, IKBCOptimizer
from models import ComplEx, TComplEx, TNTComplEx
from regularizers import N3, Lambda3

# Name of the dataset directory; reused when building file paths in later cells.
DATASET_NAME = 'MultiTQ'


class Args:
    """Plain namespace of hyper-parameters read by the cells below.

    Values are class attributes, so they are available both on ``Args`` and
    on the ``args`` instance created right after the class.
    """

    # --- data / model selection ---
    dataset: str = '../../data/'+DATASET_NAME+'/kg/tkbc_processed_data'
    model: str = 'TComplEx'      # key used to pick the model class
    no_time_emb: bool = False    # forwarded to the temporal model constructors

    # --- schedule ---
    max_epochs: int = 200
    valid_freq: int = 20

    # --- optimisation ---
    rank: int = 256
    batch_size: int = 1024
    learning_rate: float = 0.1

    # --- regularisation weights ---
    emb_reg: float = 0.001
    time_reg: float = 0.001


args = Args()

# Load the preprocessed temporal knowledge graph and read its dimensions.
dataset = TemporalDataset(args.dataset)
sizes = dataset.get_shape()

# Map each supported model name to a zero-argument factory so that only the
# selected model is instantiated. (Building all three up front, as a dict of
# instances would, allocates two embedding tables that are thrown away.)
model_factories = {
    'ComplEx': lambda: ComplEx(sizes, args.rank),
    'TComplEx': lambda: TComplEx(sizes, args.rank, no_time_emb=args.no_time_emb),
    'TNTComplEx': lambda: TNTComplEx(sizes, args.rank, no_time_emb=args.no_time_emb),
}
# An unknown args.model raises KeyError here, exactly as a plain dict lookup would.
model = model_factories[args.model]()
model = model.cuda()

# Adagrad over every model parameter.
opt = optim.Adagrad(model.parameters(), lr=args.learning_rate)

# Regularisers handed to the KBC optimizers in the training cell.
emb_reg = N3(args.emb_reg)
time_reg = Lambda3(args.time_reg)

Assume all timestamps are regularly spaced
Not using time intervals and events eval


In [3]:
# Re-create the dataset object from args.dataset.
# NOTE(review): the model-creation cell already builds `dataset` from the same
# path, so this cell looks redundant — confirm before removing.
dataset = TemporalDataset(args.dataset)

Assume all timestamps are regularly spaced
Not using time intervals and events eval


In [6]:
# training
# Schedule used by this cell. NOTE(review): Args.max_epochs / Args.valid_freq
# are defined above but this cell has always used its own values (10 and 5);
# they are kept as-is so the recorded outputs stay reproducible.
NUM_EPOCHS = 10
EVAL_EVERY = 5
# Third argument passed to dataset.eval for the 'train' split (the other
# splits get -1). Presumably a cap on the number of evaluated queries, with
# -1 meaning "all" — TODO confirm against TemporalDataset.eval.
TRAIN_EVAL_LIMIT = 50000
EVAL_SPLITS = ['valid', 'test', 'train']


def avg_both(mrrs: Dict[str, float], hits: Dict[str, torch.FloatTensor]):
    """
    Aggregate metrics for missing lhs and rhs.

    :param mrrs: mapping with 'lhs' and 'rhs' MRR values
    :param hits: mapping with 'lhs' and 'rhs' hits values
    :return: dict with the mean of both sides under 'MRR' and 'hits@[1,3,10]'
    """
    m = (mrrs['lhs'] + mrrs['rhs']) / 2.
    h = (hits['lhs'] + hits['rhs']) / 2.
    return {'MRR': m, 'hits@[1,3,10]': h}


for epoch in range(NUM_EPOCHS):
    examples = torch.from_numpy(
        dataset.get_train().astype('int64')
    )

    model.train()
    # Interval datasets use the interval-aware optimizer, which also needs the dataset.
    if dataset.has_intervals():
        optimizer = IKBCOptimizer(
            model, emb_reg, time_reg, opt, dataset,
            batch_size=args.batch_size
        )
    else:
        optimizer = TKBCOptimizer(
            model, emb_reg, time_reg, opt,
            batch_size=args.batch_size
        )
    optimizer.epoch(examples)

    # Evaluate only every EVAL_EVERY-th epoch.
    if (epoch + 1) % EVAL_EVERY != 0:
        continue

    if dataset.has_intervals():
        # Interval datasets: print whatever dataset.eval returns, unmodified.
        valid, test, train = [
            dataset.eval(model, split, -1 if split != 'train' else TRAIN_EVAL_LIMIT)
            for split in EVAL_SPLITS
        ]
        print("valid: ", valid)
        print("test: ", test)
        print("train: ", train)
    else:
        # dataset.eval's result is unpacked into (mrrs, hits) and averaged over both sides.
        valid, test, train = [
            avg_both(*dataset.eval(model, split, -1 if split != 'train' else TRAIN_EVAL_LIMIT))
            for split in EVAL_SPLITS
        ]
        print("valid: ", valid['MRR'])
        print("test: ", test['MRR'])
        print("train: ", train['MRR'])


train loss: 100%|██████████| 737924/737924 [00:08<00:00, 88123.33ex/s, cont=0, loss=4, reg=0]
train loss: 100%|██████████| 737924/737924 [00:09<00:00, 79081.52ex/s, cont=0, loss=4, reg=0]
train loss: 100%|██████████| 737924/737924 [00:08<00:00, 82656.49ex/s, cont=0, loss=4, reg=0]
train loss: 100%|██████████| 737924/737924 [00:11<00:00, 62366.24ex/s, cont=0, loss=4, reg=0] 
train loss: 100%|██████████| 737924/737924 [00:19<00:00, 37978.25ex/s, cont=0, loss=3, reg=0]


valid:  0.5059451311826706
test:  0.5054048001766205
train:  0.9835851788520813


train loss: 100%|██████████| 737924/737924 [00:05<00:00, 133943.13ex/s, cont=0, loss=4, reg=0]
train loss: 100%|██████████| 737924/737924 [00:05<00:00, 135228.60ex/s, cont=0, loss=4, reg=0]
train loss: 100%|██████████| 737924/737924 [00:05<00:00, 131247.21ex/s, cont=0, loss=3, reg=0]
train loss: 100%|██████████| 737924/737924 [00:05<00:00, 143963.70ex/s, cont=0, loss=3, reg=0]
train loss: 100%|██████████| 737924/737924 [00:05<00:00, 140531.79ex/s, cont=0, loss=3, reg=0]


valid:  0.5010942667722702
test:  0.5007961839437485
train:  0.9892488420009613


In [5]:
# save model
# Writes only the model's state dict (not the optimizer) to a fixed relative path.
# NOTE(review): the path is relative to the kernel's working directory and the
# target directory must already exist — confirm before running.
path = '../../Baseline仓库/code/models/kg_embeddings/tcomplex_new.ckpt'
torch.save(model.state_dict(), path)

In [13]:
# save model
# Writes the model's state dict under a directory named after DATASET_NAME.
# NOTE(review): this rebinds the global `path` used by the other save/load
# cells, and the target directory must already exist — confirm before running.
path = '../models/'+DATASET_NAME+'/kg_embeddings/enhanced_kg_with_time.ckpt'
torch.save(model.state_dict(), path)

In [3]:
# Load a state dict from disk into the existing `model` object.
# NOTE(review): this checkpoint path is different from both paths written by
# the save cells in this notebook — confirm which checkpoint is intended.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a
# trusted source.
path = 'tkbc_model/'+DATASET_NAME+'/kg_embeddings/icews_enhanced_v.ckpt'
x = torch.load(path)
# Last expression of the cell: its return value is what the notebook displays.
model.load_state_dict(x)

<All keys matched successfully>

In [10]:
def predictTime(question, model, all_dicts, k=1):
    """Return the ``k`` timestamps the model scores highest for a question.

    :param question: mapping with 'entities' (at least two entity names; the
        first is used as head, the second as tail) and 'relations' (a single
        relation name).
    :param model: network exposing ``parameters()`` and
        ``forward_over_time(batch)``; the latter must return one row of
        scores per query.
    :param all_dicts: mapping providing the 'ent2id', 'rel2id' and 'id2ts'
        lookup tables.
    :param k: number of top-scoring timestamps to keep.
    :return: set of timestamps (values of ``id2ts``) for the top-``k`` scores.
    :raises KeyError: if an entity or relation is missing from the lookups.
    """
    ent2id = all_dicts['ent2id']
    rel2id = all_dicts['rel2id']
    id2ts = all_dicts['id2ts']

    entities = list(question['entities'])
    head = ent2id[entities[0]]
    tail = ent2id[entities[1]]
    relation = rel2id[question['relations']]

    # The time slot is filled with the constant 1 because the timestamp is
    # what is being predicted. NOTE(review): presumably ignored by
    # forward_over_time — confirm against the model implementation.
    data_point = [head, relation, tail, 1]

    # Build the batch directly on whatever device the model lives on, rather
    # than assuming CUDA; torch.tensor also avoids the numpy round-trip.
    device = next(model.parameters()).device
    data_batch = torch.tensor([data_point], dtype=torch.long, device=device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        time_scores = model.forward_over_time(data_batch)

    _, top_indices = torch.topk(time_scores, k, dim=1)
    return {id2ts[idx] for idx in top_indices.flatten().tolist()}


def predictTail(question, model, all_dicts, k=1):
    """Return the ``k`` entities the model scores highest as the missing tail.

    :param question: mapping with 'entities' (the first entity name is used
        as head), 'time' (a sequence whose first element is the timestamp)
        and 'relations' (a single relation name).
    :param model: network exposing ``parameters()`` and ``forward(batch)``;
        ``forward`` must return a 3-tuple whose first item holds one row of
        scores per query.
    :param all_dicts: mapping providing the 'ent2id', 'rel2id', 'ts2id' and
        'id2ent' lookup tables.
    :param k: number of top-scoring entities to keep.
    :return: set of entities (values of ``id2ent``) for the top-``k`` scores.
    :raises KeyError: if the entity, relation or timestamp is missing from
        the lookups.
    """
    ent2id = all_dicts['ent2id']
    rel2id = all_dicts['rel2id']
    ts2id = all_dicts['ts2id']
    id2ent = all_dicts['id2ent']

    entities = list(question['entities'])
    head = ent2id[entities[0]]
    time_id = ts2id[question['time'][0]]
    relation = rel2id[question['relations']]

    # The tail slot is filled with the constant 1 because the tail is what is
    # being predicted. NOTE(review): presumably it does not influence the
    # returned scores — confirm against the model implementation.
    data_point = [head, relation, 1, time_id]

    # Build the batch directly on whatever device the model lives on, rather
    # than assuming CUDA; torch.tensor also avoids the numpy round-trip.
    device = next(model.parameters()).device
    data_batch = torch.tensor([data_point], dtype=torch.long, device=device)

    # Pure inference: no autograd graph is needed. Only the first element of
    # the returned triple (the scores) is used here.
    with torch.no_grad():
        predictions, _, _ = model.forward(data_batch)

    _, top_indices = torch.topk(predictions, k, dim=1)
    return {id2ent[idx] for idx in top_indices.flatten().tolist()}


def checkIfTkbcEmbeddingsTrained(tkbc_model, split='test', question_type='equal', k=1, all_dicts=None):
    """Print hits@k of the embedding model on day-level questions of one type.

    Reads ``../data/MultiTQ/questions/full_data/<split>.json``, keeps the
    questions whose 'qtype' equals ``question_type`` and whose 'time_level'
    is 'day', answers entity questions with ``predictTail`` and time
    questions with ``predictTime``, and counts a hit when the prediction set
    intersects the question's 'answers'.

    :param tkbc_model: model forwarded to ``predictTail`` / ``predictTime``.
    :param split: file stem of the question file to read.
    :param question_type: value of 'qtype' to evaluate (default 'equal').
    :param k: number of predictions kept per question (hits@k).
    :param all_dicts: lookup tables forwarded to the predictors; when omitted
        the module-level ``all_dicts`` is used, as before.
    :return: None. Prints ``question_type, correct, total, correct/total``;
        the ratio is reported as 0.0 when no question matched instead of
        raising ZeroDivisionError.
    :raises NameError: if ``all_dicts`` is neither passed nor defined globally.
    """
    # Local import: the notebook never imports json at the top level.
    import json

    if all_dicts is None:
        try:
            all_dicts = globals()['all_dicts']
        except KeyError:
            raise NameError("name 'all_dicts' is not defined") from None

    with open('../data/MultiTQ/questions/full_data/'+split+'.json', encoding='utf-8') as f:
        questions = json.load(f)

    # Which predictor answers which kind of question.
    predictors = {'entity': predictTail, 'time': predictTime}

    correct_count = 0
    total_count = 0
    # NOTE(review): `tqdm` is not imported in any visible cell; it must be in
    # scope (e.g. `from tqdm import tqdm`) for this loop to run.
    for question in tqdm(questions):
        if question['qtype'] != question_type:
            continue
        predict = predictors.get(question['answer_type'])
        if predict is None or question['time_level'] != 'day':
            continue
        total_count += 1
        predicted = predict(question, tkbc_model, all_dicts, k)
        if set(question['answers']).intersection(predicted):
            correct_count += 1

    accuracy = correct_count / total_count if total_count else 0.0
    print(question_type, correct_count, total_count, accuracy)


In [11]:
# Build the lookup dictionaries and run the embedding sanity check on the test questions.
# NOTE(review): `getAllDicts` is neither defined nor imported in this notebook;
# the recorded run of this cell fails with NameError.
# NOTE(review): the `Args` class in this notebook defines no `dataset_name` or
# `kg_dir` attribute, and `tkbc_model` is not assigned in any visible cell (the
# trained network is bound to `model`) — confirm the intended names.
all_dicts = getAllDicts(args.dataset_name,args.kg_dir)
checkIfTkbcEmbeddingsTrained(tkbc_model, split='test')

NameError: name 'getAllDicts' is not defined