In [1]:
import torch
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import numpy as np
import pandas as pd

from tqdm import tqdm
import json

In [2]:
with open(r'data/info.txt', 'r') as info_file:
    NEG_SAMPLE_SIZE = json.load(info_file)['NEG_SAMPLE_SIZE']
BATCH_SIZE_TEST = 1024
NUM_WORKERS = 10

#### Data Loader

In [3]:
train = pd.read_csv (r'data/train.csv')
test = pd.read_csv (r'data/test.csv')
valid = pd.read_csv (r'data/valid.csv')

for i in [train, valid, test]:
    i['neg_head'] = [eval(l) for l in i['neg_head']]
    i['neg_tail'] = [eval(l) for l in i['neg_tail']]

nentity = len(pd.concat([train['head'], valid['head'], test['head']]).unique()) + len(pd.concat([train['tail'], valid['tail'], test['tail']]).unique())
nrelation = len(pd.concat([train['relation'], valid['relation'], test['relation']]).unique())

print(f'NUMBER OF ENTITY: {nentity}')
print(f'NUMBER OF RELETION: {nrelation}')
print()
print(f'TRAIN: {round(len(train)/(len(train)+len(test)+len(valid))*100)}%')
print(f'VALID: {round(len(valid)/(len(train)+len(test)+len(valid))*100)}%')
print(f'TEST: {round(len(test)/(len(train)+len(test)+len(valid))*100)}%')

NUMBER OF ENTITY: 20720
NUMBER OF RELETION: 8

TRAIN: 58%
VALID: 17%
TEST: 25%


In [4]:
class KGEDataset(Dataset):

  def __init__(self,table,mode='train'):

    self.mode = mode
    self.head = torch.tensor(np.array(table['head']))
    self.tail = torch.tensor(np.array(table['tail']))
    self.relation = torch.tensor(np.array(table['relation']))
    self.neg_head = torch.tensor(np.array(list(table['neg_head'])))
    self.neg_tail = torch.tensor(np.array(list(table['neg_tail'])))

    if mode=='train':
      self.subsampling_weight = torch.tensor(np.array(table['subsampling_weight']))

  def __len__(self):
    return len(self.head)
  
  def __getitem__(self,idx):
    if self.mode == 'train':
      return self.head[idx],self.tail[idx],self.relation[idx], self.neg_head[idx],self.neg_tail[idx], self.subsampling_weight[idx]
    else:
      return self.head[idx],self.tail[idx],self.relation[idx], self.neg_head[idx],self.neg_tail[idx]

#### Metrics

In [5]:
class Evaluator:
    def eval(self, input_dict):
        y_pred_pos, y_pred_neg = input_dict['y_pred_pos'], input_dict['y_pred_neg']
        y_pred = torch.cat([y_pred_pos.view(-1,1), y_pred_neg], dim = 1)
        argsort = torch.argsort(y_pred, dim = 1, descending = True)
        ranking_list = torch.nonzero(argsort == 0, as_tuple=False)
        ranking_list = ranking_list[:, 1] + 1
        hits1_list = (ranking_list <= 1).to(torch.float)
        hits3_list = (ranking_list <= 3).to(torch.float)
        hits10_list = (ranking_list <= 10).to(torch.float)
        mrr_list = 1./ranking_list.to(torch.float)

        return mrr_list, hits1_list, hits3_list, hits10_list

#### Baseline Random Score

In [32]:
evaluator = Evaluator()

metrics = []
test_loader=DataLoader(KGEDataset(test,mode='test'),batch_size=BATCH_SIZE_TEST,shuffle=False,num_workers=NUM_WORKERS)
for test_values in test_loader:
        positive_score = torch.randint(1, 100, (BATCH_SIZE_TEST,1))
        negative_score = torch.randint(1, 100, (BATCH_SIZE_TEST,2*NEG_SAMPLE_SIZE))

        metrics.append([metric.mean() for metric in evaluator.eval({'y_pred_pos': positive_score, 'y_pred_neg': negative_score})])

metrics = torch.tensor(metrics).mean(0)
dict_metrics_rand = {'NEG_SAMPLE_SIZE': NEG_SAMPLE_SIZE,
                'MRR': float(metrics[0]),
                'HITS@1': float(metrics[1]),
                'HITS@3': float(metrics[2]),
                'HITS@10': float(metrics[3])}

with open('baseline//metrics_random.txt', 'w') as var_file:
            json.dump(dict_metrics_rand, var_file)

print(dict_metrics_rand)

{'NEG_SAMPLE_SIZE': 10, 'MRR': 0.17497257888317108, 'HITS@1': 0.0495002381503582, 'HITS@3': 0.1441274732351303, 'HITS@10': 0.4762464463710785}


#### Baseline Frequency

In [31]:
evaluator = Evaluator()
metrics = []
freq = pd.concat([train['head'], valid['head'], test['head'], train['tail'], valid['tail'], test['tail']]).value_counts()

test_loader=DataLoader(KGEDataset(test,mode='test'),batch_size=BATCH_SIZE_TEST,shuffle=True,num_workers=NUM_WORKERS)
for test_values in tqdm(test_loader):
        
        head,tail,relation,neg_head,neg_tail = test_values

        positive_score = torch.tensor(list(freq[list(tail)]))
        negative_tail_score = torch.tensor(list(freq[list(neg_head.view(-1))])).view(-1,NEG_SAMPLE_SIZE)
        negative_head_score = torch.tensor(list(freq[list(neg_tail.view(-1))])).view(-1,NEG_SAMPLE_SIZE)

        metrics.append([metric.mean() for metric in evaluator.eval({'y_pred_pos': positive_score, 'y_pred_neg': torch.cat((negative_tail_score,negative_head_score), dim=1)})])

metrics = torch.tensor(metrics).mean(0)
dict_metrics_freq = {'NEG_SAMPLE_SIZE': NEG_SAMPLE_SIZE,
                'MRR': float(metrics[0]),
                'HITS@1': float(metrics[1]),
                'HITS@3': float(metrics[2]),
                'HITS@10': float(metrics[3])}

with open('baseline//metrics_frequency.txt', 'w') as var_file:
            json.dump(dict_metrics_freq, var_file)

print(dict_metrics_freq)

100%|██████████| 757/757 [01:52<00:00,  6.72it/s]

{'NEG_SAMPLE_SIZE': 10, 'MRR': 0.3209775984287262, 'HITS@1': 0.21609827876091003, 'HITS@3': 0.31698736548423767, 'HITS@10': 0.4308541417121887}



