<a href="https://colab.research.google.com/github/bayrameda/MrAP/blob/main/MrAP_YAGO15K.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch-scatter==2.0.4

Collecting torch-scatter==2.0.4
  Downloading https://files.pythonhosted.org/packages/98/a9/47cd92673b6ba251240d587815c763baac2099b07bb76fecdb3b7ae5cece/torch_scatter-2.0.4.tar.gz
Building wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l[?25hdone
  Created wheel for torch-scatter: filename=torch_scatter-2.0.4-cp36-cp36m-linux_x86_64.whl size=11357778 sha256=d00eebb50752dc77154029c3efc2917e366a04ef1bd63aaf9d0b52230c6c7968
  Stored in directory: /root/.cache/pip/wheels/fb/28/28/458ddcee4849d5f8a14dd1be1e957d2e8b2955e8c96b07a12d
Successfully built torch-scatter
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.0.4


In [1]:
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


# Data Read

In [3]:
ent_100 = pd.read_csv('../KGs/YAGO15k/literals/train.txt', sep='\t', header=None)
ent_dev = pd.read_csv('../KGs/YAGO15k/literals/valid.txt', sep='\t',header=None,)
ent_test = pd.read_csv('../KGs/YAGO15k/literals/test.txt' , sep='\t', header=None)

entities = pd.concat([ent_100, ent_dev, ent_test], ignore_index=True)
entities = entities.set_axis(['node', 'attribute', 'numeric'], axis=1)

# triples = pd.read_csv('data/triples')

# entities = pd.concat([ent_100, ent_dev, ent_test], ignore_index=True)
# entities = entities.set_axis(['node', 'attribute', 'numeric'], axis=1)

triples = pd.read_csv('../KGs/YAGO15k/YAGO15k_EntityTriples.txt', sep='\t', header=None, names= ['node_1', 'relation', 'node_2'])
triples = triples[triples['node_1'] != triples['node_2']]

In [4]:
duplicates = entities[entities.duplicated(subset=['node', 'attribute', 'numeric'], keep=False)]
duplicates

Unnamed: 0,node,attribute,numeric


### Extract the multi-relational edge list

In [5]:
from utils import extract_edges_YAGO, estimate_params, drop_sym, reduce_to_singles, performance

In [6]:
dates = ['wasBornOnDate','wasCreatedOnDate','wasDestroyedOnDate', 'diedOnDate','happenedOnDate']
corr_attributes = [dates,['hasLatitude'],['hasLongitude']]

In [8]:
edge_list = []
relations = []
for atts in corr_attributes:
  entities_ofint = entities[entities.attribute.isin(atts)]
  edge_ofint, rel_ofint = extract_edges_YAGO(triples, entities_ofint)
  edge_list = edge_list + edge_ofint
  relations = relations + rel_ofint

## Stats

In [9]:
entity_stat = pd.DataFrame(columns=['count','min', 'mean', 'max'])
attributes = entities['attribute'].unique().tolist()
for key in attributes:
  numerics = entities[entities['attribute']==key]['numeric']
  entity_stat.loc[key] = [len(numerics), numerics.min(),numerics.mean(),numerics.max()]
entity_stat

Unnamed: 0,count,min,mean,max
wasBornOnDate,8218.0,354.1113,1952.458203,2014.073
wasCreatedOnDate,6588.0,100.0,1904.161563,2018.0914
hasLatitude,2989.0,-51.683333,37.502377,73.0
hasLongitude,2989.0,-175.0,-39.150271,179.0
diedOnDate,1822.0,348.0,1961.742226,2161.101
happenedOnDate,388.0,218.0,1944.13116,2018.071
wasDestroyedOnDate,538.0,476.0,1966.944244,2017.0416


### Multi-relational graph stats

In [10]:
print('Number of attribute types = ', entities['attribute'].nunique())
N = len(entities)
print('Total number of numerical facts = ', N)
print('Total number of entities = ',entities.node.nunique())
print('Total number of triple facts = ', len(triples))
print('Number of relation types = ',triples.relation.nunique())
print('Number of regression models = ', len(relations))
asym_edge_list = drop_sym(edge_list)
print('Number of message passing paths =', len(np.concatenate(asym_edge_list)))

Number of attribute types =  7
Total number of numerical facts =  23532
Total number of entities =  15081
Total number of triple facts =  138056
Number of relation types =  32
Number of regression models =  261
Number of message passing paths = 186650


In [11]:
attribute_labels = entities.attribute.values # corresponding entity labels of the nodes
x = entities.numeric.values.copy() # ordering of entities is compatible with the node ordering in triples

# Performances

In [12]:
def indices(entities_interest): #get the indicies of the entities of interest wrt x ordering
  return [entities[(entities.node == row[0]) & (entities.attribute == row[1])].index.item() for ind, row in entities_interest.iterrows()]

splits = ['100']
idx_train = [indices(item) for item in [ent_100]] #train = known, dev+test=unknown

def comp_u(idx):
  u = np.zeros(N, dtype=bool)
  u[idx] = 1
  return u

u_0_list = [comp_u(idx) for idx in idx_train]

idx_test = indices(ent_test)
idx_test_atts = [[item for item in idx_test if item in np.where(attribute_labels == att)[0]] for att in attributes]

Ratio known dates

In [13]:
print('Train=known, ratios=', [len(item)/N for item in idx_train])

Train=known, ratios= [0.7999745028046915]


In [14]:
def get_performance(x_pred, u_0):
  tups = [performance(x_pred, x[idx], u_0, idx) for idx in idx_test_atts] # performance on each attribute type
  return [item for tupl in tups for item in tupl] #merging all

## Algorithms

In [15]:
from MrAP import MrAP
from algs import Global, Local, iter_MrAP

A variation : Reduction of the edges to the ones among single type of attribute \\
i.e., reduce the regression of one attribute from the same type of another

In [16]:
edge_list_singles, relations_singles, attribute_coupled = reduce_to_singles(edge_list, attribute_labels)
asym_edge_list_singles = drop_sym(edge_list_singles)

In [17]:
taus, omegas, _, _ = estimate_params(edge_list, x)
tau_singles = taus[relations_singles]
omega_singles = omegas[relations_singles]

In [18]:
model = MrAP(device=device, edge_list=asym_edge_list, omega=omegas, tau=taus)
model_singles = MrAP(device=device, edge_list=asym_edge_list_singles, omega=omega_singles, tau=tau_singles)

In [19]:
table_result = pd.DataFrame(columns=['born-RMSE', 'born-MAE', 'Created-RMSE', 'Created-MAE', 'Destroyed-RMSE', 'Destroyed-MAE','died-RMSE', 'died-MAE',
                                     'Long-RMSE', 'Long-MAE', 'Lat-RMSE', 'Lat-MAE', 'happened-RMSE', 'happened-MAE' ])

kk=0
for u_00 in u_0_list:
  u_0 = torch.tensor(u_00, device=device)
  x_0 = torch.tensor(x, device=device)
  x_0[u_0 == 0] = 0 # Zero-padding of unknown

  x_pred = Global(x_0, u_0, attribute_labels)
  table_result.loc['Global-' + splits[kk]] = get_performance(x_pred, u_0)

  x_pred = Local(asym_edge_list, x_0, u_0, attribute_labels)
  table_result.loc['Local-' + splits[kk]] = get_performance(x_pred, u_0)

  x_pred = iter_MrAP(x_0, u_0, model_singles, xi=0.5, entity_labels=attribute_labels)
  table_result.loc['MrAP_single-' + splits[kk]]  = get_performance(x_pred, u_0)

  x_pred = iter_MrAP(x_0, u_0, model, xi=0.5, entity_labels=attribute_labels)
  table_result.loc['MrAP_cross-' + splits[kk]]  = get_performance(x_pred, u_0)

  kk = kk+1

In [20]:
table_result

Unnamed: 0,born-RMSE,born-MAE,Created-RMSE,Created-MAE,Destroyed-RMSE,Destroyed-MAE,died-RMSE,died-MAE,Long-RMSE,Long-MAE,Lat-RMSE,Lat-MAE,happened-RMSE,happened-MAE
Global-100,49.563165,26.494826,156.338394,91.565368,14.514309,9.169515,71.784233,60.466441,171.630996,58.508044,76.16455,54.443786,56.516399,42.63854
Local-100,37.729378,24.088106,218.099283,146.536987,5.717964,2.590733,25.616566,9.468187,132.307627,54.703934,76.16455,54.443786,59.052033,43.582202
MrAP_single-100,36.474859,24.015162,185.560353,100.893683,4.590147,1.964421,17.379443,5.913326,122.858593,55.54131,76.16455,54.443786,60.018016,44.393936
MrAP_cross-100,30.934117,17.738172,145.037056,67.856857,4.591997,1.96518,17.380309,5.914758,99.037263,36.085294,130.613365,62.409168,84.10642,39.7238


###  Another variation: Dropping inner edges
i.e., regression over the KG neighbors, no inner loss

In [None]:
relations_inner = []
for ii in range(len(relations)):
  if relations[ii].rsplit('_',1)[-1] == 'coupling':
    relations_inner.append(ii)

In [None]:
edge_list_wo_inner = list(np.delete(np.array(edge_list), relations_inner))
relations_wo_inner = list(np.delete(np.array(relations), relations_inner))
asym_edge_list_wo_inner = list(np.delete(np.array(asym_edge_list), relations_inner))
taus_wo_inner = np.delete(taus, relations_inner)
omegas_wo_inner = np.delete(omegas, relations_inner)

In [None]:
model_wo_inner =  MrAP(device=device, edge_list=asym_edge_list_wo_inner, omega=omegas_wo_inner, tau=taus_wo_inner)

In [None]:
table_result = pd.DataFrame(columns=['born-RMSE', 'born-MAE', 'Created-RMSE', 'Created-MAE', 'Destroyed-RMSE', 'Destroyed-MAE','died-RMSE', 'died-MAE',
                                     'Long-RMSE', 'Long-MAE', 'Lat-RMSE', 'Lat-MAE', 'happened-RMSE', 'happened-MAE' ])

kk=0
for u_00 in u_0_list:
  u_0 = torch.tensor(u_00, device=device)
  x_0 = torch.tensor(x, device=device)
  x_0[u_0 == 0] = 0 # Zero-padding of unknown

  x_pred = iter_MrAP(x_0, u_0, model, xi=0.5, entity_labels=attribute_labels)
  table_result.loc['MrAP-' + splits[kk]]  = get_performance(x_pred, u_0)

  x_pred = iter_MrAP(x_0, u_0, model_wo_inner, xi=0.5, entity_labels=attribute_labels)
  table_result.loc['MrAPwoInner-' + splits[kk]]  = get_performance(x_pred, u_0)

  kk = kk+1

table_result

Unnamed: 0,born-RMSE,born-MAE,Created-RMSE,Created-MAE,Destroyed-RMSE,Destroyed-MAE,died-RMSE,died-MAE,Long-RMSE,Long-MAE,Lat-RMSE,Lat-MAE,happened-RMSE,happened-MAE
MrAP-100,31.479919,19.74391,149.636901,70.444597,62.005726,34.614763,84.242613,33.961682,17.142654,5.704237,7.929983,2.765666,73.816374,54.138783
MrAPwoInner-100,68.780741,24.209732,145.912957,70.791935,67.979523,42.434878,105.015414,49.565534,17.142239,5.701881,7.929909,2.766549,99.392834,62.778376
MrAP-80,62.362344,21.669413,149.215673,70.534499,54.406482,33.240274,94.271882,37.266638,17.493987,6.396199,8.090711,3.088407,71.500651,50.77544
MrAPwoInner-80,67.497754,24.0165,145.131702,70.164571,63.626865,37.550853,106.943938,48.179037,17.493987,6.396199,8.090711,3.088407,97.245183,59.19131
MrAP-50,61.925559,21.100572,135.278577,65.838745,45.931085,28.112485,84.441823,35.003534,18.016979,7.373341,8.578351,3.703517,95.570496,54.027444
MrAPwoInner-50,65.79662,22.776493,135.236811,65.938374,52.945976,30.373739,93.453452,42.711189,18.016979,7.373341,8.578351,3.703517,93.269154,54.212561
MrAP-20,61.856513,20.061757,132.552194,66.109362,60.083823,37.627866,93.348505,37.595172,22.583517,11.950388,10.441993,5.400599,97.092485,58.876741
MrAPwoInner-20,63.50788,21.159391,132.528702,66.744273,64.46508,42.487379,90.689122,37.873996,23.548196,12.054674,9.889951,5.119563,94.310245,58.75268
