In [42]:
import os
import re
import json
import torch
import collections
import numpy as np
from tqdm import tqdm

In [51]:
def is_template_valid(template):
    """Checks that the template has one [X], one [Y], and extra text."""
    return (template.count("[X]") == 1 and
            template.count("[Y]") == 1 and
            not re.match(r'^[\b\[X\]\b\b\[Y\]\b., ]+$', template))

mpararel_folder = "../generated_datasets/mpararel_clean"
relations = [x.replace(".jsonl", "") for x in os.listdir(os.path.join(
    mpararel_folder, "en/triples"))]
language = "es"
K_POPULATE_PATTERN = 10

phrases_per_relation = {}
for relation in relations:
    sub_obj = []
    with open(os.path.join(mpararel_folder, language, "triples", relation + ".jsonl")) as f_triples:
        for line in f_triples:
            data = json.loads(line)
            if np.random.rand() > 0.6:
                sub_obj.append((data["sub_label"], data["obj_label"]))
            if len(sub_obj) == K_POPULATE_PATTERN:
                break
    with open(os.path.join(mpararel_folder, language, "patterns", relation + ".jsonl")) as f_patterns:
        patterns = set()
        for line in f_patterns:
            data = json.loads(line)
            if is_template_valid(data["pattern"]):
                patterns.add(data["pattern"])
    this_relation_phrases = collections.defaultdict(list)
    for pattern in patterns:
        for sub, obj in sub_obj:
            phrase = pattern.replace("[X]", sub)
            phrase = phrase.replace("[Y]", obj)
            this_relation_phrases[pattern].append(phrase)
    phrases_per_relation[relation] = this_relation_phrases

In [54]:
phrases_per_relation['P127']

defaultdict(list,
            {'[X] propietario [Y].': ['BMW Motorrad propietario BMW.',
              'Soldier Field propietario Chicago.',
              'Ducati propietario Audi.',
              'Miami Orange Bowl propietario Miami.',
              'Deutsche Bahn propietario Alemania.',
              'Acura propietario Honda.',
              'Aeropuerto Internacional de Honolulu propietario Hawái.',
              'Lotus Software propietario IBM.',
              'disco compacto propietario Sony.',
              'CodePlex propietario Microsoft.'],
             'El [Y] -wed [X]': ['El BMW -wed BMW Motorrad',
              'El Chicago -wed Soldier Field',
              'El Audi -wed Ducati',
              'El Miami -wed Miami Orange Bowl',
              'El Alemania -wed Deutsche Bahn',
              'El Honda -wed Acura',
              'El Hawái -wed Aeropuerto Internacional de Honolulu',
              'El IBM -wed Lotus Software',
              'El Sony -wed disco compacto',
          

In [22]:
from transformers import BertTokenizer, BertLMHeadModel
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertLMHeadModel.from_pretrained("bert-base-multilingual-cased")

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [56]:
relations_to_patterns_to_ppl = {}
for relation in tqdm(relations):
    patterns_to_ppl = {}
    for pattern, phrases in phrases_per_relation[relation].items():
        ppls = []
        for phrase in phrases:
            encodings = tokenizer(phrase, return_tensors='pt')
            outputs = model(encodings["input_ids"], labels=encodings["input_ids"])
            ppls.append(torch.exp(-outputs.loss).item())
        patterns_to_ppl[pattern] = np.mean(ppls)
    relations_to_patterns_to_ppl[relation] = patterns_to_ppl

100%|██████████| 39/39 [05:53<00:00,  9.06s/it]


In [62]:
ppls = []
for relation, pattern_to_ppl in relations_to_patterns_to_ppl.items():
    for pattern, ppl in pattern_to_ppl.items():
        ppls.append(ppl)

In [79]:
for relation, pattern_to_ppl in relations_to_patterns_to_ppl.items():
    for pattern, ppl in pattern_to_ppl.items():
        if ppl > 1e-5:
            print(relation, pattern, ppl)

P178 [X], un producto desarrollado por [Y] 1.138791129733363e-05
P178 [X] es un producto de [Y] 1.1611506556619133e-05
P463 [X] es una parte de la organización [Y]. 1.522542280554262e-05
P1376 [X], esa es la ciudad capital de [Y]. 2.1063356143713463e-05
P1376 [X] es la ciudad capital de [Y]. 2.325843488932833e-05
P1376 La ciudad capital de [Y] es [X]. 1.096088454630717e-05
P1376 [Y], que tiene la capital [X]. 2.2923432678112476e-05
P1376 [Y] la ciudad capital es [X]. 2.0789034059021107e-05
P1376 [Y], que tiene la ciudad capital [X]. 8.718334537434202e-05
P1376 [X], la ciudad capital de [Y]. 1.1358985642573315e-05
P136 [X] juega la música [Y]. 1.059941949961285e-05
P407 [X] es un trabajo [Y] -language. 1.1269550429915399e-05
P937 [X] encontrado empleo en [Y]. 1.502322058968275e-05
P176 [X] es un producto de [Y]. 1.2234501343755256e-05
P36 [Y] es la ciudad capital de [X]. 1.708727619416095e-05
P36 La ciudad capital [X] es [Y]. 1.9944285571682486e-05


In [71]:
for i in range(0, 110, 10):
    print(i, np.percentile(ppls, i))

0 3.571987676198596e-08
10 1.7310609143894597e-07
20 3.0822326131385584e-07
30 4.481712007375904e-07
40 6.854630703401202e-07
50 1.0100167742166377e-06
60 1.649448329885672e-06
70 2.441333534051182e-06
80 3.5999798234342963e-06
90 6.493832446636818e-06
100 8.718334537434202e-05


1.0100167742166377e-06

2.965522008935295e-06

In [55]:
relations_to_patterns_to_ppl["P127"]['[X] propietario [Y].'] > relations_to_patterns_to_ppl["P127"]['El [Y] -wed [X]']

True