In [1]:
import os
import json
from tqdm import tqdm
import re
from glob import glob

In [2]:
def is_template_valid(template):
    """Checks that the template has one [X], one [Y], and extra text.

    Args:
        template: the pattern string, e.g. "[X] is a subclass of [Y]".

    Returns:
        True when "[X]" and "[Y]" each occur exactly once and something
        other than whitespace, '.', ',' or stray square brackets is left
        once both placeholders are removed; False otherwise.
    """
    if template.count("[X]") != 1 or template.count("[Y]") != 1:
        return False
    # Remove the placeholders as whole tokens. The previous check used a
    # character class, where a bare "X" or "Y" counted as part of a
    # placeholder, so a template such as "[X] Y [Y]" was wrongly rejected.
    remainder = template.replace("[X]", "").replace("[Y]", "")
    # Anything that is not a separator counts as real text.
    return re.search(r'[^\[\]\s.,]', remainder) is not None

In [3]:
import re

def clean_template(template):
    """Normalizes a template so equal phrasings compare equal.

    Removes the punctuation characters '.', '!', ',' and ':', collapses
    runs of spaces into one space, and drops a leading or trailing space.

    Args:
        template: the pattern string to normalize.

    Returns:
        The normalized template string.
    """
    template = re.sub(r'[.!,:]', '', template)
    template = re.sub(r' +', ' ', template)
    # Runs of spaces were collapsed above, so at most one space can remain
    # at either end. The leading one appears when the template started with
    # punctuation (e.g. ", [X] ...") and used to be left in place.
    template = re.sub(r'^ | $', '', template)
    return template

In [4]:
# One output folder per translation system; the trailing slash in the
# pattern restricts the matches to directories.
mpararel_folders_pattern = "../generated_datasets/mpararel_*/"
mpararel_folders = glob(mpararel_folders_pattern)
mpararel_folders

['../generated_datasets/mpararel_google/',
 '../generated_datasets/mpararel_mbart50_en2m/',
 '../generated_datasets/mpararel_m2m100_big/',
 '../generated_datasets/mpararel_opus_mt/',
 '../generated_datasets/mpararel_bing/']

In [6]:
# Language code of the sub-folder whose patterns are compared.
language = "es"
# Relation ids come from the file names under "en/triples" of the first
# folder; every folder is then expected to have a patterns file for each.
relations = [x.replace(".jsonl", "") for x in
             os.listdir(os.path.join(mpararel_folders[0], "en/triples"))]

# translator folder -> relation id -> set of valid, cleaned templates.
translator_to_relation_to_phrases = {}
for mpararel_folder in mpararel_folders:
    translator_to_relation_to_phrases[mpararel_folder] = {}
    for relation in relations:
        praphrases = set()
        patterns_path = os.path.join(mpararel_folder, language, "patterns", relation + '.jsonl')
        # Read as UTF-8 explicitly: the templates contain non-ASCII
        # characters and the default encoding depends on the locale.
        # NOTE(review): assumes the files were written as UTF-8 - confirm.
        with open(patterns_path, encoding="utf-8") as f_praphrases:
            for line in f_praphrases:
                data = json.loads(line)
                if is_template_valid(data["pattern"]):
                    praphrases.add(clean_template(data["pattern"]))
        translator_to_relation_to_phrases[mpararel_folder][relation] = praphrases

TODO:
- convert text to lowercase
- ignore? punctuation, or format spaces around punctuation.

In [7]:
# phrase -> paraphrase lookup built from the PPDB file, filled in both
# directions. A later line overwrites an earlier entry for the same key, so
# only one paraphrase is kept per phrase.
# NOTE(review): the path is absolute and user-specific; consider moving it
# to a configurable location.
es_paraphrases = {}
# Read as UTF-8 explicitly; the default encoding depends on the locale.
# NOTE(review): assumes the PPDB file is UTF-8 encoded - confirm.
with open("/home/wsr217/ppdb/ppdb-1.0-s-phrasal", encoding="utf-8") as f_ppdb:
    for line in f_ppdb:
        # Fields are separated by ' ||| '; fields 1 and 2 are used as the
        # phrase and its paraphrase.
        info = line.split(' ||| ')
        es_paraphrases[info[1]] = info[2]
        es_paraphrases[info[2]] = info[1]

In [8]:
def get_longest_text(template):
    """Returns the longest text segment around the two placeholders.

    The template is split into the text before the first placeholder,
    between the two placeholders, and after the second one. The longest of
    the three is returned (the earliest one on ties), with one leading and
    one trailing space removed.

    Raises:
        AttributeError: if the template does not contain two placeholders.
    """
    pieces = re.match(r'(.*)(\[[XY]\])(.*)(\[[XY]\])(.*)', template)
    # Groups 1, 3 and 5 are the text segments; 2 and 4 are the placeholders.
    segments = [pieces.group(index) for index in (1, 3, 5)]
    # max() returns the first maximal element, so ties go to the earliest.
    winner = max(segments, key=len)
    winner = re.sub(r'^ ', '', winner)
    return re.sub(r' $', '', winner)

In [9]:
def get_middle_text(template):
    """Returns the text between the placeholders, or None.

    Only templates of the exact form "<placeholder> text <placeholder>"
    match: they start with one placeholder, end with the other, and have a
    single space on each side of the text. Anything else gives None.
    """
    found = re.match(r'^\[[XY]\] (.*) \[[XY]\]$', template)
    return found.group(1) if found else None

In [11]:
# Show the translator folders that were loaded (the keys of the outer dict).
translator_to_relation_to_phrases.keys()

dict_keys(['../generated_datasets/mpararel_google/', '../generated_datasets/mpararel_mbart50_en2m/', '../generated_datasets/mpararel_m2m100_big/', '../generated_datasets/mpararel_opus_mt/', '../generated_datasets/mpararel_bing/'])

In [12]:
import collections

In [13]:
# For each relation, report which translated templates were produced by
# more than one translator ("agreed") and which by only one ("not agreed"),
# plus any PPDB paraphrase found for the text between the placeholders.
for relation in relations:
    # Each translator holds a set of templates per relation, so a template
    # gets at most one vote per translator.
    translations_to_votes = collections.defaultdict(int)
    for relation_to_phrases in translator_to_relation_to_phrases.values():
        for candidate in relation_to_phrases[relation]:
            translations_to_votes[candidate] += 1

    agreed = {text for text, count in translations_to_votes.items() if count > 1}
    not_agreed = {text for text, count in translations_to_votes.items() if not count > 1}

    # Note: an earlier variant skipped templates whose longest text segment
    # (see get_longest_text) was a single word; that filter is disabled.
    ppdb_paraphrases = {}
    for phrase in translations_to_votes:
        middle_text = get_middle_text(phrase)
        if not middle_text or middle_text not in es_paraphrases:
            continue
        ppdb_paraphrases[middle_text] = es_paraphrases[middle_text]

    # Relations on which every template was agreed are not reported.
    if not not_agreed:
        continue
    summary = "In relation {} there were {} agreed and {} not agreed translations.".format(
        relation, len(agreed), len(not_agreed))
    print(summary)
    print("Agreed:", agreed)
    print("Not agreed:", not_agreed)
    if ppdb_paraphrases:
        print("From ppdb:", ppdb_paraphrases)
    print()

In relation P30 there were 4 agreed and 3 not agreed translations.
Agreed: {'[X] se encuentra en el continente [Y]', '[X] pertenece al continente de [Y]', '[X] es una parte del continente de [Y]', '[X] se encuentra en [Y]'}
Not agreed: {'[X] está ubicado en [Y]', '[X] está ubicado en el continente [Y]', '[X] es parte del continente de [Y]'}
From ppdb: {'se encuentra en': 'se encuentra situado en'}

In relation P279 there were 4 agreed and 3 not agreed translations.
Agreed: {'[X] un tipo de [Y]', '[X] es una subclase de [Y]', '[X] una subclase de [Y]', '[X] que es una subclase de [Y]'}
Not agreed: {'[X] una subcategoría de [Y]', '[X] que es una subcategoría de [Y]', '[X] es una subcategoría de [Y]'}
From ppdb: {'un tipo de': 'una especie de'}

In relation P39 there were 6 agreed and 6 not agreed translations.
Agreed: {'[X] tiene la posición de [Y]', '[X] que ocupa la posición de [Y]', 'La posición de [X] es [Y]', '[X] cuya posición es la de [Y]', '[X] que tiene la posición de [Y]', '[X]