In [1]:
import json
import os
import sys
from collections import defaultdict
from glob import glob

import pandas as pd
from tqdm import tqdm

In [2]:
# List the `patterns` folder of every `../data/mpararel*` directory; these are
# the same paths the collection loop in the next cell iterates over.
glob("../data/mpararel*/patterns")

['../data/mpararel_00_00_06_02/patterns']

In [3]:
# Collect, for every mpararel version on disk, one
# (language, relation, number of patterns) record per relation file.
mpararels = defaultdict(list)
for mpararel_path in glob("../data/mpararel*/patterns"):
    # The path looks like .../<mpararel_name>/patterns, so the version name is
    # the second-to-last component.
    mpararel_name = mpararel_path.split('/')[-2]
    for language_folder in os.listdir(mpararel_path):
        language_path = os.path.join(mpararel_path, language_folder)
        for relation_filename in os.listdir(language_path):
            # TODO: measure lexical and syntactical variation
            patterns_file = os.path.join(language_path, relation_filename)
            with open(patterns_file) as patterns:
                # One pattern per line, so the line count is the pattern count.
                count_patterns = sum(1 for _ in patterns)
            # Drops the last len(".jsonl") characters of the file name.
            relation_name = relation_filename[:-len(".jsonl")]
            mpararels[mpararel_name].append(
                (language_folder, relation_name, count_patterns))

In [4]:
# Turn each version's list of (lang, relation, count) records into a DataFrame.
columns = ["lang", "relation", "count_patterns"]
frames_by_name = {}
for name, rows in mpararels.items():
    frames_by_name[name] = pd.DataFrame(rows, columns=columns)
mpararels = frames_by_name

In [5]:
# Report which language codes each mpararel version covers.
for k in mpararels:
    df = mpararels[k]
    lang_unique = df["lang"].unique()
    print(f"{k} has {len(lang_unique)} languages ({lang_unique})")

mpararel_00_00_06_02 has 46 languages (['ar' 'tr' 'lt' 'hu' 'sl' 'hr' 'en' 'sk' 'ga' 'gl' 'nl' 'sw' 'hy' 'sq'
 'ro' 'ca' 'cs' 'pl' 'it' 'ms' 'el' 'fi' 'et' 'fa' 'ja' 'fr' 'vi' 'es'
 'he' 'is' 'pt' 'lv' 'hi' 'af' 'id' 'bg' 'mk' 'tl' 'cy' 'ko' 'de' 'az'
 'da' 'uk' 'ru' 'sv'])


In [6]:
def _count_patterns_stats(frame):
    """Return (min, mean, max) of the frame's `count_patterns` column."""
    counts = frame["count_patterns"]
    return counts.min(), counts.mean(), counts.max()


# Print the pattern-count statistics of each mpararel version, first overall
# and then broken down per language.
for k, df in mpararels.items():
    print(">>> {} #patterns = [min={}, avg={}, max={}]".format(
        k, *_count_patterns_stats(df)))
    for lang in df["lang"].unique():
        this_df = df[df["lang"] == lang]
        relations = this_df["relation"].unique()
        print("{} has #relations={} and #patterns = [min={}, avg={}, max={}]".format(
            lang, len(relations), *_count_patterns_stats(this_df)))

>>> mpararel_00_00_06_02 #patterns = [min=1, avg=16.28546910755149, max=85]
ar has #relations=38 and #patterns = [min=3, avg=16.94736842105263, max=40]
tr has #relations=38 and #patterns = [min=1, avg=9.710526315789474, max=28]
lt has #relations=38 and #patterns = [min=1, avg=13.631578947368421, max=48]
hu has #relations=38 and #patterns = [min=3, avg=14.657894736842104, max=45]
sl has #relations=38 and #patterns = [min=1, avg=14.368421052631579, max=41]
hr has #relations=38 and #patterns = [min=1, avg=12.973684210526315, max=42]
en has #relations=38 and #patterns = [min=2, avg=12.236842105263158, max=41]
sk has #relations=38 and #patterns = [min=1, avg=16.263157894736842, max=43]
ga has #relations=38 and #patterns = [min=2, avg=14.81578947368421, max=39]
gl has #relations=38 and #patterns = [min=5, avg=19.81578947368421, max=47]
nl has #relations=38 and #patterns = [min=5, avg=20.31578947368421, max=55]
sw has #relations=38 and #patterns = [min=1, avg=15.736842105263158, max=35]
hy ha

# Debug why languages were filtered

In [2]:
# Make the project checkout importable so `dataset.create_mpararel` can be
# imported in the next cell. `os.path.dirname` only strips the trailing slash
# here, so the entry added is "/home/wsr217/mpararel".
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from the notebook location instead.
MPARAREL_ROOT = os.path.dirname("/home/wsr217/mpararel/")
# Guard against stacking duplicate entries when the cell is re-run.
if MPARAREL_ROOT not in sys.path:
    sys.path.append(MPARAREL_ROOT)

In [3]:
from dataset.create_mpararel import get_agreed_translations_and_stats, add_tuples_counts, add_ratio_column, get_language_and_relations_count
from glob import glob

In [4]:
# Lazily-filled cache: language code -> language name.
lang_code_to_name = None
def get_lang_name(lang_code):
    """Return the language name registered for `lang_code`.

    The first call reads the tab-separated file
    "../dataset/languages_mapping.txt" and caches a dict built from its `wiki`
    (code) and `name` columns in the module-level `lang_code_to_name`; later
    calls only do the dict lookup. Raises KeyError for a code that is not in
    the mapping.
    """
    global lang_code_to_name
    if lang_code_to_name is not None:
        return lang_code_to_name[lang_code]
    mapping_table = pd.read_csv("../dataset/languages_mapping.txt", sep='\t')
    lang_code_to_name = dict(
        zip(mapping_table["wiki"].values, mapping_table["name"].values))
    return lang_code_to_name[lang_code]

In [6]:
def filter_repeated_across_languages(agreed_translations,
                                     language_and_relation_counts,
                                     verbose=False):
    """Drop templates that a language uses for more than one relation.

    Despite the name, templates are compared between the relations of a single
    language, not between languages.

    Entries are processed in order and templates are removed as soon as they
    are found elsewhere, so the last relation (in processing order) that holds
    a shared template keeps it.

    Args:
        agreed_translations: mapping language -> relation -> list of templates.
            The lists are edited in place.
        language_and_relation_counts: list of tuples whose first three fields
            are (language, relation, agreed_templates_count). Any further
            fields (e.g. not-agreed and translator counts) are carried over
            unchanged, so both 4-field and 5-field tuples are accepted. The
            list is updated in place.
        verbose: when True, print the templates being checked and compared.

    Returns:
        The same `agreed_translations` and `language_and_relation_counts`
        objects, after the in-place updates.
    """
    for i, counts in enumerate(language_and_relation_counts):
        # Only the leading three fields are interpreted; the tail is preserved.
        language, relation, agreed_templates_count = counts[:3]
        relation_templates = agreed_translations[language][relation]
        if verbose:
            print("checking:", language, relation, relation_templates)
        # Gather every template used by the *other* relations of this language.
        templates_elsewhere = set()
        for other_relation, other_templates in agreed_translations[
                language].items():
            if other_relation == relation:
                continue
            if verbose:
                print("comparing to:", other_relation, other_templates)
            templates_elsewhere.update(other_templates)
        kept_templates = [
            template for template in relation_templates
            if template not in templates_elsewhere
        ]
        # Count each dropped template once, however many relations share it.
        removed_count = len(relation_templates) - len(kept_templates)
        relation_templates[:] = kept_templates
        language_and_relation_counts[i] = (
            language, relation,
            agreed_templates_count - removed_count) + tuple(counts[3:])
    return agreed_translations, language_and_relation_counts

# The direct library call is kept for reference; the code below looks like an
# inlined variant of it that also records the not-agreed counts (TODO confirm
# against dataset.create_mpararel).
#agreed_translations, df = get_agreed_translations_and_stats(
#    glob("/home/wsr217/mpararel/data/cleaned_mtrex_and_mpatterns/patterns/*"))
translations_folders = glob("/home/wsr217/mpararel/data/cleaned_mtrex_and_mpatterns/patterns/*")
# Relation ids come from the file names in the "en" folder of the first
# translation folder.
english_folder = os.path.join(translations_folders[0], "en")
relations = [
    filename.replace(".jsonl", "") for filename in os.listdir(english_folder)
]
language_and_relation_counts = []
agreed_translations = defaultdict(lambda: defaultdict(list))
for relation in tqdm(relations):
    lang_to_translations_to_votes = defaultdict(lambda: defaultdict(int))
    lang_to_translators_count = defaultdict(int)
    all_languages = set()
    for translation_folder in translations_folders:
        # The vote weight depends only on the folder name: a "bing" folder that
        # is not a "populated" one counts twice, every other folder once.
        counts_double = ("bing" in translation_folder
                         and "populated" not in translation_folder)
        vote = 2 if counts_double else 1
        for language_dirname in os.listdir(translation_folder):
            all_languages.add(language_dirname)
            patterns_file = os.path.join(translation_folder, language_dirname,
                                         relation + '.jsonl')
            if os.path.exists(patterns_file):
                lang_to_translators_count[language_dirname] += 1
                votes_for_language = lang_to_translations_to_votes[
                    language_dirname]
                with open(patterns_file) as patterns:
                    for line in patterns:
                        votes_for_language[json.loads(line)["pattern"]] += vote
    for language in all_languages:
        translations_to_votes = lang_to_translations_to_votes[language]
        # A template is "agreed" once it has collected more than one vote.
        agreed_templates = [
            template_translation
            for template_translation, votes in translations_to_votes.items()
            if votes > 1
        ]
        if agreed_templates:
            agreed_translations[language][relation].extend(agreed_templates)
        agreed_templates_count = len(agreed_templates)
        not_agreed_templates_count = (len(translations_to_votes) -
                                      agreed_templates_count)
        language_and_relation_counts.append(
            (language, relation, agreed_templates_count,
             not_agreed_templates_count, lang_to_translators_count[language]))
# The count list is also updated in place, so the frame below is built from
# the filtered counts.
agreed_translations, counts = filter_repeated_across_languages(
    agreed_translations, language_and_relation_counts)
stats_columns = [
    'language', 'relation', 'agreed_templates_count',
    'not_agreed_templates_count', 'translators_count'
]
df = pd.DataFrame(language_and_relation_counts, columns=stats_columns)

100%|██████████| 38/38 [01:26<00:00,  2.28s/it]


In [7]:
# Attach the tuple counts, then derive `phrases_count` as the product of the
# agreed-template count and the tuple count of each row.
# NOTE: this cell rebinds and mutates `df`, so it is not idempotent on re-run.
df = add_tuples_counts(df, "../data/cleaned_mtrex_and_mpatterns/tuples")
df["phrases_count"] = df["agreed_templates_count"].mul(df["tuples_count"])

# Add ratio compared to the count in english.
# (presumably this is what creates the `agreed_templates_rate` and
# `phrases_rate` columns used later - verify in dataset.create_mpararel)
df = df.sort_values(by=['language', 'relation'])
for counted_column in ('agreed_templates_count', 'phrases_count'):
    add_ratio_column(df, counted_column)

2021-09-24 14:22:32,012 - dataset.create_mpararel - INFO - Counting subject object pairs in each relation of each language.
100%|██████████| 104/104 [00:28<00:00,  3.71it/s]


In [8]:
# Name every language that has at least one relation with <= 1 agreed template.
print("Languages with relations with <= 1 template")
sparse_language_codes = df.loc[df['agreed_templates_count'] <= 1,
                               'language'].unique()
print(", ".join(
    get_lang_name(code) + f"[{code}]" for code in sparse_language_codes))

Languages with relations with <= 1 template
Aragonese[an], Asturian[ast], South Azerbaijani[azb], Bashkir[ba], Bavarian[bar], Belarusian[be], Bulgarian[bg], Bishnupriya Manipuri[bpy], Breton[br], Bosnian[bs], Chechen[ce], Cebuano[ceb], Czech[cs], Chuvash[cv], Danish[da], Basque[eu], Persian (Farsi)[fa], Finnish[fi], West Frisian[fy], Croatian[hr], Haitian[ht], Indonesian[id], Ido[io], Icelandic[is], Japanese[ja], Javanese[jv], Korean[ko], Kirghiz[ky], Latin[la], Luxembourgish[lb], Lombard[lmo], Lithuanian[lt], Minangkabau[min], Macedonian[mk], Malayalam[ml], Mongolian[mn], Burmese[my], Low Saxon[nds], Nepali[ne], Newar[new], Norwegian (Nynorsk)[nn], Norwegian (Bokmal)[no], Occitan[oc], Punjabi[pa], Polish[pl], Piedmontese[pms], Western Punjabi[pnb], Russian[ru], Sicilian[scn], Scots[sco], Serbo-Croatian[sh], Slovak[sk], Slovenian[sl], Serbian[sr], Sundanese[su], Swedish[sv], Swahili[sw], Tamil[ta], Telugu[te], Tajik[tg], Thai[th], Turkish[tr], Tatar[tt], Ukrainian[uk], Urdu[ur], Uzbek[

Danish (da), Greek (el), Hindi (hi), Japanese (ja), Korean (ko), Russian (ru), Chinese (zh)

In [12]:
def get_language_and_phrases_count(valid_df):
    """Sum `phrases_count` per language.

    Args:
        valid_df: DataFrame with `language` and `phrases_count` columns.

    Returns:
        A list of (language, total phrases) tuples, one per distinct language,
        in order of first appearance in `valid_df`.
    """
    return [
        (language,
         sum(valid_df.loc[valid_df['language'] == language, 'phrases_count']))
        for language in valid_df['language'].unique()
    ]

# Thresholds. `min_total_phrases` is applied as a fraction of the English
# phrase total; per the messages printed below, the two per-relation rates are
# likewise relative to the same relation in English.
min_templates_per_relation = 0.0
min_phrases_per_relation = 0.0
min_total_phrases = 0.2

# Filter relations that don't have enough templates.
has_several_templates = df['agreed_templates_count'] > 1
reaches_templates_rate = (df['agreed_templates_rate'] >=
                          min_templates_per_relation)
enough_templates_df = df[has_several_templates & reaches_templates_rate]
templates_message = (
    "From a total of {} relations across all languages, {} relations have "
    "more than 1 template and at least {} of the total patterns for the "
    "same relation in english.")
print(templates_message.format(len(df), len(enough_templates_df),
                               min_templates_per_relation))

# Filter relations that don't have enough phrases.
has_tuples = enough_templates_df['tuples_count'] > 0
reaches_phrases_rate = (enough_templates_df['phrases_rate'] >=
                        min_phrases_per_relation)
valid_df = enough_templates_df[has_tuples & reaches_phrases_rate]
phrases_message = (
    "From a total of {} relations across all languages, {} relations have "
    "at least {} of the total phrases for the same relation in "
    "english.")
print(phrases_message.format(len(enough_templates_df), len(valid_df),
                             min_phrases_per_relation))

# Filter languages that don't have enough phrases.
# From here on `valid_df` holds one row per language, not per relation.
valid_df = pd.DataFrame(get_language_and_phrases_count(valid_df),
                        columns=['language', 'total_phrases'])
en_phrases_count = valid_df.loc[valid_df['language'] == 'en',
                                'total_phrases'].values[0]
min_phrases_count = en_phrases_count * min_total_phrases
valid_languages = valid_df.loc[
    valid_df['total_phrases'] >= min_phrases_count, 'language'].values
len(valid_languages), valid_languages

From a total of 3952 relations across all languages, 2627 relations have more than 1 template and at least 0.0 of the total patterns for the same relation in english.
From a total of 2627 relations across all languages, 2543 relations have at least 0.0 of the total phrases for the same relation in english.


(45,
 array(['af', 'ar', 'az', 'bg', 'ca', 'cs', 'cy', 'da', 'de', 'el', 'en',
        'es', 'et', 'fa', 'fi', 'fr', 'ga', 'gl', 'he', 'hi', 'hr', 'hu',
        'hy', 'id', 'is', 'it', 'ja', 'ko', 'lt', 'lv', 'mk', 'ms', 'nl',
        'pl', 'pt', 'ro', 'ru', 'sk', 'sl', 'sq', 'sv', 'sw', 'tr', 'uk',
        'vi'], dtype=object))

In [4]:
# Dump the raw patterns that each `pararel_*_fixed` translation folder holds
# for one language/relation pair.
lang = "zh"
relation = "P17"
relation_basename = relation + ".jsonl"
for translation_folder in glob("../data/multilingual_logging/pararel_*_fixed"):
    relation_file = os.path.join(translation_folder, lang, relation_basename)
    print(">>>", translation_folder.rsplit('/', 1)[-1])
    # A folder without a file for this pair only gets its header line.
    if not os.path.isfile(relation_file):
        continue
    with open(relation_file) as f:
        for line in f:
            print(json.loads(line))
# Trailing blank lines to pad the end of the output.
for _ in range(10):
    print("")

>>> pararel_mbart50_en2m_fixed
{'pattern': '[X]位于[Y]。', 'lemma': 'is-located', 'extended_lemma': 'is-located', 'tense': 'present'}
{'pattern': '[X],位于[Y]。', 'lemma': 'which-is-located', 'extended_lemma': 'which-is-located', 'tense': 'present'}
{'pattern': '[X],位于[Y]。', 'lemma': 'located', 'extended_lemma': 'located', 'tense': 'present'}
>>> pararel_m2m100_big_fixed
{'pattern': '', 'lemma': 'is-located', 'extended_lemma': 'is-located', 'tense': 'present'}
{'pattern': '', 'lemma': 'which-is-located', 'extended_lemma': 'which-is-located', 'tense': 'present'}
{'pattern': '', 'lemma': 'located', 'extended_lemma': 'located', 'tense': 'present'}
>>> pararel_bing_fixed
>>> pararel_opus_mt_fixed
{'pattern': '[X]位于[Y]。', 'lemma': 'is-located', 'extended_lemma': 'is-located', 'tense': 'present'}
{'pattern': '[X],位于[Y]。', 'lemma': 'which-is-located', 'extended_lemma': 'which-is-located', 'tense': 'present'}
{'pattern': '[X],位于[Y]。', 'lemma': 'located', 'extended_lemma': 'located', 'tense': 'presen

In [9]:
# Danish ("da") relations left with at most one agreed template.
df.loc[df["language"].eq("da") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
2927,da,P361,1,0,8,524,524,0.5,0.281116


In [10]:
# Japanese ("ja") relations left with at most one agreed template.
df.loc[df["language"].eq("ja") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
2183,ja,P17,1,5,10,531,531,0.2,0.114194
2807,ja,P276,1,2,10,700,700,0.5,0.365344


In [12]:
# Korean ("ko") relations left with at most one agreed template.
df.loc[df["language"].eq("ko") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
2107,ko,P17,0,6,10,261,0,0.0,0.0
2731,ko,P276,1,3,10,343,343,0.5,0.179019
2939,ko,P361,1,1,10,631,631,0.5,0.338519


In [13]:
# Russian ("ru") relations left with at most one agreed template.
df.loc[df["language"].eq("ru") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
2707,ru,P276,1,3,10,629,629,0.5,0.328288
2915,ru,P361,1,0,10,815,815,0.5,0.437232


In [14]:
# Chinese ("zh") relations left with at most one agreed template.
df.loc[df["language"].eq("zh") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
3314,zh,P108,0,12,8,235,0,0.0,0.0
3938,zh,P127,1,9,8,531,531,0.25,0.193231
402,zh,P1303,0,8,8,337,0,0.0,0.0
2378,zh,P136,1,13,8,310,310,0.125,0.041622
714,zh,P1412,1,20,8,522,522,0.125,0.067337
2274,zh,P190,1,7,8,995,995,0.25,0.25
1234,zh,P20,1,20,8,502,502,0.111111,0.058529
194,zh,P279,0,12,8,726,0,0.0,0.0
3002,zh,P361,1,0,8,838,838,0.5,0.449571
2066,zh,P413,1,14,8,427,427,0.142857,0.064076


In [10]:
# Scots ("sco") relations left with at most one agreed template.
df.loc[df["language"].eq("sco") & df["agreed_templates_count"].le(1)]

Unnamed: 0,language,relation,agreed_templates_count,not_agreed_templates_count,translators_count,tuples_count,phrases_count,agreed_templates_rate,phrases_rate
587,sco,P101,0,0,2,173,0,0.0,0.0
1003,sco,P103,0,0,2,210,0,0.0,0.0
1835,sco,P106,0,0,2,120,0,0.0,0.0
3291,sco,P108,0,0,2,45,0,0.0,0.0
3915,sco,P127,0,0,2,73,0,0.0,0.0
379,sco,P1303,0,0,2,50,0,0.0,0.0
3707,sco,P131,0,0,2,82,0,0.0,0.0
2355,sco,P136,0,0,2,64,0,0.0,0.0
1731,sco,P1376,0,0,2,213,0,0.0,0.0
1419,sco,P138,0,0,2,137,0,0.0,0.0
