In [1]:
import os
from util import Stats
import pandas as pd
from util import count_lines_from_file, collect_all_seeds


# Language codes to analyse, read as whitespace-separated tokens.
# The context manager guarantees the file handle is closed after reading.
with open("./meta/low-langs.txt") as langs_file:
    LANGS = langs_file.read().split()
DATA_DIR = "./data"

# Flat list of statistic records; each "<lang>_en" dataset directory
# contributes exactly two records (see the unpacking below).
records = []

for lang in LANGS:
    stats = Stats(os.path.join(DATA_DIR, f"{lang}_en"))
    # Unpacking doubles as a check that analyze() yields exactly two records.
    record_1, record_2 = stats.analyze()
    records.extend((record_1, record_2))

df = pd.DataFrame(records)

In [2]:
def get_lang(row):
    """Return the language code that a statistics row belongs to.

    The last '/'-separated component of ``row.data_dir`` is split on
    underscores. Rows with ``row.lang == 1`` get the second-to-last piece;
    every other row gets the last piece.
    """
    dataset_name = row.data_dir.split('/')[-1]
    parts = dataset_name.split('_')
    position = -2 if row.lang == 1 else -1
    return parts[position]

# Label every record (row-wise apply) with the language code derived from
# its dataset directory name.
df['language'] = df.apply(get_lang, axis='columns')

In [3]:
# Files of the zh_yue-en dataset whose line counts are reported, keyed by the
# name under which each count is stored.
ill_files = {
    'num_of_ILLs': "data/zh_yue_en/ref_ent_ids",
    'train': "data/zh_yue_en/train",
    'dev': "data/zh_yue_en/dev",
    'test': "data/zh_yue_en/test",
}

ILL_stats = {key: count_lines_from_file(path) for key, path in ill_files.items()}

print(ILL_stats)

{'num_of_ILLs': 15000, 'train': 9450, 'dev': 1050, 'test': 4500}


In [4]:
# Keep every record with lang == 1, plus the row at index 1 (the second record
# of the first language pair -- presumably the shared English KG; TODO confirm),
# then pivot so that each record becomes a column and each statistic a row.
table = (
    df[(df.lang == 1) | (df.index == 1)]
    .drop(columns=["lang", "data_dir"])
    .sort_values(by="num_entities", ascending=False)
    .transpose()
    .sort_index()
)

# Re-read the language list; the context manager closes the file handle.
with open("meta/low-langs.txt") as langs_file:
    LANGS = langs_file.read().strip().split()
SEED_ID = "./meta/seeds.id.{lang}.txt"

seed_path_map = {lang: SEED_ID.format(lang=lang) for lang in LANGS}
seeds = collect_all_seeds(seed_path_map)
# Re-key the "zh_yue" seeds as "yue", the label get_lang derives for that dataset.
seeds["yue"] = seeds.pop("zh_yue")


def count_seeds(ents):
    """Return the number of seed entities in ``ents``."""
    return len(ents)


def miss(ents):
    """Return ``(len(ents) - sum(ents.values())) / len(ents)`` rounded to 2 places.

    Presumably each value flags whether the seed has an ILL to English, which
    would make the result the share of seeds without one -- TODO confirm.

    Raises:
        ZeroDivisionError: if ``ents`` is empty.
    """
    return round((len(ents) - sum(ents.values())) / len(ents), 2)


# One-row frames whose columns are language codes, ready to append to `table`.
seed_counts = {lang: count_seeds(ents) for lang, ents in seeds.items()}
num_of_seeds = pd.DataFrame([seed_counts], index=["num_of_seeds"])

missing_rates = {lang: miss(ents) for lang, ents in seeds.items()}
missing_rate = pd.DataFrame([missing_rates], index=["ILL_missing_rate"])

# Use the "language" row as column headers. It is selected by label rather than
# by position so the step does not rely on "language" sorting first.
table = table.rename(columns=table.loc["language"].to_dict())
table = table.drop(index="language")
table = pd.concat([table, num_of_seeds, missing_rate])

In [5]:
# Show the assembled statistics table as this cell's output.
table

Unnamed: 0,en,fa,ko,he,hy,el,yue,ka,mk,bn,ur,hi,ta,tl,ml,kn
num_attribute_triples,1473285,80602.0,71547.0,121470.0,69381.0,94136.0,41823.0,67154.0,108720.0,71699.0,34436.0,31804.0,32513.0,22315.0,52570.0,35089.0
num_attributes,4634,1035.0,1056.0,984.0,175.0,627.0,80.0,224.0,1663.0,685.0,231.0,77.0,133.0,28.0,130.0,56.0
num_descriptions,411872,34608.0,34484.0,33896.0,33408.0,32899.0,29806.0,29140.0,28024.0,25161.0,24311.0,22606.0,22522.0,21258.0,21231.0,17071.0
num_entities,411882,34635.0,34489.0,33899.0,33438.0,32912.0,29893.0,29157.0,28043.0,25199.0,24331.0,22631.0,22541.0,21271.0,21243.0,17081.0
num_relation_triples,1083093,58882.0,57774.0,70089.0,65121.0,67831.0,49155.0,53279.0,52703.0,49681.0,36980.0,36101.0,42762.0,33554.0,38270.0,24849.0
num_relations,921,401.0,438.0,468.0,236.0,389.0,117.0,255.0,433.0,278.0,199.0,128.0,193.0,62.0,157.0,92.0
num_of_seeds,149369,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
ILL_missing_rate,0,0.14,0.31,0.27,0.38,0.26,0.27,0.25,0.29,0.16,0.13,0.39,0.37,0.16,0.26,0.38


#### Sampling methods
(1) Languages: 15 languages, each of which uses a distinct script.

    Rank - Lang (Script; Wikicode)
    
    <Wikipedia #article rank: 26-50>
    26 - Hebrew (Hebr; he), 
    28 - Korean (Hang; ko), 
    37 - Farsi (Persian; fa), 
    44 - Greek (Grek; el), 
    50 - Georgian (Geor; ka)

    <Wikipedia #article rank: 51-75>
    54 - Hindi (Deva; hi), 
    60 - Macedonian (Cyrl; mk), 
    61 - Tagalog (Latn; tl), 
    63 - Bengali (Beng; bn), 
    67 - Tamil (Taml; ta)

    <Wikipedia #article rank: 76-108>
    83 - Cantonese (Hant; yue), 
    86 - Urdu (Arab; ur), 
    90 - Malayalam (Mlym; ml), 
    99 - Kannada (Knda; kn), 
    108 - Armenian (Armn; hy) 

(2) We first sample 15K seed entities from each language, except for English.

(3) To ensure comparability, all the sampled entities that have English counterparts, i.e., the union, are set as English seeds.
As a result, there are approximately 150K English seed entities. 

(4) Using the seeds of each language, we collect their one-hop neighbours to build the monolingual KG; a triple is inserted only if:
    
    (i) the relation has a label written in this language, and 
    (ii) the object has a wiki sitelink of this language.
    
(5) Collect language-specific attributes of the seeds, whose labels are written in the specified language.

(6) Collect language-specific descriptions of all the entities in the monolingual KG of the specified language. 

(7) Note that the sampled 15K entities may or may not have ILLs to English. Each language has a different missing rate, and the missing ones are annotated in the ground truth.  

#### Issues

(1) The union English KG is quite big. We may need to filter out some triples, e.g., low-contribution ones, so that a GPU has enough memory to run GCNs on it.

In [6]:
# print(pd.DataFrame.to_markdown(table))