In [1]:
import os
from util import Stats
import pandas as pd
from util import count_lines_from_file


# Language codes to analyse, stored whitespace-separated in the meta file.
# A context manager is used so the file handle is closed as soon as the
# contents are read, instead of being left open until garbage collection.
with open("./meta/low-langs.txt") as langs_file:
    LANGS = langs_file.read().split()
DATA_DIR = "./data"

# One flat list of per-KG statistics records; turned into a DataFrame once
# at the end rather than growing a frame row by row.
records = []

for lang in LANGS:
    # Each dataset lives in a directory named "<lang>_en" under DATA_DIR.
    stats = Stats(os.path.join(DATA_DIR, f"{lang}_en"))
    # analyze() returns two records per directory — presumably one for each
    # side of the language pair (TODO confirm against util.Stats).
    record_1, record_2 = stats.analyze()
    records += [record_1, record_2]

df = pd.DataFrame(records)

In [2]:
def get_lang(row):
    """Return the language code that a statistics row refers to.

    The last path component of ``row.data_dir`` is split on underscores
    (e.g. ``"zh_yue_en"`` -> ``["zh", "yue", "en"]``). Rows with
    ``row.lang == 1`` map to the second-to-last piece; every other row
    maps to the last piece.
    """
    dir_name = row.data_dir.rpartition('/')[2]
    pieces = dir_name.split('_')
    position = -2 if row.lang == 1 else -1
    return pieces[position]

# Add a `language` column by applying get_lang to every row (axis=1 = row-wise).
df['language'] = df.apply(get_lang, axis=1)

In [3]:
# Line counts of the zh_yue_en reference-entity file and its train/dev/test
# splits ("ILL" presumably stands for inter-language links — TODO confirm).
ILL_stats = {
    'num_of_ILLs': count_lines_from_file("data/zh_yue_en/ref_ent_ids"),
    'train': count_lines_from_file("data/zh_yue_en/train"),
    'dev': count_lines_from_file("data/zh_yue_en/dev"),
    'test': count_lines_from_file("data/zh_yue_en/test"),
}

print(ILL_stats)

{'num_of_ILLs': 2777, 'train': 1748, 'dev': 195, 'test': 834}


In [4]:
# Build the summary table: keep every row with lang == 1 plus the row at
# index 1 (the printed output shows that row as "en"), drop the bookkeeping
# columns, order by entity count, then transpose so each statistic is a row.
table = (
    df.loc[(df.lang == 1) | (df.index == 1)]
    .drop(columns=["lang", "data_dir"])
    .sort_values(by="num_entities", ascending=False)
    .T
    .sort_index()
)

print(table.to_markdown())

|                       | 1      | 4     | 2     | 0     | 28    | 6     | 8     | 12     | 22    | 16    | 18    | 10    | 20    | 24    | 14    | 26    |
|:----------------------|:-------|:------|:------|:------|:------|:------|:------|:-------|:------|:------|:------|:------|:------|:------|:------|:------|
| language              | en     | fa    | ko    | he    | hy    | el    | ka    | mk     | ur    | bn    | ta    | hi    | yue   | ml    | tl    | kn    |
| num_attribute_triples | 118154 | 90405 | 68040 | 93656 | 30552 | 62054 | 38834 | 107520 | 49700 | 65278 | 24113 | 26805 | 14490 | 26330 | 5743  | 17699 |
| num_attributes        | 2405   | 971   | 906   | 851   | 164   | 560   | 208   | 1596   | 240   | 613   | 130   | 78    | 69    | 120   | 29    | 52    |
| num_descriptions      | 47389  | 11112 | 6831  | 9989  | 5193  | 8182  | 3251  | 4222   | 3302  | 5379  | 2750  | 3209  | 3292  | 2929  | 1787  | 959   |
| num_entities          | 49931  | 27024 | 25633 | 17432 | 16397

In [5]:
# Bare last expression: display the summary table with the notebook's rich repr.
table

Unnamed: 0,1,4,2,0,28,6,8,12,22,16,18,10,20,24,14,26
language,en,fa,ko,he,hy,el,ka,mk,ur,bn,ta,hi,yue,ml,tl,kn
num_attribute_triples,118154,90405,68040,93656,30552,62054,38834,107520,49700,65278,24113,26805,14490,26330,5743,17699
num_attributes,2405,971,906,851,164,560,208,1596,240,613,130,78,69,120,29,52
num_descriptions,47389,11112,6831,9989,5193,8182,3251,4222,3302,5379,2750,3209,3292,2929,1787,959
num_entities,49931,27024,25633,17432,16397,16072,13962,12504,10975,9634,9040,8782,8731,7624,5468,3716
num_relation_triples,92056,62031,58988,48404,45972,44366,40887,39327,38142,35532,33738,23904,22019,22564,14590,12499
num_relations,562,398,422,397,218,336,233,424,229,266,189,122,107,147,60,88


#### Sampling methods
(1) Languages: 15 languages, each of which uses a distinct script.

    Rank - Lang (Script; Wikicode)
    
    <Wikipedia #article rank: 26-50>
    26 - Hebrew (Hebr; he), 
    28 - Korean (Hang; ko), 
    37 - Farsi (Persian; fa), 
    44 - Greek (Grek; el), 
    50 - Georgian (Geor; ka)

    <Wikipedia #article rank: 51-75>
    54 - Hindi (Deva; hi), 
    60 - Macedonian (Cyrl; mk), 
    61 - Tagalog (Latn; tl), 
    63 - Bengali (Beng; bn), 
    67 - Tamil (Taml; ta)

    <Wikipedia #article rank: 76-108>
    83 - Cantonese (Hant; yue), 
    86 - Urdu (Arab; ur), 
    90 - Malayalam (Mlym; ml), 
    99 - Kannada (Knda; kn), 
    108 - Armenian (Armn; hy) 

(2) To ensure comparability, we first collect the seed entities that have wiki sitelinks in all of the above 15 languages.

(3) Collect one-hop neighbors of the seeds (obtained from (2)) and build relation triples for a specified language if:
    
    (i) the relation has a label written in this language, and 
    (ii) the object has a wiki sitelink of this language.
    
(4) Collect language-specific attributes of the seeds, i.e. attributes whose labels are written in the specified language.

(5) Collect language-specific descriptions of all the entities in the monolingual KG of the specified language (created by the above).    

#### Issues

(1) In general, these monolingual KGs are quite small because the given seeds are restricted to the intersection of all languages

(2) Similar to (1), the topics of such entities are also more limited
    
(3) Wikidata descriptions are shorter than article abstracts in Wikipedia pages or DBpedia

#### Questions

(1) Do we need to discard some small languages so that the intersection set can be bigger?

(2) Do we need the intersection set to ensure comparability, or do only the numbers of ILL seeds need to be equal?

(3) Do we need textual information from other resources, or should we just add label names to the descriptions?