In [9]:
import pandas as pd
import pickle
import os
from collections import defaultdict
data_directory = "/Users/elliotschumacher/Dropbox/git/clel/datasets/wiki_data"

row_dict = []

for language in ["ar", "ko", "fa", "ru"]:
    with open(os.path.join(data_directory, language, f'wiki_{language}.pkl'), 'rb') as f:
        wiki_info = pickle.load(f)
        en_pages = wiki_info["en_pages"]
        documents = wiki_info["mentions"][language]
        nil_documents = wiki_info["nil_mentions"][language]
        l2_pages = wiki_info["l2_pages"][language]

    for doc_title in documents:
        for (sent_id, link_id), ment_dict in documents[doc_title].items():
            sentence = ment_dict['sent']
            link = ment_dict['link']
            kbid = link['id_ll']
            row_dict.append({
                "kbid" : kbid,
                "language" : language,
                "title" : link["title_ll"]
            })
            
entities_df = pd.DataFrame().from_dict(row_dict)

            

In [7]:
per_lang = pd.pivot_table(entities_df, index=["kbid", "language", "title"], 
                                 values=[], 
                                 aggfunc=len).reset_index()
per_lang = per_lang.rename(columns={0:"count"})
per_lang

Unnamed: 0,kbid,language,title,count
0,100076,fa,MI5,1
1,100161,ar,Bobby Darin,1
2,100180,ko,Iron Cross,2
3,1004,ru,April,30
4,1004834,fa,Isère (river),1
...,...,...,...,...
4101,9965489,ru,Cassidini,1
4102,9988187,ar,Twitter,1
4103,9988187,ko,Twitter,1
4104,9988187,ru,Twitter,1


In [12]:
by_kbid = []
for kbid in per_lang["kbid"].unique():
    kbid_ids = per_lang[per_lang["kbid"] == kbid]
    count = 0
    lang_set = set()
    title = ""
    lang_counts = defaultdict(lambda : 0)
    for i, row in kbid_ids.iterrows():
        count += row["count"]
        lang_set.add(row["language"])
        title = row["title"]
        lang_counts[row['language']] += row["count"]
    by_kbid.append({
        "count" : count,
        "lang" : ",".join(lang_set),
        "kbid" : kbid,
        "title" : title
    })
    for language in ["ar", "ko", "fa", "ru"]:
        if language in lang_counts:
            by_kbid[-1][f"count_{language}"] = lang_counts[language]
        else:
            by_kbid[-1][f"count_{language}"] = 0
kbid_df = pd.DataFrame().from_dict(by_kbid).sort_values(by="count", ascending=False)
kbid_df

Unnamed: 0,count,lang,kbid,title,count_ar,count_ko,count_fa,count_ru
1959,4165,"ru,ko,ar,fa",3434750,United States,1229,908,661,1367
2931,2823,"ru,ko,ar,fa",5405,China,856,796,381,790
2794,1562,"ru,ko,ar,fa",5042481,Cuba,195,310,449,608
1790,514,ko,31717,United Kingdom,0,514,0,0
507,458,"ru,ko,ar,fa",15573,Japan,14,238,93,113
...,...,...,...,...,...,...,...,...
1596,1,ru,28706,SECAM,0,0,0,1
1600,1,ru,28769803,Old Mon script,0,0,0,1
1601,1,ar,287740,James Hunt,1,0,0,0
1603,1,ko,28847431,KBO Futures League,0,1,0,0


In [14]:
by_lang = pd.pivot_table(kbid_df, index=["lang"], 
                                 values=["count"], 
                                 aggfunc=sum).reset_index().sort_values(by="count", ascending=False)
by_lang['perc'] = by_lang['count'] / sum(by_lang['count'])
by_lang

Unnamed: 0,lang,count,perc
13,"ru,ko,ar,fa",12608,0.485296
3,ko,4438,0.170824
0,ar,2544,0.097921
2,fa,2541,0.097806
7,ru,2427,0.093418
9,"ru,ar,fa",286,0.011008
14,"ru,ko,fa",237,0.009122
1,"ar,fa",224,0.008622
8,"ru,ar",156,0.006005
4,"ko,ar",148,0.005697


In [18]:

by_lang = pd.pivot_table(kbid_df, index=["lang"], 
                                 values=["count", "count_ar", "count_ko", "count_ru", "count_fa"], 
                                 aggfunc=sum).reset_index().sort_values(by="count", ascending=False)
by_lang['perc'] = by_lang['count'] / sum(by_lang['count'])
for language in ["ar", "ko", "fa", "ru"]:
    by_lang[f'perc_{language}'] = by_lang[f'count_{language}'] / sum(by_lang[f'count_{language}'])

by_lang

Unnamed: 0,lang,count,count_ar,count_fa,count_ko,count_ru,perc,perc_ar,perc_ko,perc_fa,perc_ru
13,"ru,ko,ar,fa",12608,3156,2298,3267,3887,0.485296,0.515686,0.40945,0.450323,0.573473
3,ko,4438,0,0,4438,0,0.170824,0.0,0.55621,0.0,0.0
0,ar,2544,2544,0,0,0,0.097921,0.415686,0.0,0.0,0.0
2,fa,2541,0,2541,0,0,0.097806,0.0,0.0,0.497942,0.0
7,ru,2427,0,0,0,2427,0.093418,0.0,0.0,0.0,0.35807
9,"ru,ar,fa",286,82,93,0,111,0.011008,0.013399,0.0,0.018225,0.016377
14,"ru,ko,fa",237,0,54,60,123,0.009122,0.0,0.00752,0.010582,0.018147
1,"ar,fa",224,183,41,0,0,0.008622,0.029902,0.0,0.008034,0.0
8,"ru,ar",156,53,0,0,103,0.006005,0.00866,0.0,0.0,0.015196
4,"ko,ar",148,54,0,94,0,0.005697,0.008824,0.011781,0.0,0.0


In [31]:
kbid_acc = pd.pivot_table(kbid_df, index=["lang"], 
                                 values=["kbid", "count"], 
                                 aggfunc={"kbid" : pd.Series.nunique, "count" : sum}).reset_index()
kbid_acc


Unnamed: 0,lang,count,kbid
0,ar,2544,890
1,"ar,fa",224,17
2,fa,2541,891
3,ko,4438,899
4,"ko,ar",148,18
5,"ko,ar,fa",28,6
6,"ko,fa",86,15
7,ru,2427,891
8,"ru,ar",156,17
9,"ru,ar,fa",286,12
