In [2]:
import numpy
import json
import pandas as pd

In [130]:
import random

In [None]:
### 1. For each resource, divide lemmas into quartiles based on N of entities they have
### 2. In each quartile, randomly draw 10 entities excluding related match entities; take only unique entities (40 in total)

In [74]:
# importing query terms with lemmas
with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
    query_terms = json.load(jf)

In [52]:
def _get_lemma_quartiles(lang:str, path_to_n_hits_by_lemma:str) -> dict:
    '''
    Grouping lemmas in quartiles according to N of entities they have in resources
    lang: str, 'en' or 'nl' (relevant only for wikidata and aat)
    path_to_n_hits_by_lemma: str, a path to a csv file with N of hits by lemma (for example, '/LODlit/Wikidata/n_hits_by_lemma.csv')
    Returns dict {'q1':['lemma']}
    '''
    
    results = {}

    resource_hits = pd.read_csv(path_to_n_hits_by_lemma)
    
    if 'lang' in resource_hits.columns:
        resource_hits_by_lang = resource_hits.loc[resource_hits['lang'] == lang]
    # for PWN and ODWN
    else:
        resource_hits_by_lang = resource_hits
    
    # getting quantiles values
    q_values = list(resource_hits_by_lang["total_lemma"].quantile([0,0.25,0.5,0.75,1]))
    
    # getting list of lemmas by quartiles
    # quartile 1
    results['q1'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[0]) <= row[1]['total_lemma'] <= int(q_values[1])]
    # quartile 2
    results['q2'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[1]) <= row[1]['total_lemma'] <= int(q_values[2])]
    # quartile 3
    results['q3'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[2]) <= row[1]['total_lemma'] <= int(q_values[3])]
    # quartile 4
    results['q4'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[3]) <= row[1]['total_lemma'] <= int(q_values[4])]

    return results

In [None]:
# TO DO
# def _get_subset_by_resource()

### Testing Wikidata

In [44]:
path_to_n_hits = '/Users/anesterov/reps/LODlit/Wikidata/n_hits_by_lemma.csv'

In [54]:
wd_en_quartiles = _get_lemma_quartiles('en',path_to_n_hits)

In [55]:
wd_en_top10 = pd.read_csv('/Users/anesterov/reps/LODlit/annotation_sheet_wikidata_en.csv')

In [None]:
# first, generate a file with Top-10 grouped by lemmas

In [157]:
# change path later; the file on GitHub is zipped
wd_en_cs = pd.read_csv('/Users/anesterov/LODlit_local/wd/apr6/wikidata_en_cs.csv')

In [158]:
# dropping NaN values if there are no search results and cs_rm per
wd_en_cs.dropna(subset=['hit_id', 'cs_rm'], how='all', inplace=True)

In [159]:
# remove entities with no CS score
wd_en_cs.drop(wd_en_cs[wd_en_cs['cs_rm'] == 0].index, inplace=True)

In [160]:
# insert the lemmas column
lemmas = []
for row in wd_en_cs.iterrows():
    for lemma, wordforms in query_terms['en'].items():
        if row[1]['term'] in wordforms:
            lemmas.append(lemma)

In [161]:
wd_en_cs.insert(0,"lemma",lemmas)

In [162]:
# drop duplicates
wd_en_cs.drop_duplicates(subset=["lemma","hit_id"], inplace=True)

In [None]:
# sort and get Top-10 by lemma

In [163]:
top_10_by_lemma = pd.DataFrame()

In [164]:
for group in wd_en_cs.groupby("lemma"):
    top_10_by_lemma = top_10_by_lemma.append(group[1].sort_values(by="cs_rm", ascending=False)[0:10])

In [165]:
top_10_by_lemma

Unnamed: 0,lemma,term,hit_id,bow,cs_rm,cs_wm,cs_rm_wm
87351,aboriginal,aboriginal,Q103817,"['culture', 'aboriginal', 'minority', 'descend...",0.947705,0.820495,0.945303
87882,aboriginal,aboriginal,Q96200400,"['statement', 'state', 'territorial', 'buildin...",0.937666,0.814829,0.938606
88901,aboriginal,aboriginal,Q7980672,"['language', 'area', 'event', 'recognised', 'p...",0.929123,0.814358,0.929588
87611,aboriginal,aboriginal,Q28942344,"['aboriginal', 'people', 'community', 'indigen...",0.921723,0.842450,0.924107
88372,aboriginal,aboriginal,Q8039318,"['victoria', 'corporation', 'strait', 'council...",0.920416,0.821256,0.919088
...,...,...,...,...,...,...,...
42897,white,white,Q8125662,"['movement', 'white', 'category', 'people', 'w...",0.893367,0.817041,0.882048
40087,white,white,Q106677040,"['white', 'category', 'people', 'ethnoracial',...",0.892952,0.873420,0.900122
39323,white,white,Q2072081,"['state', 'person', 'white', 'people', 'color'...",0.886583,0.859105,0.894965
40228,white,white,Q2560112,"['separatism', 'movement', 'supremacy', 'white...",0.882116,0.798548,0.872482


In [166]:
len(wd_en_quartiles['q1'])

19

In [167]:
q1_entities = []
q2_entities = []
q3_entities = []
q4_entities = []

for group in top_10_by_lemma.groupby('lemma'):
    if group[0] in wd_en_quartiles['q1']:
        q1_entities.extend(group[1]['hit_id'].to_list())
        
    if group[0] in wd_en_quartiles['q2']:
        q2_entities.extend(group[1]['hit_id'].to_list())
        
    if group[0] in wd_en_quartiles['q3']:
        q3_entities.extend(group[1]['hit_id'].to_list())
        
    if group[0] in wd_en_quartiles['q4']:
        q4_entities.extend(group[1]['hit_id'].to_list())

In [168]:
len(q4_entities)

190

In [193]:
# select random entities from each quartile
random_subset = []

random_subset.extend(random.choices(q1_entities, k=10))
random_subset.extend(random.choices(q2_entities, k=10))
random_subset.extend(random.choices(q3_entities, k=10))
random_subset.extend(random.choices(q4_entities, k=10))

In [194]:
len(set(random_subset))

37

In [171]:
wd_annotated = pd.read_csv('/Users/anesterov/reps/LODlit/Wikidata/annotated/wd_en_rm.csv')

In [195]:
annotated_subset = pd.DataFrame()

for row in wd_annotated.iterrows():
    if row[1]['entity_id'] in random_subset:
        annotated_subset = annotated_subset.append(row[1])

In [196]:
# add lemmas
lemmas = []
for row in annotated_subset.iterrows():
    for lemma, wordforms in query_terms['en'].items():
        if row[1]['term'] in wordforms:
            lemmas.append(lemma)

In [197]:
annotated_subset.insert(0,"lemma",lemmas)

In [198]:
annotated_subset.drop_duplicates(subset=["entity_id","lemma"], inplace=True)

In [199]:
annotated_subset.to_csv('/Users/anesterov/reps/LODlit/Wikidata/annotated/subset_3.csv')

In [183]:
for e_id in random_subset:
    if e_id not in wd_annotated["entity_id"].to_list():
        print(e_id)