In [1]:
import json
import pandas as pd
import random

In [None]:
### 1. For each resource, divide the lemmas into quartiles based on the number of entities each lemma has
### 2. In each quartile, randomly draw 10 entities, excluding related-match entities; keep only unique entities (40 in total)

In [None]:
def _get_lemma_by_term(query_term:str, lang:str) -> str:
    '''
    Getting a lemma of a query term
    lang: str, 'en' or 'nl'
    Returns str, 'not found' if lemma was not found
    '''
    
    return_lemma = 'not found'
    
    # importing query terms with lemmas
    # change path to GitHub
    
    with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
        query_terms = json.load(jf)
        
    for lemma, qt in query_terms[lang].items():
        if query_term in qt:
            return_lemma = lemma
            
    return return_lemma

In [2]:
def _get_lemma_quartiles(lang:str, path_to_n_hits_by_lemma:str) -> dict:
    '''
    Grouping lemmas in quartiles according to N of entities they have in resources
    lang: str, 'en' or 'nl' (relevant only for wikidata and aat)
    path_to_n_hits_by_lemma: str, a path to a csv file with N of hits by lemma (for example, '/LODlit/Wikidata/n_hits_by_lemma.csv')
    Returns dict {'q1':['lemma']}
    '''
    
    results = {}

    resource_hits = pd.read_csv(path_to_n_hits_by_lemma)
    
    # taking entities with hits > 0
    resource_hits = resource_hits[resource_hits['total_lemma'] > 0]
    
    if 'lang' in resource_hits.columns:
        resource_hits_by_lang = resource_hits.loc[resource_hits['lang'] == lang]
    # for PWN and ODWN
    else:
        resource_hits_by_lang = resource_hits
    
    # getting quantiles values
    q_values = list(resource_hits_by_lang["total_lemma"].quantile([0,0.25,0.5,0.75,1]))
    
    # getting list of lemmas by quartiles
    # quartile 1
    results['q1'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[0]) <= row[1]['total_lemma'] <= int(q_values[1])]
    # quartile 2
    results['q2'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[1]) <= row[1]['total_lemma'] <= int(q_values[2])]
    # quartile 3
    results['q3'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[2]) <= row[1]['total_lemma'] <= int(q_values[3])]
    # quartile 4
    results['q4'] = [row[1]['lemma'] for row in resource_hits_by_lang.iterrows() if int(q_values[3]) <= row[1]['total_lemma'] <= int(q_values[4])]

    return results

In [None]:
def _get_unique_entities_per_quartile(lemma_quartiles:dict, entities_per_lemma:dict) -> dict:
    
    entity_quartiles = {}
    entity_quartiles['q1'] = []
    entity_quartiles['q2'] = []
    entity_quartiles['q3'] = []
    entity_quartiles['q4'] = []
    
    for lemma in lemma_quartiles['q1']:
        entity_quartiles['q1'].extend(entities_per_lemma[lemma])
        
    for lemma in lemma_quartiles['q2']:
        entity_quartiles['q2'].extend(entities_per_lemma[lemma])
        
    for lemma in lemma_quartiles['q3']:
        entity_quartiles['q3'].extend(entities_per_lemma[lemma])
    
    for lemma in lemma_quartiles['q4']:
        entity_quartiles['q4'].extend(entities_per_lemma[lemma])
    
    # take only unique entities
    
    unique_e = []
    for q, entities in entity_quartiles.items():
        unique_per_q = []
        for e in entities:
            if e not in unique_e:
                unique_per_q.append(e)
                unique_e.append(e)
        entity_quartiles[q] = unique_per_q
                
    return entity_quartiles

In [None]:
def _draw_random_entities_per_q(entities_per_q:dict) -> dict:
    
    random_per_q = {}
    
    # check len
    if len(entities_per_q['q1']) < 10:
        add_k = 10 - len(entities_per_q['q1'])
        random_per_q['q1'] = entities_per_q['q1']
    else:
        random_per_q['q1'] = random.sample(entities_per_q['q1'], k=10)
        
    random_per_q['q2'] = random.sample(entities_per_q['q2'], k=10)
    random_per_q['q3'] = random.sample(entities_per_q['q3'], k=10)
    random_per_q['q4'] = random.sample(entities_per_q['q4'], k=10)

    return random_per_q

In [None]:
def _get_resource_properties(resource:str,lang:str) -> dict:
    
    resource_props = {}
    
    if resource == 'wikidata':
        resource_props["path_to_n_hits_by_lemma"] = "/Users/anesterov/reps/LODlit/Wikidata/n_hits_by_lemma.csv"
        resource_props["subset_path"] = f"/Users/anesterov/reps/LODlit/Wikidata/wd_{lang}_subset.json"
        resource_props["entity_id_key"] = ["QID"]
        resource_props["lit_1"] = "prefLabel"
        resource_props["lit_2"] = "aliases"
        resource_props["lit_3"] = "description"
        resource_props["lit_4"] = "instance_of"
        resource_props["lit_5"] = "subclass_of"
        
    if resource == 'aat':
        resource_props["path_to_n_hits_by_lemma"] = "/Users/anesterov/reps/LODlit/AAT/n_hits_by_lemma.csv"
        resource_props["subset_path"] = f"/Users/anesterov/reps/LODlit/AAT/aat_{lang}_subset.json"
        resource_props["entity_id_key"] = ["aat_uri"]
        resource_props["lit_1"] = "prefLabel"
        resource_props["lit_2"] = "altLabel"
        resource_props["lit_3"] = "scopeNote"
        resource_props["lit_4"] = "prefLabel_comment"
        resource_props["lit_5"] = "altLabel_comment"
    
    if resource == 'pwn':
        resource_props["path_to_n_hits_by_lemma"] = "/Users/anesterov/reps/LODlit/PWN/pwn31_hits_by_lemma.csv"
        resource_props["subset_path"] = "/Users/anesterov/reps/LODlit/PWN/pwn_subset.json"
        resource_props["entity_id_key"] = ["synset_id"]
        resource_props["lit_1"] = "lemmata"
        resource_props["lit_2"] = "definition"
        resource_props["lit_3"] = "examples"
        
    if resource == 'odwn':
        resource_props["path_to_n_hits_by_lemma"] = "/Users/anesterov/reps/LODlit/ODWN/odwn_hits_by_lemma.csv"
        resource_props["subset_path"] = "/Users/anesterov/reps/LODlit/ODWN/odwn_subset.json"
        resource_props["entity_id_key"] = ["synset_id","le_id"]
        resource_props["lit_1"] = "le_written_form"
        resource_props["lit_2"] = "sense_definition"
        resource_props["lit_3"] = "sense_examples"
        resource_props["lit_4"] = "synonyms"
        resource_props["lit_5"] = "synset_definitions"
        
    return resource_props

In [None]:
def get_sample_by_resource(resource:str,lang:str):
    '''
    Generating a random sample of entities of a resource:
    lemmas are grouped in quartiles by N of entities, entities are drawn at random in every quartile
    and the hits of the drawn entities are put in a table
    resource: str, 'wikidata', 'aat', 'pwn' or 'odwn'
    lang: str, 'en' or 'nl'
    Returns a pandas DataFrame with columns 'term' (lemma), 'entity_id' and 'text_1', 'text_2', ...
    (3 text columns for pwn, 5 for the other resources); one row per entity id (the first hit is kept)
    '''

    # load the resource props
    resource_props = _get_resource_properties(resource,lang)

    # import query terms
    with open('/Users/anesterov/reps/LODlit/query_terms.json','r',encoding='utf-8') as jf:
        query_terms = json.load(jf)

    # load the subset
    with open(resource_props['subset_path'],'r',encoding='utf-8') as jf:
        subset = json.load(jf)

    # load related match entities
    rm_e = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")
    # selecting rm entities per lemma by lang and resource; rewrite rm_e
    rm_e = rm_e[(rm_e["lang"] == lang) & (rm_e["resource"] == resource)]
    # NOTE(review): rm_e is not used below, so related match entities are NOT excluded from the draw,
    # although the procedure notes of this notebook say they should be -- TODO confirm and implement

    id_keys = resource_props['entity_id_key']

    def _entity_id_of(hit):
        # special condition for ODWN: hits without a synset id are identified by their second id key
        if resource == 'odwn' and hit.get(id_keys[0]) is None:
            return hit[id_keys[1]]
        return hit[id_keys[0]]

    # get unique entities per lemma
    unique_e_per_lemma = {}

    for lemma, terms in query_terms[lang].items():
        e_per_lemma = []
        for query_term, subset_hits in subset.items():
            if query_term in terms:
                e_per_lemma.extend(_entity_id_of(hit) for hit in subset_hits)
        unique_e_per_lemma[lemma] = list(set(e_per_lemma))

    # get lemma quartiles
    lemma_quartiles = _get_lemma_quartiles(lang,resource_props['path_to_n_hits_by_lemma'])
    # divide entities into quartiles
    entities_per_q = _get_unique_entities_per_quartile(lemma_quartiles,unique_e_per_lemma)
    # get 10 random entities per quartile
    random_per_q = _draw_random_entities_per_q(entities_per_q)

    # the lemma of a query term does not depend on the quartile: look it up once per term
    lemma_by_term = {query_term: _get_lemma_by_term(query_term,lang) for query_term in subset}

    # special condition for PWN: it has 3 text fields which have to be present in every hit;
    # the other resources have 5 text fields, a missing field gives an empty value
    is_pwn = resource == 'pwn'
    text_numbers = range(1, 4) if is_pwn else range(1, 6)

    # generate a df sample
    # rows are collected in a list and the df is built once (DataFrame.append was removed in pandas 2.0)
    rows = []

    for q, entities in random_per_q.items():
        for query_term, hits in subset.items():
            lemma = lemma_by_term[query_term]
            for hit in hits:
                entity_id = _entity_id_of(hit)
                if entity_id in entities and lemma in lemma_quartiles[q]:
                    row = {"term":lemma,"entity_id":entity_id}
                    for n in text_numbers:
                        field = resource_props[f"lit_{n}"]
                        row[f"text_{n}"] = hit[field] if is_pwn else hit.get(field)
                    rows.append(row)

    # explicit columns keep the table well-formed (and drop_duplicates working) if nothing was sampled
    columns = ["term","entity_id"] + [f"text_{n}" for n in text_numbers]
    subset_df = pd.DataFrame(rows,columns=columns)

    subset_df = subset_df.drop_duplicates(subset=['entity_id'])

    return subset_df

In [None]:
### Generating sample files

In [None]:
# Draw the ODWN sample for lang 'nl' and save it as csv
# NOTE(review): the draw is random and no seed is set in the cells visible here,
# so re-running this cell presumably overwrites the file with a different sample -- confirm
sample = get_sample_by_resource('odwn','nl')
sample.to_csv("/Users/anesterov/reps/LODlit/samples/odwn.csv")

In [3]:
# Show the lemma quartiles of Wikidata for lang 'en' (the returned dict is displayed as the cell output)
_get_lemma_quartiles("en","/Users/anesterov/reps/LODlit/Wikidata/n_hits_by_lemma.csv")

{'q1': ['allochtoon',
  'baboo',
  'bush negro',
  'coolie',
  'developing nations',
  'footmen',
  'full blood',
  'half-blood',
  'half-breed',
  'hottentot',
  'kaffir',
  'lilliputian',
  'low-income countries',
  'mestizo',
  'mohammedan',
  'mongoloid',
  'mulatto',
  'primitivism',
  'transvestite'],
 'q2': ['barbarian',
  'batavia',
  'berber',
  'caucasian',
  'discover',
  'eskimo',
  'exotic',
  'handicap',
  'headhunter',
  'hermaphrodite',
  'homosexual',
  'inuit',
  'maroon',
  'medicine man',
  'métis',
  'pygmy',
  'retarded',
  'southern rhodesia',
  'third world'],
 'q3': ['aboriginal',
  'bombay',
  'burma',
  'calcutta',
  'caucasian',
  'disabled',
  'ethnicity',
  'first world',
  'gay',
  'gypsy',
  'immigrant',
  'madras',
  'negro',
  'page',
  'primitive',
  'queer',
  'roots',
  'second world',
  'slave'],
 'q4': ['black',
  'colored',
  'descent',
  'dwarf',
  'ethnic groups',
  'homo',
  'indian',
  'indigenous',
  'indo',
  'moor',
  'native',
  'oriental