In [2]:
import pandas as pd
import json
import csv
import statistics

In [2]:
# EN

In [3]:
# Loads the English AAT score table (aat_en_cs.csv) into a DataFrame.
# NOTE(review): `aat_en` is not referenced by any later cell visible here;
# get_mean_cs reads the same file on its own — confirm whether this cell is still needed.
aat_en = pd.read_csv("/Users/anesterov/reps/LODlit/aat_en_cs.csv")

In [15]:
def get_mean_cs(resource:str,lang:str,data_dir:str="/Users/anesterov/reps/LODlit",out_dir:str=".") -> None:
    '''
    Generates a csv file with mean CS scores per term
    resource: str, "wikidata", "aat", "pwn", "odwn"
    lang: str, "en" or "nl"; selects the input file for "aat" and "wikidata"
          and is always part of the output file name
    data_dir: str, folder that holds the *_cs.csv score files
    out_dir: str, folder where {resource}_{lang}_mean_cs.csv is written
             (default: current working directory)
    Returns None; the output file has the columns term, mean_rm, mean_wm
    Missing scores (NaN) are left out of the mean; a term without any score gets NaN
    Raises ValueError if resource is not one of the four supported names
    '''

    # the pwn and odwn score files carry no language suffix in their names
    input_files = {
        "aat": f"aat_{lang}_cs.csv",
        "wikidata": f"wikidata_{lang}_cs.csv",
        "pwn": "pwn_cs.csv",
        "odwn": "odwn_cs.csv",
    }

    # failing early, before the output file is created or truncated
    if resource not in input_files:
        raise ValueError(f"Unknown resource '{resource}'; expected one of {sorted(input_files)}")

    cs_scores = pd.read_csv(f"{data_dir}/{input_files[resource]}")

    def _mean_or_nan(scores):
        # pandas represents empty cells as NaN (not None), so they are dropped with dropna();
        # statistics.mean raises on an empty list, hence the explicit NaN fallback
        present = scores.dropna().tolist()
        return statistics.mean(present) if present else float("nan")

    # newline='' is what the csv module expects; utf-8 matches the pd.read_csv default used to read the file back
    with open(f"{out_dir}/{resource}_{lang}_mean_cs.csv",'w',newline='',encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["term", "mean_rm", "mean_wm"])

        for term, rows in cs_scores.groupby("term"):
            writer.writerow([term, _mean_or_nan(rows["cs_rm"]), _mean_or_nan(rows["cs_wm"])])

In [21]:
# Writes odwn_nl_mean_cs.csv into the current working directory.
# NOTE(review): only this resource/lang pair is recorded in the notebook; the other
# *_mean_cs.csv files read by get_mean_cs_of_lemma were presumably produced by
# re-running this cell with different arguments — confirm.
get_mean_cs("odwn","nl")

In [1]:
# Mean CS of a term

In [None]:
# Mean CS of a lemma

# WD EN, ethnic groups: 0.75
# WD EN, black
# WD EN, white

# WD NL, b
# WD NL, zwart
# WD NL, wit


In [6]:
# Parsed query-term files, keyed by path, so repeated lookups do not re-read the JSON.
# Re-running this cell resets the cache (needed if the file changes during a session).
_QUERY_TERMS_CACHE = {}

def get_lemma_by_term(query_term:str, lang:str, query_terms_path:str='/Users/anesterov/reps/LODlit/query_terms.json') -> str:
    '''
    Getting a lemma of a query term
    query_term: str, the term to look up
    lang: str, 'en' or 'nl'
    query_terms_path: str, path to the json file with query terms grouped by language and lemma
                      (TODO from the original code: change the default path to GitHub)
    Returns str, 'not found' if lemma was not found
    If the term is listed under several lemmas, the last one in file order is returned
    Raises KeyError if lang is not a key of the query terms file
    '''

    # importing query terms with lemmas (only once per path)
    if query_terms_path not in _QUERY_TERMS_CACHE:
        with open(query_terms_path,'r',encoding='utf-8') as jf:
            _QUERY_TERMS_CACHE[query_terms_path] = json.load(jf)
    query_terms = _QUERY_TERMS_CACHE[query_terms_path]

    return_lemma = 'not found'

    # no break on purpose: keeps the original behaviour of the last matching lemma winning
    for lemma, qt in query_terms[lang].items():
        if query_term in qt:
            return_lemma = lemma

    return return_lemma

In [63]:
def get_mean_cs_of_lemma(resource:str,lemma:str,lang:str,data_dir:str="/Users/anesterov/reps/LODlit") -> tuple:
    '''
    Getting the mean CS score of a lemma, averaged over the mean_rm scores of its query terms
    resource: str, used together with lang to pick {resource}_{lang}_mean_cs.csv
    lemma: str, lemma as returned by get_lemma_by_term
    lang: str, 'en' or 'nl'
    data_dir: str, folder that holds the *_mean_cs.csv files
    Returns tuple (lemma, mean score rounded to 2 decimals)
    Only positive scores are averaged; NaN, zero and negative values are left out
    Raises ValueError if no term of the file belongs to the lemma
    or if none of its terms has a positive mean_rm score
    '''

    mean_cs_by_resource = pd.read_csv(f"{data_dir}/{resource}_{lang}_mean_cs.csv")

    # mapping every term to its lemma and keeping only the rows of the requested lemma
    lemmas = mean_cs_by_resource["term"].map(lambda term: get_lemma_by_term(term, lang))
    scores = mean_cs_by_resource.loc[lemmas == lemma, "mean_rm"]
    if scores.empty:
        raise ValueError(f"No terms of lemma '{lemma}' found in {resource}_{lang}_mean_cs.csv")

    # NaN > 0 is False, so this drops missing scores together with zero and negative ones
    scores_by_term = scores[scores > 0].tolist()
    if not scores_by_term:
        raise ValueError(f"Lemma '{lemma}' has no positive mean_rm scores in {resource}_{lang}_mean_cs.csv")

    return (lemma, round(statistics.mean(scores_by_term),2))

In [None]:
# Wikidata
# Each cell below calls get_mean_cs_of_lemma for one lemma; the tuple under the call is the
# recorded result: (lemma, mean of the positive per-term mean_rm scores, rounded to 2 decimals).
# NOTE(review): the execution counts are out of order (34, 64, 21, 65, ...), so these outputs
# were produced at different points of the session — re-run top to bottom to confirm them.

In [34]:
get_mean_cs_of_lemma('wikidata','black','en')

('black', 0.64)

In [64]:
get_mean_cs_of_lemma('wikidata','zwart','nl')

('zwart', 0.59)

In [21]:
get_mean_cs_of_lemma('wikidata','white','en')

('white', 0.56)

In [65]:
get_mean_cs_of_lemma('wikidata','wit','nl')

('wit', 0.46)

In [23]:
get_mean_cs_of_lemma('wikidata','ethnic groups','en')

('ethnic groups', 0.75)

In [24]:
get_mean_cs_of_lemma('wikidata','indigenous','en')

('indigenous', 0.76)

In [66]:
get_mean_cs_of_lemma('wikidata','etnische groep','nl')

('etnische groep', 0.79)

In [67]:
get_mean_cs_of_lemma('wikidata','inheems','nl')

('inheems', 0.73)

In [None]:
# AAT
# Each cell below calls get_mean_cs_of_lemma on the aat_{lang}_mean_cs.csv file; the tuple under
# the call is the recorded result: (lemma, mean of the positive per-term mean_rm scores, rounded
# to 2 decimals).
# NOTE(review): execution counts are not strictly sequential (..., 72, 74, 73, 84) — re-run to confirm.

In [68]:
get_mean_cs_of_lemma('aat','black','en')

('black', 0.78)

In [69]:
get_mean_cs_of_lemma('aat','white','en')

('white', 0.76)

In [70]:
get_mean_cs_of_lemma('aat','zwart','nl')

('zwart', 0.68)

In [71]:
get_mean_cs_of_lemma('aat','wit','nl')

('wit', 0.66)

In [72]:
get_mean_cs_of_lemma('aat','ethnic groups','en')

('ethnic groups', 0.81)

In [74]:
get_mean_cs_of_lemma('aat','indigenous','en')

('indigenous', 0.81)

In [73]:
get_mean_cs_of_lemma('aat','etnische groep','nl')

('etnische groep', 0.87)

In [84]:
get_mean_cs_of_lemma('aat','inheems','nl')

('inheems', 0.66)

In [None]:
# PWN
# Each cell below calls get_mean_cs_of_lemma on pwn_en_mean_cs.csv (only 'en' lemmas are queried
# here); the tuple under the call is the recorded result: (lemma, mean of the positive per-term
# mean_rm scores, rounded to 2 decimals).
# NOTE(review): execution counts are out of order (76, 77, 75, 78) — re-run to confirm.

In [76]:
get_mean_cs_of_lemma('pwn','black','en')

('black', 0.69)

In [77]:
get_mean_cs_of_lemma('pwn','white','en')

('white', 0.6)

In [75]:
get_mean_cs_of_lemma('pwn','indigenous','en')

('indigenous', 0.82)

In [78]:
get_mean_cs_of_lemma('pwn','ethnic groups','en')

('ethnic groups', 0.8)

In [79]:
# ODWN
# Each cell below calls get_mean_cs_of_lemma on odwn_nl_mean_cs.csv (only 'nl' lemmas are queried
# here); the tuple under the call is the recorded result: (lemma, mean of the positive per-term
# mean_rm scores, rounded to 2 decimals).

In [80]:
get_mean_cs_of_lemma('odwn','zwart','nl')

('zwart', 0.66)

In [81]:
get_mean_cs_of_lemma('odwn','wit','nl')

('wit', 0.5)

In [82]:
get_mean_cs_of_lemma('odwn','etnische groep','nl')

('etnische groep', 0.84)

In [83]:
get_mean_cs_of_lemma('odwn','inheems','nl')

('inheems', 0.57)