In [1]:
import json
import csv
import pandas as pd
import sys

In [2]:
# importing Wikidata module
sys.path.append('/Users/anesterov/reps/LODlit/LODlitParser')
import wd

In [3]:
# Load the English Wikidata search results from disk.
# JSON text is UTF-8 by specification, so the encoding is stated explicitly
# instead of relying on the platform's default text encoding.
with open('/Users/anesterov/wd/jan31/results_clean_en.json','r', encoding='utf-8') as jf:
    results_en = json.load(jf)

In [28]:
def get_hit_info_by_entity_id(entity_id:str,term:str,search_results:dict) -> list:
    '''
    Collect every Wikidata hit stored for one entity under one query term.
    entity_id: str, QID to look for, for example "Q100341056"
    term: str, query term the hit must have been found with
    search_results: dict whose values are lists of hit dicts;
                    every hit dict must contain the keys "QID" and "query_term"
    Returns list of the matching hit dicts (empty list if nothing matches),
    in the order in which they occur in search_results
    '''
    return [
        record
        for records in search_results.values()
        for record in records
        if record["QID"] == entity_id and record["query_term"] == term
    ]

In [10]:
# Example lookup. The function filters on both the QID and the query term,
# so the term ("batavia") has to be passed as the second argument.
get_hit_info_by_entity_id("Q1150578","batavia",results_en)

[{'query_term': 'batavia',
  'lang': 'en',
  'QID': 'Q1150578',
  'prefLabel': '1740 Batavia massacre',
  'aliases': ['Chinezenmoord', 'Geger Pacinan'],
  'description': ['Pogrom of ethnic Chinese in the port city of Batavia (present-day Jakarta) in the Dutch East Indies'],
  'index': 532,
  'found_in': 'prefLabel',
  'instance_of': ['mass murder'],
  'subclass_of': []},
 {'query_term': 'batavia',
  'lang': 'en',
  'QID': 'Q1150578',
  'prefLabel': '1740 Batavia massacre',
  'aliases': ['Chinezenmoord', 'Geger Pacinan'],
  'description': 'Pogrom of ethnic Chinese in the port city of Batavia (present-day Jakarta) in the Dutch East Indies',
  'index': 532,
  'found_in': 'description',
  'instance_of': ['mass murder'],
  'subclass_of': []}]

In [19]:
### RM

In [12]:
# Load the annotated RM table from CSV.
annotated_rm = pd.read_csv("/Users/anesterov/reps/LODlit/Wikidata/annotated/wd_en_rm.csv")

In [14]:
# Drop the free-text columns, which are not needed further on.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second run no longer fails on the already removed columns.
annotated_rm = annotated_rm.drop(columns=['text_1','text_2','text_3','text_4','text_5'], errors="ignore")

In [17]:
# taking only True values in check
# keep only the rows whose "check" column is True
wd_en_rm_true = annotated_rm.loc[annotated_rm["check"]]

In [18]:
# Preview the first rows of the RM entries that passed the check.
wd_en_rm_true.head()

Unnamed: 0,term,entity_id,check
0,aboriginal,Q103817,True
1,aboriginal,Q96200400,True
2,aboriginal,Q7980672,True
3,aboriginal,Q28942344,True
4,aboriginal,Q8039318,True


In [20]:
### WM

In [21]:
# Load the annotated WM table from CSV.
annotated_wm = pd.read_csv("/Users/anesterov/reps/LODlit/Wikidata/annotated/wd_en_wm.csv")

In [23]:
# Drop the free-text columns, which are not needed further on.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell re-runnable: a second run no longer fails on the already removed columns.
annotated_wm = annotated_wm.drop(columns=['text_1','text_2','text_3','text_4','text_5'], errors="ignore")

In [24]:
# taking only True values in check
# keep only the rows whose "check" column is True
wd_en_wm_true = annotated_wm.loc[annotated_wm["check"]]

In [26]:
# Display the WM entries that passed the check.
wd_en_wm_true

Unnamed: 0,term,entity_id,check
0,aboriginal,Q1493806,True
1,aboriginal,Q113211688,True
2,aboriginal,Q3699302,True
3,aboriginal,Q115959477,True
4,aboriginal,Q6922014,True
...,...,...,...
570,white,Q6880109,True
572,white,Q639836,True
575,whites,Q59937358,True
576,whites,Q26597222,True


In [27]:
### Generate 2 files with annotated entities: (1) by RM and (2) by WM

In [32]:
#### RM

In [29]:
# Map every checked RM term to the search hits of all entities annotated for it.
annotated_wd_en_rm_info = {}

for term, term_rows in wd_en_rm_true.groupby("term"):
    term_hits = []

    # one lookup per annotated entity; hits are kept in annotation order
    for qid in term_rows["entity_id"]:
        term_hits += get_hit_info_by_entity_id(qid, term, results_en)

    annotated_wd_en_rm_info[term] = term_hits

In [31]:
# Save the RM hits (term -> list of hit dicts) as JSON.
# The dict is named annotated_wd_en_rm_info; the previous spelling
# "annotated_wm_en_rm_info" is not defined anywhere and raised a NameError.
with open('/Users/anesterov/reps/LODlit/Wikidata/annotated/hits_rm.json', 'w') as jf:
    json.dump(annotated_wd_en_rm_info, jf)

In [33]:
#### WM

In [34]:
# Map every checked WM term to the search hits of all entities annotated for it.
annotated_wd_en_wm_info = {}

for term, term_rows in wd_en_wm_true.groupby("term"):
    term_hits = []

    # one lookup per annotated entity; hits are kept in annotation order
    for qid in term_rows["entity_id"]:
        term_hits += get_hit_info_by_entity_id(qid, term, results_en)

    annotated_wd_en_wm_info[term] = term_hits

In [35]:
# Save the WM hits (term -> list of hit dicts) as JSON.
with open('/Users/anesterov/reps/LODlit/Wikidata/annotated/hits_wm.json', 'w') as out_file:
    json.dump(annotated_wd_en_wm_info, fp=out_file)

In [36]:
### Generate an overview by properties

In [37]:
#### RM

In [39]:
# Build the per-lemma overview from the RM hits file (the counting itself is
# implemented in the wd module) and store the result as CSV.
n_hits_by_lemma = wd.get_n_hits_by_properties(
    '/Users/anesterov/reps/LODlit/Wikidata/annotated/hits_rm.json',
    'en',
    group_by_lemma=True,
)
n_hits_by_lemma.to_csv(
    "/Users/anesterov/reps/LODlit/Wikidata/annotated/n_hits_by_lemma_en_rm.csv"
)

In [40]:
#### WM
# Build the per-lemma overview from the WM hits file (the counting itself is
# implemented in the wd module) and store the result as CSV.
n_hits_by_lemma_wm = wd.get_n_hits_by_properties(
    '/Users/anesterov/reps/LODlit/Wikidata/annotated/hits_wm.json',
    'en',
    group_by_lemma=True,
)
n_hits_by_lemma_wm.to_csv(
    "/Users/anesterov/reps/LODlit/Wikidata/annotated/n_hits_by_lemma_en_wm.csv"
)

In [41]:
### Bar charts

In [42]:
import plotly.graph_objects as go
from IPython.display import Image

In [49]:
# Order the lemmas by their total number of hits, smallest first.
# Reassignment replaces inplace=True so the frame is not mutated in place.
n_hits_by_lemma = n_hits_by_lemma.sort_values("wd_lemma_total", ascending=True)

In [51]:
# Load the per-lemma hit counts for all English results.
# NOTE(review): the preview further down shows an "Unnamed: 0" column, which
# suggests the CSV was written with its index - confirm, and consider index_col=0.
all_hits_by_lemma = pd.read_csv("/Users/anesterov/reps/LODlit/Wikidata/n_hits_by_lemma_en.csv")

In [53]:
# Order the lemmas by their total number of hits, smallest first.
# Reassignment replaces inplace=True so the frame is not mutated in place.
all_hits_by_lemma = all_hits_by_lemma.sort_values("wd_lemma_total", ascending=True)

In [150]:
# Load the per-lemma hit counts for all Dutch results.
# NOTE(review): the table displayed further down shows an "Unnamed: 0" column, which
# suggests the CSV was written with its index - confirm, and consider index_col=0.
all_hits_by_lemma_nl = pd.read_csv("/Users/anesterov/reps/LODlit/Wikidata/n_hits_by_lemma_nl.csv")

In [151]:
# Order the lemmas by their total number of hits, smallest first.
# Reassignment replaces inplace=True so the frame is not mutated in place.
all_hits_by_lemma_nl = all_hits_by_lemma_nl.sort_values("wd_lemma_total", ascending=True)

### Making proportions

In [155]:
# Preview the first rows of the English per-lemma counts (sorted by total hits).
all_hits_by_lemma.head()

Unnamed: 0.1,Unnamed: 0,lemma,lang,wd_prefLabel,wd_aliases,wd_descr,wd_lemma_total
1,1,allochtoon,en,1,0,0,1
9,9,bush negro,en,0,0,1,1
25,25,full blood,en,2,5,1,8
42,42,lilliputian,en,8,2,1,11
43,43,low-income countries,en,12,2,0,14


In [177]:
# Add one percentage column per property: hits in that property / total hits * 100.
all_hits_by_lemma_nl_p = all_hits_by_lemma_nl.assign(**{
    f"{count_col}_p": all_hits_by_lemma_nl[count_col] / all_hits_by_lemma_nl["wd_lemma_total"] * 100
    for count_col in ("wd_prefLabel", "wd_aliases", "wd_descr")
})

In [178]:
# Display the Dutch per-lemma counts together with the percentage columns.
all_hits_by_lemma_nl_p

Unnamed: 0.1,Unnamed: 0,lemma,lang,wd_prefLabel,wd_aliases,wd_descr,wd_lemma_total,wd_prefLabel_p,wd_aliases_p,wd_descr_p
17,17,eerste wereld,nl,1,0,0,1,100.000000,0.000000,0.000000
1,1,achterlijk,nl,1,0,0,1,100.000000,0.000000,0.000000
47,47,lagelonenland,nl,1,0,0,1,100.000000,0.000000,0.000000
32,32,inboorling,nl,1,0,0,1,100.000000,0.000000,0.000000
12,12,boslandcreool,nl,0,1,0,1,0.000000,100.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
81,81,zwart,nl,1057,236,231,1524,69.356955,15.485564,15.157480
28,28,homo,nl,83,28,3123,3234,2.566481,0.865801,96.567718
33,33,indiaan,nl,32,8,9183,9223,0.346959,0.086740,99.566302
20,20,etnische groep,nl,7,1,9241,9249,0.075684,0.010812,99.913504


In [179]:
# Horizontal stacked bar chart for the Dutch lemmas: the bar segments are the
# percentages per property, the hover text carries the absolute counts.
lemmas = all_hits_by_lemma_nl_p['lemma'].tolist()
pref = all_hits_by_lemma_nl_p['wd_prefLabel_p'].tolist()
alias = all_hits_by_lemma_nl_p['wd_aliases_p'].tolist()
descr = all_hits_by_lemma_nl_p['wd_descr_p'].tolist()
text_marker = all_hits_by_lemma_nl_p['wd_lemma_total'].tolist()

bar_chart = go.Figure()

# segment 1: share of hits found in prefLabel
bar_chart.add_trace(go.Bar(
    name="prefLabel",
    x=pref,
    y=lemmas,
    hovertext=all_hits_by_lemma_nl_p['wd_prefLabel'].tolist(),
    orientation='h',
    marker_color='rgb(0,0,0)',
))

# segment 2: share of hits found in aliases
bar_chart.add_trace(go.Bar(
    name="aliases",
    x=alias,
    y=lemmas,
    hovertext=all_hits_by_lemma_nl_p['wd_aliases'].tolist(),
    orientation='h',
    marker_color='rgb(128, 128, 128)',
))

# segment 3: share of hits found in descriptions; this trace also carries
# the total number of hits per lemma as its text label
bar_chart.add_trace(go.Bar(
    name="description",
    x=descr,
    y=lemmas,
    hovertext=all_hits_by_lemma_nl_p['wd_descr'].tolist(),
    orientation='h',
    marker_color='rgb(224, 224, 224)',
    text=text_marker,
    textposition='outside',
))

bar_chart.update_layout(
    barmode='stack',
    width=590,
    height=1230,
    bargap=0.2,
    paper_bgcolor='rgb(255, 255, 255)',
    plot_bgcolor='rgb(255, 255, 255)',
    title=dict(
        text="Figure 4. NL, Lemmas by properties, all results, proportion",
        x=0,
        y=1,
        yanchor="top",
        yref="paper",
        pad=dict(t=-55),
        font=dict(size=14),
    ),
    margin={"l": 0, "r": 0, "b": 0, "t": 60, "pad": 1},
    font={"family": "Arial", "size": 8, "color": "black"},
    legend={
        "traceorder": "normal",
        "orientation": "h",
        "itemsizing": 'constant',
        "itemwidth": 30,
        "yanchor": "top",
        "y": 1.03,
        "xanchor": "left",
        "x": -0.03,
        "font": {"family": "Arial", "size": 8, "color": "black"},
    },
)

# thin black outline around every bar segment; hover shows only the hovertext
bar_chart.update_traces(
    selector={"type": "bar"},
    marker={"line": {"width": 0.5, "color": "black"}},
    hoverinfo='text',
    textfont_size=8,
)

# lemma names stay visible on the y axis; the x axis is hidden
bar_chart.update_yaxes(showgrid=False, visible=True)
bar_chart.update_xaxes(showgrid=False, visible=False)

bar_chart.show()

In [180]:
# Export the chart to an HTML file in the current working directory;
# include_plotlyjs="cdn" makes the page load plotly.js from a CDN.
bar_chart.write_html('all_hits_by_lemma_perc_nl.html',include_plotlyjs="cdn")