In [None]:
import json
import csv
import pandas as pd
import rdflib
from rdflib import Graph

In [None]:
# Load the English (EN) AAT query results from their JSON file
with open('aat_query_results_en.json', mode='r') as en_results_file:
    aat_query_results_en = json.load(en_results_file)

In [None]:
# Load the Dutch (NL) AAT query results from their JSON file
with open('aat_query_results_nl.json', mode='r') as nl_results_file:
    aat_query_results_nl = json.load(nl_results_file)

In [None]:
# Load the query terms; later cells index this by language ('en' / 'nl')
# and use it to map every query term (wordform) back to its lemma
with open('/LODlit/query_terms.json', mode='r') as query_terms_file:
    query_terms = json.load(query_terms_file)

### 1. N entities by query term

In [None]:
# EN: number of distinct AAT entities (unique 'aat_uri' values) per query term
n_entities_en = {
    term: len({hit['aat_uri'] for hit in hits})
    for term, hits in aat_query_results_en.items()
}

In [None]:
# NL: number of distinct AAT entities (unique 'aat_uri' values) per query term
n_entities_nl = {
    term: len({hit['aat_uri'] for hit in hits})
    for term, hits in aat_query_results_nl.items()
}

In [None]:
# Export the per-term entity counts (EN rows first, then NL) to one CSV file.
# newline='' is what the csv module expects from its file object (without it,
# Windows output gets an extra blank line after every row). UTF-8 is set
# explicitly because the file is read back with pd.read_csv, whose default
# encoding is UTF-8.
with open('n_entities_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'term', 'lang', 'n_entities']
    writer.writerow(header)

    for lang, n_entities in (('en', n_entities_en), ('nl', n_entities_nl)):
        for query_term, e in n_entities.items():
            # Getting the lemma of the query term. It is reset for every term so
            # that a term missing from query_terms gets an empty lemma instead of
            # silently reusing the lemma found for the previous row.
            lemma = ''
            for candidate, wordforms in query_terms[lang].items():
                if query_term in wordforms:
                    # no break: if several lemmas list the term, the last one wins
                    lemma = candidate
            writer.writerow([lemma, query_term, lang, e])

### 2. N entities by lemma

In [None]:
# Read the per-term entity counts back from CSV
df = pd.read_csv('n_entities_by_term.csv')
# lemmas are not unique across the 2 languages, so make separate dfs by lang
is_en = df['lang'] == 'en'
is_nl = df['lang'] == 'nl'
en_df = df[is_en]
nl_df = df[is_nl]

In [None]:
# Aggregate the per-term entity counts by lemma and export them as CSV.
# newline='' follows the csv module's requirement for its file object (avoids
# blank rows on Windows); UTF-8 keeps the output independent of the platform's
# default encoding.
# NOTE(review): this adds up per-term counts, so an entity returned for several
# wordforms of the same lemma is presumably counted once per wordform — confirm
# that this is the intended meaning of n_entities per lemma.
with open('n_entities_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'lang', 'n_entities']
    writer.writerow(header)

    # grouped per language because lemmas are not unique across languages
    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        for lemma, lemma_rows in lang_df.groupby('lemma'):
            writer.writerow([lemma, lang, lemma_rows['n_entities'].sum()])

### 3. N hits (occurrences) by query term, broken down by where the term is found
– prefLabel, altLabel, scopeNote, prefLabel_comment, altLabel_comment

In [None]:
# EN: for every query term, count its hits per 'found_in' location.
# Each value is the list
# [prefLabel, altLabel, scopeNote, prefLabel_comment, altLabel_comment];
# hits whose 'found_in' is none of these five values are not counted.
aat_where_terms_found_en = {
    term: [
        sum(1 for hit in hits if hit['found_in'] == location)
        for location in ('prefLabel', 'altLabel', 'scopeNote',
                         'prefLabel_comment', 'altLabel_comment')
    ]
    for term, hits in aat_query_results_en.items()
}

In [None]:
# NL: for every query term, count its hits per 'found_in' location.
# Each value is the list
# [prefLabel, altLabel, scopeNote, prefLabel_comment, altLabel_comment];
# hits whose 'found_in' is none of these five values are not counted.
aat_where_terms_found_nl = {
    term: [
        sum(1 for hit in hits if hit['found_in'] == location)
        for location in ('prefLabel', 'altLabel', 'scopeNote',
                         'prefLabel_comment', 'altLabel_comment')
    ]
    for term, hits in aat_query_results_nl.items()
}

In [None]:
# Export, per query term, the number of hits in each location plus their total.
# newline='' is what the csv module expects from its file object (without it,
# Windows output gets an extra blank line after every row). UTF-8 is set
# explicitly because the file is read back with pd.read_csv, whose default
# encoding is UTF-8.
with open('n_hits_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'term', 'lang', 'aat_prefLabel', 'aat_altLabel', 'aat_scopeNote',
              'aat_prefLabel_comment', 'aat_altLabel_comment', 'total']
    writer.writerow(header)

    for lang, where_found in (('en', aat_where_terms_found_en),
                              ('nl', aat_where_terms_found_nl)):
        for term, stats in where_found.items():
            # The lemma is reset for every term so that a term missing from
            # query_terms gets an empty lemma instead of silently reusing the
            # lemma found for the previous row.
            lemma = ''
            for candidate, wordforms in query_terms[lang].items():
                if term in wordforms:
                    # no break: if several lemmas list the term, the last one wins
                    lemma = candidate
            # stats holds the five per-location counts, in header order
            writer.writerow([lemma, term, lang, *stats, sum(stats)])

### 4. N hits (occurrences) by lemma

In [None]:
# Read the per-term hit counts back from CSV
df = pd.read_csv('n_hits_by_term.csv')
# lemmas are not unique across the 2 languages, so make separate dfs by lang
is_en = df['lang'] == 'en'
is_nl = df['lang'] == 'nl'
en_df = df[is_en]
nl_df = df[is_nl]

In [None]:
# Export the hit counts aggregated by lemma: per-location sums and overall total.
# newline='' follows the csv module's requirement for its file object (avoids
# blank rows on Windows); UTF-8 keeps the output independent of the platform's
# default encoding.
with open('n_hits_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # columns of the per-term frame that are summed, in output order
    count_columns = ['aat_prefLabel', 'aat_altLabel', 'aat_scopeNote',
                     'aat_prefLabel_comment', 'aat_altLabel_comment', 'total']
    header = ['lemma', 'lang', 'aat_prefLabel', 'aat_altLabel', 'aat_scopeNote',
              'aat_prefLabel_comment', 'aat_altLabel_comment', 'total_lemma']
    writer.writerow(header)

    # grouped per language because lemmas are not unique across languages
    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        for lemma, lemma_rows in lang_df.groupby('lemma'):
            column_sums = lemma_rows[count_columns].sum()
            writer.writerow([lemma, lang, *column_sums.tolist()])

### Counting general number of literals in subgraphs

In [None]:
# These numbers are put in Table 1
# We used subgraphs in the N3 format (compressed on GitHub in /AAT)

In [None]:
# Parse the English AAT subgraph (N3 serialization) into an rdflib Graph
aat_subgraph_en = Graph()
aat_subgraph_en.parse(source="AAT/aat_subgraph_en.n3", format="n3")

In [None]:
# Parse the Dutch AAT subgraph (N3 serialization) into an rdflib Graph
aat_subgraph_nl = Graph()
aat_subgraph_nl.parse(source="AAT/aat_subgraph_nl.n3", format="n3")

In [None]:
# SPARQL query: counts the matches of the path
# ?concept skosxl:prefLabel / skosxl:literalForm ?lit_form
# (COUNT(*) without DISTINCT), returned as ?count.
# NOTE(review): the query declares no PREFIX; presumably skosxl: is resolved from
# the namespaces bound on the graph the query is run against — confirm.
prefLabel_lit = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:prefLabel / skosxl:literalForm ?lit_form .
}
"""

In [None]:
# SPARQL query: counts the matches of the path
# ?concept skosxl:altLabel / skosxl:literalForm ?lit_form
# (COUNT(*) without DISTINCT), returned as ?count.
# NOTE(review): the query declares no PREFIX; presumably skosxl: is resolved from
# the namespaces bound on the graph the query is run against — confirm.
altLabel_lit = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:altLabel / skosxl:literalForm ?lit_form .
}
"""

In [None]:
# SPARQL query: counts the matches of the path
# ?concept skosxl:prefLabel / rdfs:comment ?lit_form
# (COUNT(*) without DISTINCT), returned as ?count.
# NOTE(review): the query declares no PREFIX; presumably skosxl: and rdfs: are
# resolved from the namespaces bound on the graph the query is run against — confirm.
prefLabel_comment = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:prefLabel / rdfs:comment ?lit_form .
}
"""

In [None]:
# SPARQL query: counts the matches of the path
# ?concept skosxl:altLabel / rdfs:comment ?lit_form
# (COUNT(*) without DISTINCT), returned as ?count.
# NOTE(review): the query declares no PREFIX; presumably skosxl: and rdfs: are
# resolved from the namespaces bound on the graph the query is run against — confirm.
altLabel_comment = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:altLabel / rdfs:comment ?lit_form .
}
"""

In [None]:
# SPARQL query: counts the matches of the path
# ?concept skos:scopeNote / rdf:value ?lit_form
# (COUNT(*) without DISTINCT), returned as ?count.
# NOTE(review): the query declares no PREFIX; presumably skos: and rdf: are
# resolved from the namespaces bound on the graph the query is run against — confirm.
scope_note = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skos:scopeNote / rdf:value ?lit_form .
}"""

In [None]:
# Run the altLabel literal-form count on the NL subgraph; the query yields a
# single result row whose ?count binding is displayed as the cell output
altLabel_nl_result = aat_subgraph_nl.query(altLabel_lit)
altLabel_nl_result.bindings[0]["count"].value