In [1]:
import json
import csv
import rdflib
from rdflib import Graph

#### Reading the AAT query results

In [2]:
# reading the results EN
# The file is decoded explicitly as UTF-8 so that loading does not depend on
# the platform's default text encoding.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists.
with open('/Users/anesterov/reps/LODlit/AAT/aat_query_results_en.json', 'r', encoding='utf-8') as jf:
    aat_query_results_en = json.load(jf)

In [8]:
# reading the results NL
# The file is decoded explicitly as UTF-8 so that loading does not depend on
# the platform's default text encoding.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists.
with open('/Users/anesterov/reps/LODlit/AAT/aat_query_results_nl.json', 'r', encoding='utf-8') as jf:
    aat_query_results_nl = json.load(jf)

In [13]:
# importing query terms to get lemmas
# The file is decoded explicitly as UTF-8 so that loading does not depend on
# the platform's default text encoding.
# NOTE(review): absolute, machine-specific path -- this cell only runs where
# that file exists.
with open('/Users/anesterov/reps/LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    query_terms = json.load(jf)

#### 1. Hits per query term (unique entities)

In [4]:
# EN: for every query term, the number of distinct 'aat_uri' values among its hits
hits_en = {
    term: len({hit['aat_uri'] for hit in results})
    for term, results in aat_query_results_en.items()
}

In [5]:
# show the mapping of English query terms to their number of unique entities
hits_en

{'batavias': 0,
 'batavia': 1,
 'southern rhodesia': 0,
 'madras': 5,
 'low-income country': 0,
 'low-income countries': 0,
 'headhunters': 0,
 'headhunter': 0,
 'immigrants': 11,
 'immigrant': 2,
 'barbarians': 0,
 'barbarian': 3,
 'moors': 7,
 'moor': 7,
 'indians': 82,
 'indian': 467,
 'homos': 0,
 'homo': 14,
 'exotics': 0,
 'exotic': 17,
 'allochtoons': 0,
 'allochtoon': 0,
 'medicine men': 1,
 'medicine man': 0,
 'third world': 3,
 'servants': 22,
 'servant': 10,
 'hottentots': 2,
 'hottentot': 1,
 'gypsies': 3,
 'gypsys': 0,
 'gypsy': 5,
 'coolies': 0,
 'coolie': 0,
 'whitest': 0,
 'whiter': 3,
 'whites': 5,
 'white': 921,
 'caucasians': 0,
 'caucasian': 19,
 'kaffirs': 0,
 'kaffir': 1,
 'descent': 40,
 'mohammedans': 0,
 'mohammedan': 1,
 'mongoloids': 0,
 'mongoloid': 4,
 'discover': 7,
 'homosexuals': 2,
 'homosexual': 5,
 'baboos': 0,
 'baboo': 0,
 'pygmys': 0,
 'pygmy': 16,
 'pages': 104,
 'page': 77,
 'bush negroes': 0,
 'bush negros': 0,
 'bush negro': 0,
 'second world':

In [9]:
# NL: for every query term, the number of distinct 'aat_uri' values among its hits
hits_nl = {
    term: len({hit['aat_uri'] for hit in results})
    for term, results in aat_query_results_nl.items()
}

In [10]:
# show the mapping of Dutch query terms to their number of unique entities
hits_nl

{'politionele acties': 0,
 'politionele actie': 0,
 'birma': 18,
 'exotische': 9,
 'exotischere': 0,
 'exotischer': 0,
 'exotisch': 2,
 'handicaps': 2,
 'handicap': 5,
 'medicijnmannen': 0,
 'medicijnman': 0,
 'lilliputters': 0,
 'lilliputter': 0,
 "homo's": 0,
 'homo': 8,
 'mongools': 9,
 'mongolen': 12,
 'mongool': 0,
 'japs': 0,
 'jappen': 0,
 'jap': 0,
 'islamieten': 1,
 'islamiet': 0,
 'westersere': 0,
 'westerse': 120,
 'westerser': 0,
 'wester': 0,
 'westerste': 0,
 'westerst': 0,
 'westers': 13,
 'etniciteiten': 0,
 'etniciteit': 13,
 'homoseksualiteit': 3,
 'negers': 0,
 'neger': 0,
 'bedienden': 16,
 'bediendes': 2,
 'bediend': 64,
 'bediende': 15,
 'hermafrodieten': 1,
 'hermafrodiet': 3,
 'inuit': 8,
 'inuk': 0,
 'indiaans': 6,
 'indianen': 61,
 'indiaan': 7,
 'afkomsten': 0,
 'afkomst': 42,
 'blankst': 0,
 'blanke': 24,
 'blankste': 0,
 'blankere': 0,
 'blanker': 0,
 'blanks': 0,
 'blanken': 1,
 'blank': 5,
 'bombay': 4,
 'knechten': 6,
 'knechts': 0,
 'knecht': 2,
 'jappe

In [11]:
# exporting the results in csv
# One row per query term: the term, its language code and its number of
# unique entities; EN rows are written first, then NL rows.
# newline='' leaves line endings to the csv module (without it, platforms
# that translate '\n' to '\r\n' end up with an empty line after every row);
# the encoding is fixed so the output does not depend on the locale.
with open('hits_aat.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['term', 'lang', 'hits'])

    for lang, hits_per_term in (('en', hits_en), ('nl', hits_nl)):
        for query_term, hits in hits_per_term.items():
            writer.writerow([query_term, lang, hits])

#### 2. Where terms are found (occurrences)
– prefLabel, altLabel, scopeNote, prefLabel_comment, altLabel_comment

In [14]:
# EN: per query term, how many hits were found in each kind of field.
# The counts are stored in the order given by found_in_fields.
found_in_fields = ('prefLabel', 'altLabel', 'scopeNote', 'prefLabel_comment', 'altLabel_comment')

aat_where_terms_found_en = {}

for term, results in aat_query_results_en.items():
    # collect the 'found_in' value of every hit, then count each field name
    locations = [hit['found_in'] for hit in results]
    aat_where_terms_found_en[term] = [locations.count(field) for field in found_in_fields]

In [16]:
# NL: per query term, how many hits were found in each kind of field.
# The counts are stored in the order given by found_in_fields.
found_in_fields = ('prefLabel', 'altLabel', 'scopeNote', 'prefLabel_comment', 'altLabel_comment')

aat_where_terms_found_nl = {}

for term, results in aat_query_results_nl.items():
    # collect the 'found_in' value of every hit, then count each field name
    locations = [hit['found_in'] for hit in results]
    aat_where_terms_found_nl[term] = [locations.count(field) for field in found_in_fields]

In [18]:
# exporting a csv of where terms are found
# One row per query term (EN rows first, then NL rows): its lemma, the term,
# the language code, the five per-field counts and their sum.
# newline='' leaves line endings to the csv module (without it, platforms
# that translate '\n' to '\r\n' end up with an empty line after every row);
# the encoding is fixed so the output does not depend on the locale.

with open('aat_where_terms_found.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'term', 'lang', 'aat_prefLabel', 'aat_altLabel', 'aat_scopeNote',
              'aat_prefLabel_comment', 'aat_altLabel_comment', 'total']
    writer.writerow(header)

    for lang, where_found in (('en', aat_where_terms_found_en), ('nl', aat_where_terms_found_nl)):
        for term, stats in where_found.items():
            # Reset the lemma for every term: previously a term that matched no
            # word-form list silently inherited the lemma of the preceding row
            # (or raised NameError on the very first row). Such terms now get
            # an empty lemma cell.
            lemma = ''
            for l, wordforms in query_terms[lang].items():
                if term in wordforms:
                    lemma = l  # as before, the last matching lemma wins
            row = [lemma, term, lang, stats[0], stats[1], stats[2], stats[3], stats[4], sum(stats)]
            writer.writerow(row)

#### Count by query term EN

In [None]:
# Creates (or truncates) the EN count file and writes only its header row.
# NOTE(review): the following cell reopens the same path in 'w' mode and
# writes the header again, so this cell looks redundant.
# newline='' leaves line endings to the csv module, as the csv documentation
# requires for files handed to csv.writer.
with open('aat_count_by_query_term_en.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma','query_term','prefLabel','altLabel','scopeNote','prefLabel_comment','altLabel_comment','total_per_query_term']
    writer.writerow(header)

In [8]:
# Writes, for every EN lemma and each of its forms, how often that query term
# was found in each kind of field, plus the total per query term.
#
# NOTE(review): `query_terms_cont_en` and `aat_en_query_results` are not
# defined in any cell visible in this notebook (the cells above define
# `query_terms` and `aat_query_results_en`); this cell relies on them already
# existing in the kernel.
found_in_fields = ['prefLabel', 'altLabel', 'scopeNote', 'prefLabel_comment', 'altLabel_comment']

# Single pass over all results: occurrences per query term and per field.
# (Previously every query term triggered a full scan of all results.)
# Note that 'found_in' is now read for every result, not only matching ones.
counts_by_query_term = {}
for results in aat_en_query_results.values():
    for result in results:
        field_counts = counts_by_query_term.setdefault(
            result['query_term'], dict.fromkeys(found_in_fields, 0))
        found_in = result['found_in']
        if found_in in field_counts:
            field_counts[found_in] += 1

# newline='' leaves line endings to the csv module (without it, platforms
# that translate '\n' to '\r\n' end up with an empty line after every row).
with open('aat_count_by_query_term_en.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma','query_term','prefLabel','altLabel','scopeNote','prefLabel_comment','altLabel_comment','total_per_query_term']
    writer.writerow(header)

    for key, forms in query_terms_cont_en.items():
        # the lemma itself is reported first, followed by its forms
        for query_term in [key, *forms]:
            field_counts = counts_by_query_term.get(query_term, {})
            row_counts = [field_counts.get(field, 0) for field in found_in_fields]
            writer.writerow([key, query_term, *row_counts, sum(row_counts)])

#### Count by query term NL

In [9]:
# Writes, for every NL lemma and each of its forms, how often that query term
# was found in each kind of field, plus the total per query term.
#
# NOTE(review): `query_terms_cont_nl` and `aat_nl_query_results` are not
# defined in any cell visible in this notebook (the cells above define
# `query_terms` and `aat_query_results_nl`); this cell relies on them already
# existing in the kernel.
found_in_fields = ['prefLabel', 'altLabel', 'scopeNote', 'prefLabel_comment', 'altLabel_comment']

# Single pass over all results: occurrences per query term and per field.
# (Previously every query term triggered a full scan of all results.)
# Note that 'found_in' is now read for every result, not only matching ones.
counts_by_query_term = {}
for results in aat_nl_query_results.values():
    for result in results:
        field_counts = counts_by_query_term.setdefault(
            result['query_term'], dict.fromkeys(found_in_fields, 0))
        found_in = result['found_in']
        if found_in in field_counts:
            field_counts[found_in] += 1

# newline='' leaves line endings to the csv module (without it, platforms
# that translate '\n' to '\r\n' end up with an empty line after every row).
with open('aat_count_by_query_term_nl.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma','query_term','prefLabel','altLabel','scopeNote','prefLabel_comment','altLabel_comment','total_per_query_term']
    writer.writerow(header)

    for key, forms in query_terms_cont_nl.items():
        # the lemma itself is reported first, followed by its forms
        for query_term in [key, *forms]:
            field_counts = counts_by_query_term.get(query_term, {})
            row_counts = [field_counts.get(field, 0) for field in found_in_fields]
            writer.writerow([key, query_term, *row_counts, sum(row_counts)])

#### Counting the overall number of literals

In [None]:
# Create an empty rdflib graph and load the EN subgraph file into it (N3 syntax).
aat_subgraph_en = Graph()
aat_subgraph_en.parse(source="aat/aat_subgraph_en.n3", format="n3")

In [None]:
# Create an empty rdflib graph and load the NL subgraph file into it (N3 syntax).
aat_subgraph_nl = Graph()
aat_subgraph_nl.parse(source="aat/aat_subgraph_nl.n3", format="n3")

In [None]:
# SPARQL query text: counts every solution in which some ?concept reaches a
# value ?lit_form through the path skosxl:prefLabel / skosxl:literalForm.
# NOTE(review): the query declares no PREFIX lines, so the skosxl: prefix
# presumably has to be resolved from namespaces bound on the queried graph --
# confirm.
prefLabel_lit = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:prefLabel / skosxl:literalForm ?lit_form .
}
"""

In [None]:
# SPARQL query text: counts every solution in which some ?concept reaches a
# value ?lit_form through the path skosxl:altLabel / skosxl:literalForm.
# NOTE(review): the query declares no PREFIX lines, so the skosxl: prefix
# presumably has to be resolved from namespaces bound on the queried graph --
# confirm.
altLabel_lit = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:altLabel / skosxl:literalForm ?lit_form .
}
"""

In [None]:
# SPARQL query text: counts every solution in which some ?concept reaches a
# value ?lit_form through the path skosxl:prefLabel / rdfs:comment.
# NOTE(review): the query declares no PREFIX lines, so the skosxl: and rdfs:
# prefixes presumably have to be resolved from namespaces bound on the queried
# graph -- confirm.
prefLabel_comment = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:prefLabel / rdfs:comment ?lit_form .
}
"""

In [None]:
# SPARQL query text: counts every solution in which some ?concept reaches a
# value ?lit_form through the path skosxl:altLabel / rdfs:comment.
# NOTE(review): the query declares no PREFIX lines, so the skosxl: and rdfs:
# prefixes presumably have to be resolved from namespaces bound on the queried
# graph -- confirm.
altLabel_comment = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skosxl:altLabel / rdfs:comment ?lit_form .
}
"""

In [None]:
# SPARQL query text: counts every solution in which some ?concept reaches a
# value ?lit_form through the path skos:scopeNote / rdf:value.
# NOTE(review): the query declares no PREFIX lines, so the skos: and rdf:
# prefixes presumably have to be resolved from namespaces bound on the queried
# graph -- confirm.
scope_note = """
SELECT (COUNT(*) AS ?count)

WHERE {
?concept skos:scopeNote / rdf:value ?lit_form .
}"""

In [None]:
# Run the prefLabel literal-count query on the EN subgraph and keep the first
# entry of the result's bindings.
# NOTE(review): `count` is presumably a binding row (variable -> value) rather
# than a bare number -- confirm against rdflib's query Result API.
count = aat_subgraph_en.query(prefLabel_lit).bindings[0]