## Getting background information about query terms from the Words Matter knowledge graph
* – getting Contentious Issues description texts
* – getting suggested terms for contentious terms
* – making files with descriptions and suggestions for every lemma of query terms
* – getting top terms from the description text based on their TF-IDF scores
* – shaping the files with WM bag of words for every lemma
* – this notebook produces the following files:
    * (1) 'CI_description.json'
    * (2) 'suggested_terms_bows.json'
    * (3) 'en_lemmas_wm_info.json'
    * (4) 'nl_lemmas_wm_info.json'
    * (5) 'en_wm_bows.json'
    * (6) 'nl_wm_bows.json'
    * (7) 'en_wm_bows_tf_idf.json'
    * (8) 'nl_wm_bows_tf_idf.json'

In [None]:
import json
import math
from nltk.corpus import stopwords
import simplemma
import re
from nltk.stem import WordNetLemmatizer
import rdflib
from rdflib import Graph
from rdflib.namespace import Namespace
from rdflib.namespace import SKOS, RDF

In [None]:
# Shared NLTK WordNet lemmatiser instance, used for the English lemmatisation steps
wnl = WordNetLemmatizer()

### 1. Collecting WM description texts: querying WM KG

In [None]:
# Setting custom namespaces (passed to the SPARQL queries through initNs)
# culco: vocabulary providing e.g. culco:hasContentiousLabel and culco:suggestedFor
culco = Namespace("https://w3id.org/culco#")
# skosxl: SKOS-XL, needed for skosxl:literalForm
skosxl = Namespace("http://www.w3.org/2008/05/skos-xl#")
# dcterms: Dublin Core terms, needed for dcterms:description
dcterms = Namespace("http://purl.org/dc/terms/")

In [None]:
# change path: machine-specific location of the Words Matter glossary (parsed as Turtle)
path_to_wm = '/Users/anesterov/reps/wordsmatter/glossary.ttl'

In [None]:
# loading the graph: parse the Turtle file at path_to_wm into an rdflib Graph
wm = Graph()
wm.parse(path_to_wm, format="turtle")

In [None]:
# SPARQL to get Contentious Issues descriptions
# For every ?CI with a dcterms:description, selects the description text and a
# comma-separated list of the URIs of its contentious labels.
# NOTE(review): ?descr_text is selected without being aggregated or listed in GROUP BY;
# presumably every CI has exactly one description — confirm against the graph.

descr_text = wm.query(
    """        
    SELECT ?CI ?descr_text (GROUP_CONCAT(?cont_label_uri;SEPARATOR=",") AS ?cont_label_list)

    WHERE {

      ?CI dcterms:description ?descr_text ;
            culco:hasContentiousLabel ?cont_label_uri .
    }
    GROUP BY ?CI
    """,
    
    initNs={'culco': culco, 'dcterms':dcterms}
    
)

In [None]:
# {CI id: {"descr": description text, "cont_labels": [label ids]}}
# ids are the URIs with the Words Matter prefix stripped
wm_prefix = "https://w3id.org/culco/wordsmatter/"
descr_text_dict = {}
for row in descr_text:
    label_ids = [uri.replace(wm_prefix, "") for uri in row.cont_label_list.split(",")]
    ci_id = row.CI.replace(wm_prefix, "")
    descr_text_dict[ci_id] = {"descr": str(row.descr_text), "cont_labels": label_ids}

In [None]:
# exporting file (1): CI descriptions with the ids of their contentious labels
with open('CI_description.json', 'w') as jf:
    json.dump(descr_text_dict, jf)

### 2. Getting suggested terms

In [None]:
# SPARQL to get suggested terms: for every contentious label a suggestion is made for,
# concatenates (space-separated) the literal forms of the suggested labels
suggested = wm.query(
    """        
    SELECT ?cont_label (GROUP_CONCAT(?sug_label_lit;SEPARATOR=" ") AS ?sug_label_lit_list)

    WHERE {

      ?Suggestion culco:suggestedFor ?cont_label ;
                  culco:hasSuggestedLabel ?sug_label .
                  
      ?sug_label skosxl:literalForm ?sug_label_lit .
    }
    GROUP BY ?cont_label
    """,
    
    initNs={'culco': culco, 'skosxl':skosxl}
    
)

In [None]:
# Map every contentious label id (URI without the Words Matter prefix)
# to the space-separated string of its suggested terms
suggested_labels = {
    row.cont_label.replace("https://w3id.org/culco/wordsmatter/", ""): str(row.sug_label_lit_list)
    for row in suggested
}

In [None]:
# Turn every suggestion string into a bag of unique, lower-cased tokens.
# The Dutch and English stop-word lists are loaded once into a set instead of
# calling stopwords.words() twice for every single token.
wm_stop_words = set(stopwords.words('dutch')) | set(stopwords.words('english'))

for label_uri, sug in suggested_labels.items():
    # hyphens become spaces; brackets and soft hyphens (\xad) are removed
    sug_list = sug.lower().replace("-"," ").replace("(","").replace(")","").replace("\xad","").split(" ")
    # empty strings (left by consecutive spaces) are dropped together with the stop-words
    no_stop_words = {s for s in sug_list if s and s not in wm_stop_words}
    # sorted so that the exported file is identical between runs
    suggested_labels[label_uri] = sorted(no_stop_words)

In [None]:
# exporting
# file (2): bag of suggested terms per contentious label id
with open('suggested_terms_bows.json', 'w') as jf:
    json.dump(suggested_labels, jf)

### 3. Generating files with WM info per lemma

In [None]:
# CI with labels and descriptions
# NOTE(review): read from an absolute, machine-specific path, whereas 'CI_description.json'
# is written relative to the working directory — confirm both refer to the same file
with open("/Users/anesterov/reps/LODlit/CI_description.json",'r') as jf:
    wm_descr = json.load(jf)

In [None]:
# suggestions: bag of suggested terms per contentious label id
# NOTE(review): absolute, machine-specific path — confirm it is the file exported as 'suggested_terms_bows.json'
with open("/Users/anesterov/reps/LODlit/suggested_terms_bows.json",'r') as jf:
    wm_suggestions = json.load(jf)

In [None]:
# importing lemmas with label URIs
# both files are iterated further on as mappings {lemma: [label URIs]}
with open('/Users/anesterov/reps/LODlit/en_lemmas_with_label_uris.json','r') as jf:
    en_lemmas_with_label_uris = json.load(jf)

with open('/Users/anesterov/reps/LODlit/nl_lemmas_with_label_uris.json','r') as jf:
    nl_lemmas_with_label_uris = json.load(jf)

#### EN: file with lemmas, their corresponding labels, WM text, suggestions

In [None]:
# Shape per lemma: {'wm_text': [description texts], 'label_uris': [label URIs]}

en_lemmas_wm_text = {}

for lemma, label_uris in en_lemmas_with_label_uris.items():
    # descriptions of every CI that lists one of the lemma's labels as contentious
    matching_descr = [
        ci_info["descr"]
        for label in label_uris
        for ci_info in wm_descr.values()
        if label in ci_info["cont_labels"]
    ]
    en_lemmas_wm_text[lemma] = {
        "wm_text": list(set(matching_descr)),  # unique description texts only
        "label_uris": label_uris,
    }

In [None]:
# adding suggestions: collect the suggested terms of all label URIs of a lemma

for wm_entry in en_lemmas_wm_text.values():
    per_label = (wm_suggestions.get(uri) for uri in wm_entry["label_uris"])
    # labels without an entry in wm_suggestions contribute nothing
    wm_entry["suggestions"] = [
        term
        for terms in per_label
        if terms is not None
        for term in terms
    ]

In [None]:
# exporting file (3): WM description texts, label URIs and suggestions per EN lemma
with open('en_lemmas_wm_info.json', 'w') as json_file:
    json.dump(en_lemmas_wm_text, json_file)

#### NL: file with lemmas, their corresponding labels, WM text, suggestions

In [None]:
# Shape per lemma: {'wm_text': [description texts], 'label_uris': [label URIs]}

nl_lemmas_wm_text = {}

for lemma, label_uris in nl_lemmas_with_label_uris.items():
    # only CIs whose id contains "_nl" (the Dutch ones) are considered
    matching_descr = [
        ci_info["descr"]
        for label in label_uris
        for ci_id, ci_info in wm_descr.items()
        if "_nl" in ci_id and label in ci_info["cont_labels"]
    ]
    nl_lemmas_wm_text[lemma] = {
        "wm_text": list(set(matching_descr)),  # taking only unique description texts
        "label_uris": label_uris,
    }

In [None]:
# adding NL suggestions: collect the suggested terms of all label URIs of a lemma
for wm_entry in nl_lemmas_wm_text.values():
    per_label = (wm_suggestions.get(uri) for uri in wm_entry["label_uris"])
    # labels without an entry in wm_suggestions contribute nothing
    wm_entry["suggestions"] = [
        term
        for terms in per_label
        if terms is not None
        for term in terms
    ]

In [None]:
# exporting file (4): WM description texts, label URIs and suggestions per NL lemma
with open('nl_lemmas_wm_info.json', 'w') as jf:
    json.dump(nl_lemmas_wm_text, jf)

In [None]:
#### tokenise, lower-case, remove non-word characters, lemmatise

In [None]:
def make_bow(text:list, lang:str, merge_bows=False) -> list:
    '''
    Makes a BoW from a list of str:
    replaces non-word characters (incl. punctuation) and digits with spaces,
    tokenises (split by space), removes tokens with fewer than 3 characters,
    removes stop-words (nltk), lowercases,
    lemmatises (NLTK lemmatiser for EN; simplemma for NL);
    text: a list of str to make BoWs from
    lang: str, language of strings, 'en' or 'nl'
    merge_bows: bool, if there are multiple texts, merge them in one BoW (True) or not (False), default False
    Returns a list with one BoW (list of str) per text,
    or one flat list of str if merge_bows is True
    Raises ValueError if lang is neither 'en' nor 'nl'
    '''
    # The stop-word list is loaded once per call, as a set,
    # instead of once per token inside the comprehension
    if lang == 'en':
        stop_words = set(stopwords.words('english'))
    elif lang == 'nl':
        stop_words = set(stopwords.words('dutch'))
    else:
        # previously an unknown language ended in an UnboundLocalError
        raise ValueError(f"Unsupported lang {lang!r}: expected 'en' or 'nl'")

    joint_bow = []

    for t in text:
        no_w_text = re.sub(r'(\W|\d)', ' ', t)

        # the length is checked on the token as it appears in the text;
        # this also discards the empty strings produced by consecutive spaces
        kept_tokens = [token.lower() for token in no_w_text.split(' ')
                       if len(token) > 2 and token.lower() not in stop_words]

        if lang == 'en':
            text_bow_clean = [wnl.lemmatize(token) for token in kept_tokens]
        else:
            # Dutch lemmatizer can output uppercase lemmas, hence the second lower()
            text_bow_clean = [simplemma.lemmatize(token, lang='nl').lower() for token in kept_tokens]

        if merge_bows:
            joint_bow.extend(text_bow_clean)
        else:
            joint_bow.append(text_bow_clean)

    return joint_bow

#### EN: make a file with WM bows

In [None]:
# Add "bow" (one BoW per WM description text) to every EN lemma
for en_info in en_lemmas_wm_text.values():
    en_info["bow"] = list(make_bow(en_info["wm_text"], "en"))

    # suggestions should be lemmatised
    en_info["suggestions"] = [wnl.lemmatize(term) for term in en_info["suggestions"]]

In [None]:
# exporting EN json file (5): lemma info extended with "bow" and lemmatised suggestions
with open('en_wm_bows.json', 'w') as jf:
    json.dump(en_lemmas_wm_text, jf)

#### NL: make a file with WM bows

In [None]:
# Add "bow" (one BoW per WM description text) to every NL lemma
for nl_info in nl_lemmas_wm_text.values():
    nl_info["bow"] = list(make_bow(nl_info["wm_text"], "nl"))

    # suggestions should be lemmatised; the lemmas are lower-cased afterwards
    nl_info["suggestions"] = [
        simplemma.lemmatize(term, lang='nl').lower() for term in nl_info["suggestions"]
    ]

In [None]:
# exporting NL json file (6): lemma info extended with "bow" and lemmatised suggestions
with open('nl_wm_bows.json', 'w') as jf:
    json.dump(nl_lemmas_wm_text, jf)

### 4. Getting WM BoWs with TF-IDF scores

In [None]:
# importing WM text EN
# NOTE(review): absolute, machine-specific path — confirm it is the file exported as 'en_wm_bows.json'
with open("/Users/anesterov/reps/LODlit/en_wm_bows.json","r") as jf:
    en_wm_bows = json.load(jf)

# importing WM text NL
# NOTE(review): absolute, machine-specific path — confirm it is the file exported as 'nl_wm_bows.json'
with open("/Users/anesterov/reps/LODlit/nl_wm_bows.json","r") as jf:
    nl_wm_bows = json.load(jf)

#### TF-IDF functions

In [None]:
def tf(doc:list, token:str) -> float:
    '''
    Calculates term frequency: the share of tokens in doc that are equal to token
    doc: list of str, the tokens of one document (one BoW)
    token: str, a token to get the TF score of
    Returns float; 0.0 if doc is empty
    '''
    # an empty document would otherwise cause a ZeroDivisionError
    if not doc:
        return 0.0

    return doc.count(token) / len(doc)

In [None]:
def idf(token:str, doc_freq:dict, n_docs:int) -> float:
    '''
    Calculates inverse document frequency: log(n_docs / (DF + 1));
        1 is added to DF to avoid zero division
    token: str, a token to get the IDF score of;
        a token missing from doc_freq is treated as having DF 0
    doc_freq: dict, document frequency, in how many documents tokens appear
    n_docs: int, a number of documents
    Returns float; negative if the token appears in all n_docs documents
    '''
    # .get() lets the +1 smoothing cover unseen tokens instead of raising KeyError
    idf_score = math.log(n_docs / (doc_freq.get(token, 0) + 1))

    return idf_score

In [None]:
def get_top_tokens_tfidf(bow:list,doc_freq:dict,n_docs:int,top_n:int=10) -> list:
    '''
    Getting top tokens based on their TF-IDF weighting in one BoW
    Depends on the tf and idf functions
    bow: list of str, tokens in one BoW
    doc_freq: dict, document frequency, in how many documents tokens appear
    n_docs: int, a number of documents
    top_n: int, how many tokens to return at most, default 10
    Returns list: unique tokens of the bow with the highest TF-IDF scores,
        best first; all of them if the bow has fewer than top_n unique tokens;
        tokens with equal scores keep the order of their first occurrence in the bow
    '''
    # dict.fromkeys keeps the first-occurrence order while scoring
    # every distinct token only once
    tf_idf_scores = {
        token: tf(bow, token) * idf(token, doc_freq, n_docs)
        for token in dict.fromkeys(bow)
    }

    # sorted() is stable, also with reverse=True, so ties stay in insertion order
    ranked_tokens = sorted(tf_idf_scores, key=tf_idf_scores.get, reverse=True)

    # slicing already copes with fewer than top_n tokens
    return ranked_tokens[:top_n]

In [None]:
def get_unique_tokens_and_docs(source:dict) -> tuple:
    '''
    Gets unique tokens and documents in a file
    source: dict, every value of which has a "bow" entry holding a list of BoWs (lists of str)
    Prints N of unique tokens
    Returns a tuple, where 0: list of tokens (str), 1: list of documents (list);
        both in the order of first appearance
    '''
    all_docs = []
    all_tokens = []
    # sets mirror the two lists for constant-time membership tests;
    # BoWs are lists, so their tuple form is stored
    seen_docs = set()
    seen_tokens = set()

    for value in source.values():
        # taking only unique bows
        for bow in value["bow"]:
            doc_key = tuple(bow)
            if doc_key in seen_docs:
                continue
            seen_docs.add(doc_key)
            all_docs.append(bow)
            # collecting all unique tokens
            for token in bow:
                if token not in seen_tokens:
                    seen_tokens.add(token)
                    all_tokens.append(token)

    print(f"Unique tokens: {len(all_tokens)}")

    return (all_tokens, all_docs)

#### EN

In [None]:
# list of unique tokens and number of documents in WM EN
# (the call also prints the number of unique tokens)
tokens_docs_en = get_unique_tokens_and_docs(en_wm_bows)

In [None]:
# unique EN tokens, unique EN documents (BoWs) and the number of those documents
all_tokens_en, all_docs_en = tokens_docs_en
n_docs_en = len(all_docs_en)

In [None]:
# Document frequency (DF) of every unique token: the number of EN documents containing it
en_df = {
    token: sum(token in doc for doc in all_docs_en)
    for token in all_tokens_en
}

In [None]:
# Two new bows are added to every entry of 'en_wm_bows':
# "bow_tf_idf": top tokens based on TF-IDF; "bow_joint": those top tokens + suggestions

for wm_entry in en_wm_bows.values():

    # there can be multiple bows for one term: their top tokens are concatenated
    ranked = [
        token
        for doc in wm_entry["bow"]
        for token in get_top_tokens_tfidf(doc, en_df, n_docs_en)
    ]
    wm_entry["bow_tf_idf"] = ranked

    # merging top tokens and suggestions into a separate list
    wm_entry["bow_joint"] = [*ranked, *wm_entry["suggestions"]]

In [None]:
# exporting file (7): EN lemma info extended with "bow_tf_idf" and "bow_joint"
with open('en_wm_bows_tf_idf.json', 'w') as jf:
    json.dump(en_wm_bows, jf)

#### NL

In [None]:
# list of unique tokens and number of documents in WM NL
# (the call also prints the number of unique tokens)
tokens_docs_nl = get_unique_tokens_and_docs(nl_wm_bows)

In [None]:
# unique NL tokens, unique NL documents (BoWs) and the number of those documents
all_tokens_nl, all_docs_nl = tokens_docs_nl
n_docs_nl = len(all_docs_nl)

In [None]:
# Document frequency (DF) of every unique token: the number of NL documents containing it
nl_df = {
    token: sum(token in doc for doc in all_docs_nl)
    for token in all_tokens_nl
}

In [None]:
# Two new bows are added to every entry of 'nl_wm_bows':
# "bow_tf_idf": top tokens based on TF-IDF; "bow_joint": those top tokens + suggestions

for wm_entry in nl_wm_bows.values():

    # there can be multiple bows for one term: their top tokens are concatenated
    ranked = [
        token
        for doc in wm_entry["bow"]
        for token in get_top_tokens_tfidf(doc, nl_df, n_docs_nl)
    ]
    wm_entry["bow_tf_idf"] = ranked

    # merging top tokens and suggestions into a separate list
    wm_entry["bow_joint"] = [*ranked, *wm_entry["suggestions"]]

In [None]:
# exporting file (8): NL lemma info extended with "bow_tf_idf" and "bow_joint"
with open('nl_wm_bows_tf_idf.json', 'w') as jf:
    json.dump(nl_wm_bows, jf)