In [1]:
import json
from collections import Counter
import math
from sklearn import preprocessing

In [None]:
# normalising 1D array
#preprocessing.normalize([scores])

#### New RM bows

In [2]:
# importing related matches
# the encoding is given explicitly so that reading does not depend on the platform default
# NOTE(review): the path is absolute and machine-specific; it has to be adapted on other machines
with open("/Users/anesterov/reps/LODlit/bg/rm_bows_all.json","r",encoding="utf-8") as jf:
    rm = json.load(jf)

In [18]:
# importing WM text EN
# the encoding is given explicitly so that reading does not depend on the platform default
# NOTE(review): the path is absolute and machine-specific; it has to be adapted on other machines
with open("/Users/anesterov/reps/LODlit/en_wm_bows.json","r",encoding="utf-8") as jf:
    en_wm_bows = json.load(jf)

In [19]:
# importing WM text NL
# the encoding is given explicitly so that reading does not depend on the platform default
# NOTE(review): the path is absolute and machine-specific; it has to be adapted on other machines
with open("/Users/anesterov/reps/LODlit/nl_wm_bows.json","r",encoding="utf-8") as jf:
    nl_wm_bows = json.load(jf)

In [5]:
# importing query terms
# the encoding is given explicitly so that reading does not depend on the platform default
# NOTE(review): the path is absolute and machine-specific; it has to be adapted on other machines
with open("/Users/anesterov/reps/LODlit/query_terms.json","r",encoding="utf-8") as jf:
    query_terms = json.load(jf)

### TF-IDF functions

In [6]:
def tf(doc:list, token:str) -> float:
    '''
    Calculates term frequency:
        the number of occurrences of a token divided by the length of the document
    doc: list, tokens of one document (BoW)
    token: str, a token to get the TF score of
    Returns float; 0.0 if the document is empty
    '''
    # an empty document contains no tokens at all; returning 0.0 avoids zero division
    if not doc:
        return 0.0

    tf_score = doc.count(token) / len(doc)

    return tf_score

In [7]:
def idf(token:str, doc_freq:dict, n_docs:int) -> float:
    '''
    Calculates inverse document frequency as log(n_docs / (DF + 1)):
        adds 1 to DF to avoid zero division for tokens with DF 0
    token: str, a token to get the IDF score of
    doc_freq: dict, document frequency, in how many documents tokens appear;
        a token missing from the dict is treated as having DF 0
    n_docs: int, a number of documents (must be positive, math.log fails on 0)
    Returns float; the score is 0 when DF + 1 equals n_docs
        and negative when the token appears in every document
    '''
    # .get keeps the smoothing meaningful: an unseen token has DF 0 instead of raising KeyError
    idf_score = math.log(n_docs / (doc_freq.get(token, 0) + 1))

    return idf_score

In [16]:
def get_top_tokens_tfidf(bow:list,doc_freq:dict,n_docs:int,top_n:int=10) -> list:
    '''
    Getting top tokens based on their TF-IDF weighting in one BoW
    Depends on the tf and idf functions
    bow: list of str, tokens in one BoW
    doc_freq: dict, document frequency, in how many documents tokens appear
    n_docs: int, a number of documents
    top_n: int, how many tokens to return at most (10 by default)
    Returns list: Top tokens in a bow by their TF-IDF scores, highest score first;
        tokens with equal scores keep the order of their first appearance in the bow;
        the list is shorter than top_n if the bow has fewer distinct tokens
    '''
    tf_idf_scores = {}

    # dict.fromkeys gives every distinct token once, in first-seen order,
    # so repeated tokens are not scored again and again
    for token in dict.fromkeys(bow):
        tf_idf_scores[token] = tf(bow,token) * idf(token,doc_freq,n_docs)

    # sorted is stable, also with reverse=True, so ties stay in first-seen order
    tokens_scores = sorted(tf_idf_scores.items(), key=lambda x:x[1], reverse=True)

    # slicing copes with fewer than top_n tokens; max() guards against a negative top_n
    top_tokens = [t[0] for t in tokens_scores[:max(top_n, 0)]]

    return top_tokens

In [37]:
def get_unique_tokens_and_docs(source:dict) -> tuple:
    '''
    Gets unique tokens and unique documents (BoWs) in a file
    source: dict, every value is a dict with the key "bow" holding a list of BoWs (lists of tokens)
    Tokens have to be hashable (e.g. str)
    Prints N of unique tokens
    Returns a tuple, where 0: list of tokens (str), 1: list of documents (list);
        both lists are in the order of first appearance
    '''
    all_docs = []
    all_tokens = []
    # the sets mirror the two lists and are used only for fast membership checks;
    # the lists keep the order of first appearance
    seen_docs = set()
    seen_tokens = set()

    for value in source.values():
        # taking only unique bows
        for bow in value["bow"]:
            # a list is not hashable, its tuple form serves as the key of the bow
            bow_key = tuple(bow)
            if bow_key in seen_docs:
                continue
            seen_docs.add(bow_key)
            all_docs.append(bow)
            # collecting all unique tokens
            for token in bow:
                if token not in seen_tokens:
                    seen_tokens.add(token)
                    all_tokens.append(token)

    print(f"Unique tokens: {len(all_tokens)}")

    tokens_docs = (all_tokens, all_docs)

    return tokens_docs

#### EN

In [47]:
# Collecting the unique tokens and the unique documents (BoWs) in WM EN:
# the result is a tuple (list of tokens, list of documents);
# the call also prints the number of unique tokens
tokens_docs_en = get_unique_tokens_and_docs(en_wm_bows)

Unique tokens: 1166


In [48]:
# splitting the (tokens, documents) tuple; the number of documents is needed for IDF
all_tokens_en, all_docs_en = tokens_docs_en
n_docs_en = len(all_docs_en)

In [49]:
# Making a dict with document frequency (DF) scores for every unique token
# Every document is turned into a set, so a token is counted once per document;
# one pass over the documents replaces scanning all documents for every token
en_df_counts = Counter(token for bow in all_docs_en for token in set(bow))
# plain dict with the tokens in the order of 'all_tokens_en'
en_df = {token: en_df_counts[token] for token in all_tokens_en}

In [29]:
# Extending every entry of 'en_wm_bows' with two new bows:
# (1) "bow_tf_idf": top tokens based on TF-IDF;
# (2) "bow_joint": the tokens from (1) followed by the suggestions

for entry in en_wm_bows.values():

    # one term can have multiple bows; their top tokens are collected in one flat list
    tfidf_tokens = [
        token
        for bow in entry["bow"]
        for token in get_top_tokens_tfidf(bow,en_df,n_docs_en)
    ]
    entry["bow_tf_idf"] = tfidf_tokens

    # merging top tokens and suggestions into a separate list
    entry["bow_joint"] = [*tfidf_tokens, *entry["suggestions"]]

In [31]:
# exporting the extended EN bows to the current working directory
# the encoding is given explicitly so that the file does not depend on the platform default
with open('en_wm_bows_tf_idf.json', 'w', encoding='utf-8') as jf:
    json.dump(en_wm_bows, jf)

#### NL

In [51]:
# Collecting the unique tokens and the unique documents (BoWs) in WM NL:
# the result is a tuple (list of tokens, list of documents);
# the call also prints the number of unique tokens
tokens_docs_nl = get_unique_tokens_and_docs(nl_wm_bows)

Unique tokens: 1182


In [52]:
# splitting the (tokens, documents) tuple; the number of documents is needed for IDF
all_tokens_nl, all_docs_nl = tokens_docs_nl
n_docs_nl = len(all_docs_nl)

In [53]:
# Making a dict with document frequency (DF) scores for every unique token
# Every document is turned into a set, so a token is counted once per document;
# one pass over the documents replaces scanning all documents for every token
nl_df_counts = Counter(token for bow in all_docs_nl for token in set(bow))
# plain dict with the tokens in the order of 'all_tokens_nl'
nl_df = {token: nl_df_counts[token] for token in all_tokens_nl}

In [55]:
# Extending every entry of 'nl_wm_bows' with two new bows:
# (1) "bow_tf_idf": top tokens based on TF-IDF;
# (2) "bow_joint": the tokens from (1) followed by the suggestions

for entry in nl_wm_bows.values():

    # one term can have multiple bows; their top tokens are collected in one flat list
    tfidf_tokens = [
        token
        for bow in entry["bow"]
        for token in get_top_tokens_tfidf(bow,nl_df,n_docs_nl)
    ]
    entry["bow_tf_idf"] = tfidf_tokens

    # merging top tokens and suggestions into a separate list
    entry["bow_joint"] = [*tfidf_tokens, *entry["suggestions"]]

In [57]:
# exporting the extended NL bows to the current working directory
# the encoding is given explicitly so that the file does not depend on the platform default
with open('nl_wm_bows_tf_idf.json', 'w', encoding='utf-8') as jf:
    json.dump(nl_wm_bows, jf)