In [1]:
import pandas as pd
import json
import csv

In [2]:
import spacy # numpy 1.21

In [3]:
# Loading the large English spaCy pipeline; the tagger, attribute ruler and
# lemmatizer are disabled (presumably because only the vectors are used
# for the similarity scores - TODO confirm).
nlp = spacy.load("en_core_web_lg", disable=["tagger", "attribute_ruler", "lemmatizer"])

In [4]:
def calculate_cs(bow_1:list, bow_2:list, nlp) -> float:
    '''
    Cosine similarity of two bags of words, based on the spaCy vectors.

    bow_1, bow_2: list, the two bags of words to compare;
    nlp: spaCy nlp object loaded through spacy.load; it is called once per bag

    Returns the similarity rounded to three decimal places.
    '''

    # every token is followed by a single space, so each text ends with a space
    text_1 = "".join(f"{token} " for token in bow_1)
    text_2 = "".join(f"{token} " for token in bow_2)

    doc_1 = nlp(text_1)
    doc_2 = nlp(text_2)

    return round(doc_1.similarity(doc_2), 3)

In [5]:
# NOTE(review): both paths are absolute and machine-specific; the notebook only
# runs where these files exist at exactly these locations.
# The files are opened as UTF-8 explicitly, so reading does not depend on the
# platform's default text encoding.

# importing search results
# (consumed by the scoring loop as {term: [{QID: bag of words}, ...]})
with open('/Users/anesterov/wd/jan31/wd_bows_en.json', 'r', encoding='utf-8') as jf:
    wd_en = json.load(jf)
# importing related matches
# (consumed by the scoring loop as rm[lang][term][source] -> list of tokens)
with open('/Users/anesterov/reps/LODlit/bg/rm_bows_tfidf_all.json', 'r', encoding='utf-8') as jf:
    rm = json.load(jf)

In [6]:
# language key used to select the related matches of a term (rm[lang])
lang = "en"

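Each search result is compared with three background bags of words, giving three similarity scores per hit:

1. Background info combined: related matches + WM text (`cs_combined`)
2. New related matches only: TF-IDF of AAT scopeNotes (`cs_rm_only`)
3. WM bows only (`cs_wm_only`)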

In [7]:
# Scores every Wikidata hit of every term against three background bags of words:
#   cs_combined - related matches + WM text
#   cs_rm_only  - related matches only ('aat', 'wikidata', 'pwn')
#   cs_wm_only  - WM text only
# Rows are gathered in a plain list and the DataFrame is built once at the end,
# instead of enlarging the frame one row at a time.

def _unique_tokens(tokens):
    '''Returns the tokens without duplicates, keeping first-seen order (same order on every run).'''
    return list(dict.fromkeys(tokens))


score_rows = []

for term, hits in wd_en.items():

    # if there are no related matches, cs == None for every hit of the term;
    # the QID is taken from the term's own hits, never from a previous term
    if term not in rm[lang]:
        for hit in hits:
            for qid in hit:
                score_rows.append([term, qid, None, None, None])
        continue

    term_rm = rm[lang][term]

    # EN collecting related matches per term; missing or empty sources are skipped
    rm_tokens = []
    for source in ('aat', 'wikidata', 'pwn'):
        rm_tokens.extend(term_rm.get(source) or [])
    wm_tokens = list(term_rm.get('wm') or [])

    # 1 combined: related matches + WM text; 2 RM only; 3 WM only
    bow_combined = _unique_tokens(rm_tokens + wm_tokens)
    bow_rm_only = _unique_tokens(rm_tokens)
    bow_wm_only = _unique_tokens(wm_tokens)
    # NOTE(review): a background bag that stays empty is still passed to
    # calculate_cs; confirm that the score this produces is the intended one.

    for hit in hits:
        for qid, bow in hit.items():
            wd_bow = _unique_tokens(bow)

            # if there are no tokens, cs == None
            if not wd_bow:
                score_rows.append([term, qid, None, None, None])
                continue

            # calculate cs against each of the three background bags
            score_rows.append([
                term,
                qid,
                calculate_cs(bow_combined, wd_bow, nlp),
                calculate_cs(bow_rm_only, wd_bow, nlp),
                calculate_cs(bow_wm_only, wd_bow, nlp),
            ])

wd_df = pd.DataFrame(score_rows, columns=['term', 'QID', 'cs_combined', 'cs_rm_only', 'cs_wm_only'])

In [8]:
# saving the scores to the working directory; the DataFrame index is written
# as the first (unnamed) column
wd_df.to_csv('bg_cs.csv')

### Merging all CS scores

In [11]:
# preview of the new scores before merging them with the previous table
wd_df.head()

Unnamed: 0,term,QID,cs_combined,cs_rm_only,cs_wm_only
0,batavia,Q100341056,0.275,0.254,0.254
1,batavia,Q100341171,0.587,0.454,0.587
2,batavia,Q100342162,0.587,0.454,0.587
3,batavia,Q102227440,0.534,0.671,0.361
4,batavia,Q102227447,0.102,0.1,0.091


In [14]:
# reading the previous table of spaCy similarity scores; it is joined with
# the new scores on 'term' and 'QID' further down
# NOTE(review): absolute, machine-specific path
cs_old = pd.read_csv('/Users/anesterov/reps/LODlit/spacy_sim_experiment.csv')

In [15]:
# preview of the previous scores
cs_old.head()

Unnamed: 0.1,Unnamed: 0,term,QID,wd_bow,spacy_cs
0,0,batavia,Q100341056,"['tentoonstelling', 'ter', 'herdenking', 'van'...",0.238
1,1,batavia,Q100341171,"['topografische', 'dienst', 'temporary', 'exhi...",0.453
2,2,batavia,Q100342162,"['kaart', 'exhibition', 'temporary', 'exhibiti...",0.439
3,3,batavia,Q102227440,"['gerardus', 'van', 'groll', 'sep', 'kaapstad'...",0.59
4,4,batavia,Q102227447,"['pieter', 'hendrick', 'breton', 'est', 'oct']",0.088


In [16]:
# left join on term + QID: every row of the previous table is kept and the new
# score columns are added; rows without a match in wd_df get missing values
all_scores = cs_old.merge(wd_df,how="left",on=["term","QID"])

In [17]:
# preview of the merged table: previous score next to the three new ones
all_scores.head()

Unnamed: 0.1,Unnamed: 0,term,QID,wd_bow,spacy_cs,cs_combined,cs_rm_only,cs_wm_only
0,0,batavia,Q100341056,"['tentoonstelling', 'ter', 'herdenking', 'van'...",0.238,0.275,0.254,0.254
1,1,batavia,Q100341171,"['topografische', 'dienst', 'temporary', 'exhi...",0.453,0.587,0.454,0.587
2,2,batavia,Q100342162,"['kaart', 'exhibition', 'temporary', 'exhibiti...",0.439,0.587,0.454,0.587
3,3,batavia,Q102227440,"['gerardus', 'van', 'groll', 'sep', 'kaapstad'...",0.59,0.534,0.671,0.361
4,4,batavia,Q102227447,"['pieter', 'hendrick', 'breton', 'est', 'oct']",0.088,0.102,0.1,0.091


In [18]:
# saving the merged scores to the working directory; the DataFrame index is
# written as the first (unnamed) column
all_scores.to_csv('cs_all.csv')