## Exercize 4
### Semantic similarity (Italian) using Nasari and BabelNet
#### Francesco Sannicola

Method shared able to return a interval depending of my surname.

In [1]:
import hashlib
import matplotlib.pyplot as plt


def get_range(surname):
    nof_elements = 500
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) % 10)
    idx_intervallo = base_idx * 50+1
    return idx_intervallo
 

input_name = "sannicola"

values = []
sx = get_range(input_name)
values.append(sx)
dx = sx+50-1
intervallo = "" + str(sx) + "-" + str(dx)
print('{:15}:\tcoppie nell\'intervallo {}'.format(input_name, intervallo))

sannicola      :	coppie nell'intervallo 351-400


Csv library useful to read all the files.

In [2]:
import csv

Simply reads a tsv handwritten (me and my sister) annotation file.
- 0: totally dissimilar
- 1: dissimilar
- 2: Slightly similar
- 3: Similar
- 4: Very similar

In [3]:
def read_annotation_file(file):
    res = []
    
    with open(file) as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in rd:
            res.append(row)
    
    return res

Read a embedded version of Nasari and returns a dictionary like {bn:xxxxxx : [score1, score2]}

In [4]:
def get_nasari_dict(file):
    nasari_dict = dict()
    
    with open(file) as fd:
        rd = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in rd:
            row_splitted=row[0].split('_')[0]
            nasari_dict[row_splitted] = row[1:]
    
    return nasari_dict

The above method reads a file containing italian's terms associated with a list of BabelID.

Obtain a dict of {word_it: [bn:xxxxxx1, bn:xxxxxx2]} tuples.

In [5]:
def get_italian_synsets(file):

    italian_synset = dict()
    word = ""
    
    with open(file, 'r', encoding="utf8") as fd:
        
        rd = csv.reader(fd, delimiter="\t", quotechar='"')
        for row in rd:
            if row[0][0] == "#":
                word = row[0][1:].lower()
                italian_synset[word] = []
            else:
                italian_synset[word].extend(row)
                
    return italian_synset

Import cosine similarity metric from sklearn library and numpy.

In [6]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

Compute similarity (cosine) for two lists BabelIDs given nasari vectors as dict.

In [7]:
def cosine_similarity_vector(nasari_dict, bab_ids_word1, bab_ids_word2):
    """
    :param nasari_dict: NASARI dictionary
    :param babel_id_word1: list of BabelID
    :param babel_id_word2: list of BabelID
    """

    best_senses = (None, None)
    max_similarity = 0

    
    for b_id1 in bab_ids_word1:
        for b_id2 in bab_ids_word2:
            # Check first there are entry in nasary dict with this babelnet ids
            if b_id1 in nasari_dict.keys():
                if b_id2 in nasari_dict.keys():
                    
                    # Obtain nasari dict from babelnet id and reshape to fit all in 1 row
                    v1 = np.array(nasari_dict[b_id1]).reshape(1, len(nasari_dict[b_id1]))
                    v2 = np.array(nasari_dict[b_id2]).reshape(1, len(nasari_dict[b_id2]))

                    # use cosine similarity of sklearn
                    similarity = cosine_similarity(v1, v2)[0]
                    
                    # save only the 2 best babelnet IDs
                    if similarity > max_similarity:
                        max_similarity = similarity
                        best_senses = (b_id1, b_id2)

    return best_senses, max_similarity

Get lists and dicts of scores, golds, targets, synsets and nasari vectors from files.

In [8]:
# Read all files
annotation1 = read_annotation_file("utils/data351-400.tsv")
annotation2 = read_annotation_file("utils/data351-400-sister.tsv")

nasari_dict = get_nasari_dict("utils/mini_NASARI.tsv")

gold = get_italian_synsets("utils/SemEval17_IT_senses2synsets.txt")

base, target = [], []
annotation1_score, annotation2_score = [], []

# Get all couples and the semantic similarity scores (mine and my sister)
for el in annotation1:
    base.append(el[0].lower())
    target.append(el[1].lower())
    annotation1_score.append(float(el[2]))
    
for el in annotation2:
    annotation2_score.append(float(el[2]))


Import two evaluation metrics form scipy.

In [9]:
from scipy.stats import pearsonr, spearmanr

In [10]:
golds_score = []

for i in range(0, len(annotation1)):
    _ , score = cosine_similarity_vector(nasari_dict, gold[base[i]], gold[target[i]])
    golds_score.append(float(score))

    
# inter rate agreement between first and second annotation
print("Spearman and Person scores between human annotations\n")

print("Spearman: ", spearmanr(annotation1_score, annotation2_score)[0])
print("Pearson: ", pearsonr(annotation1_score, annotation2_score)[0])

# Comparison between my_score and gold score
print("\n\nSpearman and Person scores between human annotation and scores computed")

print("\n"+'\033[1m' + "Annotation 1:" + '\033[0m')
print("Spearman: ", pearsonr(golds_score, annotation1_score)[0])
print("Pearson: ", spearmanr(golds_score, annotation1_score)[0])

print("\n"+'\033[1m' + "Annotation 2:" + '\033[0m')
print("Spearman: ", pearsonr(golds_score, annotation2_score)[0])
print("Pearson: ", spearmanr(golds_score, annotation2_score)[0])


Spearman and Person scores between human annotations

Spearman:  0.917764855859008
Pearson:  0.9353309809796745


Spearman and Person scores between human annotation and scores computed

[1mAnnotation 1:[0m
Spearman:  0.47012559867068004
Pearson:  0.607574621930617

[1mAnnotation 2:[0m
Spearman:  0.4110861020947786
Pearson:  0.550105156422494


Import another metric

In [11]:
from sklearn.metrics import cohen_kappa_score

Calculate annotation agreement with Cohen score

In [12]:
int_annotation1_score = [int(i) for i in annotation1_score]
int_annotation2_score = [int(i) for i in annotation2_score]

cohen_score = cohen_kappa_score(int_annotation1_score, int_annotation2_score)
print("Cohen score between human annotations\n")
print("Cohen: ", cohen_score)

Cohen score between human annotations

Cohen:  0.5841995841995842


Next goal is to find which is the senses used for similarity judgment.

The next method calls BabelNet HTTP API with a babelnet synset ID as a parameter.

It returns three terms associated to the given synset.

In [13]:
import re
import requests

def get_synset_terms_HTTP(babel_id):
    """
    :param babel_id: sense's BabelID
    """

    url = "https://babelnet.io/v5/getSynset"
    params = {
        "id": babel_id,
        "key": "fd6e7223-95d6-40ce-96b4-8d3bf9123200",
        "targetLang": "IT"
    }

    # Perform the request
    req = requests.get(url=url, params=params)
    
    # Obtain response in json
    response = req.json()

    i, limit = 0, 0
    terms = []
    
    while i < len(response["senses"]) and limit < 3:
        
        # Obtain a term associated to the given synset. After do some string processing.
        term = response["senses"][i]["properties"]["fullLemma"]
        term = re.sub('\_', ' ', term)
        term = term.lower()

        if term not in terms:
            terms.append(term)
            limit += 1

        i += 1

    if len(terms) == 0:
        return "Null"
    else:
        return terms

Three major tasks:
1. Calculates best couples of synset given base and target using nasari.
2. Get from BabelNet endpoint three terms maximum associated to the more similar couple of synsets (cosine metric).
3. Write on file the result (Columns: Term1 Term2 BS1 BS2 Terms_in_BS1 Terms_in_BS2)

In [14]:
with open("utils/result.tsv", "w", encoding="utf-8") as of:
    
    for i in range(0, len(annotation1)):
        
        # take the best couple of synset of a couple of terms
        (syn_1, syn_2), _ = cosine_similarity_vector(nasari_dict, gold[base[i]], gold[target[i]])
        
        if syn_1 is not None and syn_2 is not None:
            
            # inizialize output buffer with 4 columns (base, target, best babel synset1, best babel synset2)
            of.write("{}\t{}\t{}\t{}\t".format(base[i], target[i], syn_1, syn_2))
            
            # obtain tree terms associated to the synset1 and synset2
            terms_1 = get_synset_terms_HTTP(syn_1)
            terms_2 = get_synset_terms_HTTP(syn_2)

            for term_1 in terms_1:
                if term_1 == terms_1[len(terms_1) - 1]:
                    # add a column whith the terms in babel synset1 
                    of.write(term_1 + "\t") 
                else:
                    of.write(term_1 + ",") 
            
            for term_2 in terms_2:
                if term_2 == terms_2[len(terms_2) - 1]:
                     # add a column whith the terms in babel synset2 
                    of.write(term_2 + "\n")
                else:
                    of.write(term_2 + ",") 
                    
        else:
            # write a blank row
            of.write("{}\t{}\tNull\tNull\tNull\tNull\n".format(base[i], target[i]))
        
    print("Operation successful")
    print("Result written in ./utils/result.tsv")

Operation successful
Result written in ./utils/result.tsv
