## Exercise 2
### Mapping Framenet-WordNet
#### Francesco Sannicola

In [1]:
from nltk.corpus import framenet as fn

---
### getFrameSetForStudent

Function that assigns a set of frames to each student.

In [2]:
import hashlib
import random
from random import randint
from random import seed

def print_frames_with_IDs():
    """Print one line per FrameNet frame: its ID, a tab, then its name."""
    row_template = '{}\t{}'
    for frame in fn.frames():
        print(row_template.format(frame.ID, frame.name))

def get_frams_IDs():
    """Return the IDs of all FrameNet frames, in the order fn.frames() lists them."""
    frame_ids = []
    for frame in fn.frames():
        frame_ids.append(frame.ID)
    return frame_ids

def getFrameSetForStudent(surname, list_len=5):
    """
    Pick and print a reproducible set of FrameNet frames for a student.

    The first frame is chosen from the SHA-512 hash of the surname; the
    following ones are found at pseudo-random offsets from that same base
    index, generated with a fixed seed so every call yields the same set.

    :param surname: student surname, hashed to choose the base frame
    :param list_len: number of frame IDs to select (default 5)
    :return: list of the selected frame IDs (duplicates are not filtered out)
    """
    nof_frames = len(fn.frames())
    # Base index: the surname's SHA-512 digest read as an integer, wrapped to the number of frames
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) % nof_frames)
    print('\nstudent: ' + surname)
    framenet_IDs = get_frams_IDs()
    i = 0
    offset = 0
    # Fixed seed: the sequence of offsets drawn below is identical on every call
    seed(1)
    id_list = []
    while i < list_len:
        # The modulo wraps the index around the end of the ID list
        fID = framenet_IDs[(base_idx+offset)%nof_frames]
        f = fn.frame(fID)
        fNAME = f.name
        print('\tID: {a:4d}\tframe: {framename}'.format(a=fID, framename=fNAME))
        id_list.append(fID)
        # New offset relative to base_idx (it replaces the previous one, it is not accumulated);
        # randint includes both bounds, the modulo above keeps the index valid
        offset = randint(0, nof_frames)
        i += 1
    return id_list


Get the IDs of the frames assigned to the student.

In [3]:
# Frames assigned to the surname 'sannicola'; a second set could be appended
# with `+ getFrameSetForStudent('Francesco')`.
frame_IDs = getFrameSetForStudent('sannicola')


student: sannicola
	ID: 1597	frame: Familiarity
	ID: 1017	frame: Noise_makers
	ID:   58	frame: Emptying
	ID:  386	frame: Ingest_substance
	ID: 2838	frame: Scheduling


Get frame by frame id

In [4]:
def get_frame(frame_id):
    """
    :param frame_id: FrameNet frame ID
    :return: the frame that fn.frame_by_id returns for that ID
    """
    frame = fn.frame_by_id(frame_id)
    return frame

Get synset by word

In [5]:
from nltk.corpus import wordnet as wn

def get_synsets(word):
    """
    Look up the WordNet synsets of a word.

    :param word: word to look up
    :return: the list of synsets, or the string "Null" when there is none
    """
    synsets = wn.synsets(word)
    # wn.synsets returns a list, so "no synset" means an empty list; the
    # previous comparison with "" could never be true.
    if not synsets:
        return "Null"
    return synsets

Use spacy for additional features for our terms

In [6]:
import spacy

# Load the "en_core_web_sm" spaCy model once; `sp` is the shared pipeline object
# called by the text-processing helpers in this notebook.
sp = spacy.load("en_core_web_sm")

Get the main word of a compound frame name (one that contains "-" or "_").

In [7]:
def get_main_word(frame_name):
    """
    Extract the main word of a (possibly compound) frame, FE or LU name.

    A name made of a single word is returned unchanged. A compound name
    (words joined by '-', '_' or spaces) is parsed with spaCy and the last
    token that is either a noun (tag NN/NNS) or the dependency root wins.

    :param frame_name: frame name
    :return: the main word inside the frame name
    """
    # The previous test `'-' or '_' in frame_name` was always true because
    # the non-empty string '-' is truthy. Spaces are treated as separators too,
    # so multi-word lexical units are still analysed with spaCy as before.
    if not any(separator in frame_name for separator in ('-', '_', ' ')):
        return frame_name

    # Turn the separators into spaces so spaCy sees a plain phrase
    phrase = sp(frame_name.replace('_', ' ').replace('-', ' '))

    main_word = ""
    for term in phrase:
        # Main word is an NN/NNS token or, failing that, the "root" term;
        # later matches overwrite earlier ones.
        if term.tag_ in ("NN", "NNS") or term.dep_ == 'ROOT':
            main_word = term.text
    return main_word

Some string processing. It creates a list with name + definition.

In [8]:
def process(name, definition):
    '''
    Build the bag of terms for a name and its definition.

    :param name: FE or LU name
    :param definition: FE or LU definition
    :return: list with the lowercased name followed by the lemmas of the
             definition's content words
    '''
    # Leftover markup fragments that must not end up in the result
    unwanted = ["fe", "fn", "cod", "'", "$"]

    # Lowercase the definition and let spaCy tokenize / tag it
    doc = sp(definition.lower())

    # The name always comes first
    terms = [name.lower()]

    # Keep the lemma of every token that is neither a stop word,
    # nor punctuation, nor a number
    terms.extend(
        token.lemma_
        for token in doc
        if not token.is_stop and token.pos_ not in ("PUNCT", "NUM")
    )

    # The unwanted strings are dropped from the whole list, name included
    return [term for term in terms if term not in unwanted]

Get frame name, frame elements and lexical units for each frame.

In [9]:
import re

# Processed term lists (name + definition) collected over all assigned frames:
# one entry per frame name, per frame element and per lexical unit.
fn_list = []
fe_list = []
lu_list = []

# `frame_id` instead of `id`, so the builtin id() is not shadowed for the rest of the notebook
for frame_id in frame_IDs:
    f = get_frame(frame_id)

    # Frame name
    fn_list.append(process(get_main_word(f.name), f.definition))

    # Frame elements
    for key in f.FE:
        definition = f.FE[key].definition
        main_word = get_main_word(key)
        fe_list.append(process(main_word, definition))

    # Lexical units: drop the ".pos" part of the key (e.g. ".v", ".n") before
    # looking for the main word. Raw string: '\.' is an invalid escape in a
    # normal string literal.
    for key in f.lexUnit:
        lu_key = re.sub(r'\.[a-z]+', '', key)
        main_word = get_main_word(lu_key)
        definition = f.lexUnit[key].definition
        lu_list.append(process(main_word, definition))

Given a synset, obtain a list of processed examples.

In [10]:
def get_examples(synset):
    """
    :param synset: WN synset
    :return: flat list with the processed terms of all the synset's examples
             (empty when the synset has no examples)
    """
    terms = []
    for sentence in synset.examples():
        # process() returns a list of terms; concatenate them all
        terms.extend(process("", sentence))
    return terms

Compute the context of a synset from its own definition and examples plus those of its hyponyms and hypernyms (at most 3 of each).

In [11]:
def get_context(synset, limit=3):
    '''
    Build the bag of terms that describes a synset.

    :param synset: WN synset
    :param limit: maximum number of hyponyms and of hypernyms taken into
                  account (default 3)
    :return: synset's context, as a flat list of terms in this order:
             examples of the synset, definition of the synset, definitions
             of hyponyms then hypernyms, examples of hyponyms, examples of
             hypernyms
    '''
    # Definition and examples of the synset itself
    synset_def = process("", synset.definition())
    current_examples = get_examples(synset)

    hy_def = []       # definitions of hyponyms, then of hypernyms
    hypon_list = []   # examples of hyponyms
    hyper_list = []   # examples of hypernyms

    # hyponyms()/hypernyms() return lists: an empty list simply skips the loop
    # (the former `!= 0` checks were always true), and slicing applies the limit.
    for hypon in synset.hyponyms()[:limit]:
        hy_def.extend(process("", hypon.definition()))
        hypon_list.extend(get_examples(hypon))

    for hyper in synset.hypernyms()[:limit]:
        hy_def.extend(process("", hyper.definition()))
        hyper_list.extend(get_examples(hyper))

    # Context = examples and definition of the current synset
    #         + hyponym and hypernym definitions
    #         + hyponym and hypernym examples
    return current_examples + synset_def + hy_def + hypon_list + hyper_list

Read a tab-separated (TSV) file and return its rows as a list.

In [12]:
import csv 

def read_csv(path):
    '''
    Read a tab-separated file.

    :param path: reference to a CSV file (tab-delimited, UTF-8, optional BOM).
    :return: a list with all file's rows, each row being a list of strings.
    '''
    # newline='' is what the csv module requires: it lets the reader handle
    # line endings itself, so line breaks inside quoted fields are preserved.
    # 'utf-8-sig' transparently drops a leading byte-order mark.
    with open(path, 'r', encoding='utf-8-sig', newline='') as csv_file:
        return list(csv.reader(csv_file, delimiter='\t'))

Custom implementation of the "Bag of Words" approach.
It picks the best sense for a word among its WordNet synsets, based on the overlap with each synset's context.

In [13]:
def BOW_best_sense(word, synsets):
    '''
    Choose the synset whose context overlaps most with a bag of terms.

    :param word: word to map, given as an iterable of terms (its bag of words)
    :param synsets: synsets of that word
    :return: best synset when there are at least two candidates (the first
             one wins ties); otherwise the `synsets` list itself, unchanged
             (empty or with its single element)
    '''
    # Nothing to rank with zero or one candidate: the list is handed back
    # as-is, which is the contract existing callers rely on.
    if len(synsets) <= 1:
        return synsets

    # The bag does not change across candidates, so build the set once
    bag = set(word)

    max_score = 0
    best_synset = None
    for synset in synsets:
        # Overlap between the bag and the synset's context (+1 for smoothing)
        score = len(bag.intersection(get_context(synset))) + 1

        # Strict comparison: on equal scores the earlier synset is kept
        if score > max_score:
            max_score = score
            best_synset = synset

    return best_synset

Read the hand-annotated gold files and compute the bag-of-words mapping between the annotated terms and the frame names, frame elements and lexical units.

In [14]:
# Hand-annotated gold files: one per frame names, frame elements, lexical units
fn_gold_path = 'input/fn_hand.tsv'
fe_gold_path = 'input/fe_hand.tsv'
lu_gold_path = 'input/lu_hand.tsv'

fn_gold, fe_gold, lu_gold = (
    read_csv(gold_path)
    for gold_path in (fn_gold_path, fe_gold_path, lu_gold_path)
)

# List with all computed senses, in the order: frame names, FEs, LUs
bow_res = []

# Each gold table is paired by row index with its list of processed bags
for gold_rows, bags in ((fn_gold, fn_list), (fe_gold, fe_list), (lu_gold, lu_list)):
    for i, gold_row in enumerate(gold_rows):
        # First column of the gold row is the term to look up in WordNet
        synsets = wn.synsets(gold_row[0])
        best_synset = BOW_best_sense(bags[i], synsets)
        bow_res.append(best_synset)

Last block calculates system accuracy.

In [15]:
# Gold rows in the same order in which bow_res was filled
all_gold = fn_gold + fe_gold + lu_gold
right = 0

for i in range(0, len(bow_res)):
    # A result is either a synset or a list holding at most one synset; stripping
    # the brackets and the "Synset('...')" wrapper from its text form leaves
    # only the synset name (or an empty string for an empty list).
    current_mapping = str(bow_res[i]).replace('[', '').replace(']', '').replace("Synset('", '').replace("')", '')

    # Comparison between computed senses and hand written senses
    if current_mapping == all_gold[i][1]:
        right += 1

total = len(bow_res)
# Guard against an empty result list, which would otherwise divide by zero
accuracy = round(right / total, 2) if total else 0.0

print("Right mappings: ", right, "\nWrong mappings: ", total - right)
print("Accuracy: ", accuracy)

Right mappings:  67 
Wrong mappings:  68
Accuracy:  0.5
