In [1]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import string

In [2]:
def pre_processing(sentence):
    return remove_stopwords(remove_punctuation(word_tokenize(sentence)))

# Remove stopwords from a list of words
def remove_stopwords(words_list):
    stopwords = open("stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords:
        stopwords_list.append(word.replace('\n', ''))
    stopwords.close()
    return [value.lower() for value in words_list if value.lower() not in stopwords_list]


# Remove punctuation from a list of words
def remove_punctuation(words_list):
    new_words_list = []
    for word in words_list:
        temp = word
        if not temp.strip(string.punctuation) == "":
            new_word = word.lower()
            new_word = new_word.replace("'", "")
            new_words_list.append(new_word)
    return new_words_list

#### First exercize
It calculates average definition lenght for each section (nouns, verbs, adjectives and adverbs)

In [3]:
from statistics import mean

def avg_len_section_definitons():
    pos_tag_list = ['n', 'v', 'a', 'r']
    average_lenghts = []

    for pos_tag in pos_tag_list:
        synsets_lenght = []
        for synset in list(wn.all_synsets(pos_tag)):
            synsets_lenght.append(len(synset.definition().split(" ")))
        average_lenghts.append((pos_tag, mean(synsets_lenght)))

    print("\n", average_lenghts, "\n")


#### Second exercize
The variation of the length along the path of the hyperonyms that lead from a given synset to its root

In [4]:
def all_hypernym_paths(word):
    
    def_lens = []
    
    for syn in wn.synsets(word):

        single_path = []
        
        hyp_path = syn.hypernym_paths()
        
        for i in range (0, len(hyp_path[0])):
            
            single_path.append((hyp_path[0][i],len((hyp_path[0][i].definition()).split())))

        def_lens.append(single_path)
        
    return def_lens

#### Third exercize
Distance from the word's root and words within the definitions

In [5]:
def calculate_distance_root(synset):
    return (min([len(path) for path in synset.hypernym_paths()]))

def distance_root(word):
    
    output = dict()
    
    for syn in wn.synsets(word):
        
        actual_syn_dis = calculate_distance_root(syn)
        output[syn] = {word :actual_syn_dis} 
                
        syn_definition_processed = pre_processing(syn.definition())
        for def_word in syn_definition_processed:
            for def_syn in wn.synsets(def_word):
                output[syn].update({def_word : calculate_distance_root(def_syn)})
                        
    return output

#### Fourth exercize
Calculate bleu and rouges score between hypernyms and hyponyms of the words definition  

In [6]:
from nltk.translate.bleu_score import sentence_bleu

from bleu import multi_list_bleu

from rouge import Rouge

rouge = Rouge()

def definition_overlap(word):
    
    for syn in wn.synsets(word):
        
        bleu_count = 0
        f_count = 0
        
        actual_def_processed = syn.definition()
        
        print ("\n--------------------\nDefinition of", syn,  "=", actual_def_processed)
        print ()
        
        hyper_list = syn.hypernyms()
        
        for hy in hyper_list:
            hy_def = hy.definition()
            
            bleu_count += sentence_bleu([actual_def_processed], hy_def, weights=(1, 0, 0, 0))
            #print("BLEU score: ", sentence_bleu([actual_def_processed], hy_def, weights=(1, 0, 0, 0)))
            
            rouge_scores = rouge.get_scores(' '.join(hy_def), ' '.join(actual_def_processed))
            #print("Rogue scores: ", rouge_scores)
            f_count += rouge_scores[0]['rouge-1']['f']

        if (len(hyper_list) != 0):
            print ("Bleu score for hypernyms (1-gram):", bleu_count / len(hyper_list))
            print ("Rogue f1 for hypernyms (1-gram):", f_count / len(hyper_list))
        else:
            print("No hypernyms")

               
        print ()
        
        bleu_count = 0
        f_count = 0

        hypo_list = syn.hyponyms()
               
        for hy in hypo_list:
            hy_def = hy.definition()
            
            bleu_count += sentence_bleu([actual_def_processed], hy_def, weights=(1, 0, 0, 0))
            #print("BLEU score: ", sentence_bleu([actual_def_processed], hy_def, weights=(1, 0, 0, 0)))
            
            rouge_scores = rouge.get_scores(' '.join(hy_def), ' '.join(actual_def_processed))
            #print("Rogue scores: ", rouge_scores)
            f_count += rouge_scores[0]['rouge-1']['f']

        if (len(hypo_list) != 0):
            print ("Bleu score for hyponyms (1-gram):", bleu_count / len(hypo_list))
            print ("Rogue f1 for hyponyms (1-gram):", f_count / len(hypo_list))
        else:
            print("No hyponyms")

In [7]:
avg_len_section_definitons()


 [('n', 11.470035925226815), ('v', 6.146655044672042), ('a', 7.238433575677462), ('r', 5.028169014084507)] 



In [8]:
all_hypernym_paths("paper")

[[(Synset('entity.n.01'), 17),
  (Synset('physical_entity.n.01'), 6),
  (Synset('matter.n.03'), 7),
  (Synset('substance.n.01'), 11),
  (Synset('material.n.01'), 12),
  (Synset('paper.n.01'), 15)],
 [(Synset('entity.n.01'), 17),
  (Synset('abstraction.n.06'), 11),
  (Synset('communication.n.02'), 12),
  (Synset('written_communication.n.01'), 10),
  (Synset('writing.n.02'), 24),
  (Synset('essay.n.01'), 6),
  (Synset('composition.n.08'), 8)],
 [(Synset('entity.n.01'), 17),
  (Synset('physical_entity.n.01'), 6),
  (Synset('object.n.01'), 12),
  (Synset('whole.n.02'), 11),
  (Synset('artifact.n.01'), 7),
  (Synset('instrumentality.n.03'), 13),
  (Synset('medium.n.01'), 9),
  (Synset('print_media.n.01'), 6),
  (Synset('press.n.02'), 16),
  (Synset('newspaper.n.01'), 14)],
 [(Synset('entity.n.01'), 17),
  (Synset('physical_entity.n.01'), 6),
  (Synset('object.n.01'), 12),
  (Synset('whole.n.02'), 11),
  (Synset('artifact.n.01'), 7),
  (Synset('instrumentality.n.03'), 13),
  (Synset('medium.

In [9]:
distance_root("courage")

{Synset('courage.n.01'): {'courage': 7,
  'quality': 1,
  'spirit': 5,
  'enables': 2,
  'face': 2,
  'danger': 7,
  'pain': 4,
  'showing': 3,
  'fear': 6}}

In [10]:
definition_overlap("courage")


--------------------
Definition of Synset('courage.n.01') = a quality of spirit that enables you to face danger or pain without showing fear

Bleu score for hypernyms (1-gram): 0.6990638273805326
Rogue f1 for hypernyms (1-gram): 0.8205128155161079

Bleu score for hyponyms (1-gram): 0.32899310492431155
Rogue f1 for hyponyms (1-gram): 0.7829971823603259


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
