In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
from collections import Counter
from tabulate import tabulate

In [3]:
lemmatizer = WordNetLemmatizer()
vector_dimension = 3
stop_words = stopwords.words('english')
stop_words.extend(['someone', 'something', 'e', 'e.g', 'u', 'ha', 'e.i', 'others', "'s"])
string_punctuation = string.punctuation + '’'

In [4]:
concept_definitions = pd.read_excel('../resources/definizioni.xlsx', index_col=0).T.to_dict('list')

In [5]:
def text_preprocessing(text):
    words = list()
    for word in word_tokenize(text.lower()):
        lemma = lemmatizer.lemmatize(word)
        if lemma not in stop_words and lemma not in string_punctuation:
            words.append(lemma)
    return words

In [6]:
def get_common_words():
    concept_common_words = dict()
    for key, value in concept_definitions.items():
        common_words = list()
        for definition in value:
            if definition == definition:
                common_words.extend(text_preprocessing(definition))
        concept_common_words[key] = [lemma for lemma, count in Counter(common_words).most_common(vector_dimension)]
    return concept_common_words


concept_common_words = get_common_words()
print(concept_common_words)

{'Emotion': ['feeling', 'human', 'feel'], 'Person': ['human', 'person', 'living'], 'Revenge': ['anger', 'feeling', 'action'], 'Brick': ['used', 'object', 'material']}


In [7]:
def get_synsets_from_genus():
    concept_genus_synsets = dict()
    for key, value in concept_common_words.items():
        genus_synstes = list()
        for word in value:
            genus_synstes.extend(wn.synsets(word))
            
        genus_synstes = list(dict.fromkeys(genus_synstes))
        concept_genus_synsets[key] = genus_synstes
    return concept_genus_synsets


concept_genus_synsets = get_synsets_from_genus()

In [8]:
def get_hyponyms_from_genus():
    for key, value in concept_genus_synsets.items():
        for synset in value:
            concept_genus_synsets[key].extend(synset.hyponyms())
            concept_genus_synsets[key] = list(dict.fromkeys(concept_genus_synsets[key]))

get_hyponyms_from_genus()

In [9]:
def get_context(synset):
    context = list()
    context.extend(text_preprocessing(synset.definition()))
    for example in synset.examples():
        context.extend(text_preprocessing(example))
    context = list(dict.fromkeys(context))
    print(context)
    return context


def weighted_overlap_score(v1, v2, key):
    weight = 1
    overlap = set(v1) & set(v2)
    #for word in overlap:
        #weight += concept_common_words[key].index(word)
    score = float(len(overlap)) / (len(set(v1) | set(v2)))
    return score


def rank_synsets(synsets, key):
    synsets_ranked = list()
    common_words = concept_common_words[key]
    for synset in synsets:
        synsets_ranked.append((synset, weighted_overlap_score(common_words, get_context(synset), key)))
    return sorted([item for item in synsets_ranked if item[1] > 0], key=lambda item: item[1], reverse=True)

In [10]:
def get_genus_synsets_ranked():
    concept_genus_synsets_ranked = dict()
    for key, value in concept_genus_synsets.items():
        concept_genus_synsets_ranked[key] = rank_synsets(value, key)[:5]
    return concept_genus_synsets_ranked


concept_genus_synsets_ranked = get_genus_synsets_ranked()

In [11]:
for key, value in concept_genus_synsets_ranked.items():
    to_print = list()
    for item in value:
        rows = list(item)
        rows.insert(1, item[0].definition().capitalize())
        to_print.append(rows)

    print(f'----- Best 5 sense for {key} -----')
    print()
    print(tabulate(to_print, headers=['Synset', 'Definition', 'Weighted Overlap Score'], tablefmt='orgtbl'))
    print()

----- Best 5 sense for Emotion -----



AttributeError: 'str' object has no attribute '_name'