In [34]:
import re
import string
from collections import Counter

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


# Fetch every NLTK resource this notebook relies on. The English stop-word
# list is used by the cleaning code below, so it is downloaded here as well;
# otherwise a fresh environment fails on the first stopwords lookup.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

# First 14 rows of the definitions table. A forward slash is used so the
# relative path resolves on Windows, Linux and macOS alike.
csv=pd.read_csv('data/TLN-definitions-24.csv')[:14]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mgiov\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mgiov\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
# English stop words and punctuation tokens that are dropped from every
# tokenized definition.
stop = stopwords.words('english')
# The backslash is written as "\\" on purpose: a bare "\]" is an invalid
# escape sequence and triggers a warning on recent Python versions. The
# resulting characters are exactly the same as before.
char_rem = list(set("#$%'()*+,./:;<=>?[\\]^_`{|}~" + "“" + "”"))
# Single set for O(1) membership tests while filtering tokens.
excluded_tokens = set(stop) | set(char_rem)

# Columns holding the definitions of each concept.
concept_columns = ['pen', 'cigarette', 'cloud', 'ontology']

# Forward slash keeps the relative path portable across operating systems.
definizioni = pd.read_csv('data/TLN-definitions-24.csv')
definizioni = definizioni.drop('Annotator', axis=1)
definizioni = definizioni.dropna()
# Lowercase and tokenize every cell, then drop stop words and punctuation.
definizioni = definizioni.map(lambda x: word_tokenize(str(x).lower()))
definizioni = definizioni.map(lambda x: [item for item in x if item not in excluded_tokens])
# A definition must not contain the very word it defines.
for concept in concept_columns:
    definizioni[concept] = definizioni[concept].apply(
        lambda x: [item for item in x if item != concept]
    )
display(definizioni)

# Raw (untokenized) definition strings, one inner list per concept, with the
# concept word itself removed from its own definitions.
#
# The removal matches the concept as a whole word and ignores case, so words
# that merely contain it (e.g. "open", "expensive" for "pen") are left intact
# and a capitalised occurrence is removed too. A plain str.replace() would
# corrupt those words and miss the capitalised form.
definitions = []

for concept in ["pen", "cigarette", "cloud", "ontology"]:
    concept_pattern = re.compile(rf"\b{re.escape(concept)}\b", flags=re.IGNORECASE)
    definitions.append(
        [concept_pattern.sub("", definition) for definition in csv[concept].astype(str).tolist()]
    )


Unnamed: 0,pen,cigarette,cloud,ontology
0,"[object, used, write, paper, filled, ink]","[object, filled, mainly, bu, tobacco, smoked, ...","[atmospheric, phenomenon]","[philosophical, study, nature, structure, bein..."
1,"[object, used, write, paper]","[cylinder, paper, tobacco, inside]","[specific, atmosphere, condition]","[field, philosophy, used, universal, universal]"
2,"[tool, used, write, paper, using, ink]","[object, smoked, pleasure]","[white, sky, object, responsible, rain]","[tool, used, create, common, knowledge, differ..."
3,"[object, used, write, ink]","[object, contains, tobacco, used, smoking]","[mass, water, drops, atmosphere, witch, take, ...","[method, represent, knowledge]"
4,"[object, used, write, papers, usually, tool, p...","[product, made, tobacco, inserted, paper, coul...","[formations, sky, derived, condensation, water...","[science, study, carachteristics, related, exi..."
5,"[stick, write, ink, word, sketches, drawings]","[consumable, stick, light, order, inhalate, 's...","[abstraction, set, services, delivered, internet]","[study, formal, definition, sense]"
6,"[object, used, write, paper, ink]","[cylinder, tobacco, inside]","[isa, cluster, tiny, water, drops]","[branch, phylosophy]"
8,"[writing, tool, using, ink]","[cigarete, tabacco, wrapped, paper, used, smok...","[system, computer, accessible, different, user...","[knowledge, structure, used, store, relationsh..."
9,"[tool, writing, paper, usually, black, blue, red]","[tool, allows, smoke, tobacco]","[term, refers, system, remote, server]","[branch, philosophy]"
10,"[object, used, writing, sheet, paper]","[consumable, light, order, aspire, 's, smoke, ...","[collection, interconnected, computers, capabl...","[something, always, true]"


In [42]:
# Lazily populated cache of the English stop-word set, so the corpus is not
# re-read on every clean_definition() call.
_ENGLISH_STOPWORDS = None


def clean_definition(definition):
    """Turn a definition string into a list of lowercase keyword tokens.

    The text is lowercased and tokenized with NLTK; only purely alphanumeric
    tokens that are not English stop words are kept, in their original order.

    Args:
        definition: the definition text (a ``str``).

    Returns:
        list[str]: the surviving tokens; repeated words are preserved.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))

    tokens = nltk.word_tokenize(definition.lower())
    # isalnum() already rejects every token containing punctuation, so no
    # separate punctuation check is required.
    return [token for token in tokens if token.isalnum() and token not in _ENGLISH_STOPWORDS]

def get_top_k_terms(definitions, top_k):
    """Return the ``top_k`` most frequent lemmas across tokenized definitions.

    Words are reduced to their WordNet lemma (using the lemmatizer's default
    part of speech) *before* ranking, so inflected forms such as "paper" and
    "papers" contribute to a single count and the result never contains the
    same lemma twice.

    Args:
        definitions: iterable of token lists, one per definition.
        top_k: how many terms to return; forwarded to ``Counter.most_common``.

    Returns:
        list[tuple[str, int]]: ``(lemma, frequency)`` pairs, most frequent
        first.
    """
    lemmatizer = WordNetLemmatizer()

    # Count surface forms first so each distinct word is lemmatized only once.
    surface_counts = Counter(word for definition in definitions for word in definition)

    lemma_counts = Counter()
    for word, freq in surface_counts.items():
        lemma_counts[lemmatizer.lemmatize(word)] += freq

    return lemma_counts.most_common(top_k)

def bag_of_words_similarity(synset, definitions, term_freq):
    """Count the distinct words shared by two word collections.

    Args:
        synset: iterable of words describing a synset (e.g. its cleaned gloss).
        definitions: iterable of words to compare against.
        term_freq: kept for interface compatibility; it does not influence
            the result.

    Returns:
        int: number of distinct words present in both collections.
    """
    # NOTE(review): an earlier version also summed term_freq over the shared
    # words but discarded that value and returned the plain overlap size.
    # The dead computation is removed; the returned score is unchanged.
    # Confirm whether a frequency-weighted score was ever intended.
    return len(set(synset) & set(definitions))

def find_best_synset(definitions, genus_terms_with_freq):
    """Pick the WordNet hyponym whose gloss best overlaps the given words.

    For every genus term, all of its WordNet synsets are looked up and each
    direct hyponym of those synsets is scored by the number of words its
    cleaned definition shares with ``definitions``.

    Args:
        definitions: collection of words to match against each hyponym gloss
            (the caller in this notebook passes the frequent terms of a
            concept).
        genus_terms_with_freq: iterable of ``(term, frequency)`` pairs whose
            terms are used as candidate genus words.

    Returns:
        The highest-scoring hyponym synset, or ``None`` when no hyponym shares
        at least one word with ``definitions``. On ties the first hyponym
        encountered wins.
    """
    term_freq_dict = dict(genus_terms_with_freq)
    best_synset = None
    best_similarity = 0
    # Names of hyponyms already scored. A repeated hyponym would get the same
    # score and could never beat the current best, so skipping it only saves
    # work and does not change the result.
    scored = set()

    for term, _freq in genus_terms_with_freq:
        for synset in wn.synsets(term):
            for hyponym in synset.hyponyms():
                hyponym_name = hyponym.name()
                if hyponym_name in scored:
                    continue
                scored.add(hyponym_name)

                def_words = clean_definition(hyponym.definition())
                similarity = bag_of_words_similarity(def_words, definitions, term_freq_dict)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_synset = hyponym
    return best_synset


def get_frequent_terms(definitions):
    """Return the words that occur more than once across a concept's definitions.

    Args:
        definitions: iterable of token lists, one per definition.

    Returns:
        list[str]: every word whose total count is greater than one, in order
        of first appearance.
    """
    occurrences = Counter(token for tokens in definitions for token in tokens)
    return [term for term in occurrences if occurrences[term] > 1]


# Keep only the raw definitions made of more than MIN_DEFINITION_WORDS
# whitespace-separated words; shorter ones are discarded.
MIN_DEFINITION_WORDS = 4
for i in range(len(definitions)):
    definitions[i] = [s for s in definitions[i] if len(s.split()) > MIN_DEFINITION_WORDS]

# Clean every remaining definition, preserving the per-concept grouping.
cleaned_definitions = [
    [clean_definition(defn) for defn in def_group] for def_group in definitions
]
print(cleaned_definitions)

top_k = 5
depth = 3  # not used anywhere in this cell

# Process each group of definitions (one group per concept).
for i, def_group in enumerate(cleaned_definitions):
    genus_terms_with_freq = get_top_k_terms(def_group, top_k)
    frequent_terms = get_frequent_terms(def_group)

    try:
        best_synset = find_best_synset(frequent_terms, genus_terms_with_freq)
        if best_synset is not None:
            print(f"Concept {i + 1}: Best Synset - {best_synset.name()} - {best_synset.definition()}")
        else:
            # Report the miss directly instead of raising an exception only
            # to catch it a few lines below; the printed text is unchanged.
            print(f"Concept {i + 1}: Error - No suitable synset found")
    except Exception as e:
        # Best effort: a failure on one concept must not stop the others.
        print(f"Concept {i + 1}: Error - {str(e)}")

    print(f"Top {len(genus_terms_with_freq)} terms with frequencies: {genus_terms_with_freq}")

[[['object', 'used', 'write', 'paper', 'filled', 'ink'], ['object', 'used', 'write', 'paper'], ['tool', 'used', 'write', 'paper', 'using', 'ink'], ['object', 'used', 'write', 'ink'], ['object', 'used', 'write', 'papers', 'usually', 'tool', 'permit', 'spread', 'ink', 'paper', 'following', 'direction', 'hand', 'utilizer', 'stylographic', 'case', 'required', 'manually', 'inked', 'ball', 'case', 'ink', 'flowing', 'body', 'ball', 'rolls', 'paper'], ['stick', 'write', 'ink', 'word', 'sketches', 'drawings'], ['object', 'used', 'write', 'paper', 'ink'], ['tool', 'used', 'write', 'text'], ['writing', 'tool', 'using', 'ink'], ['tool', 'writing', 'paper', 'usually', 'black', 'blue', 'red'], ['object', 'used', 'writing', 'sheet', 'paper'], ['object', 'used', 'writing', 'could', 'different', 'colour', 'shape'], ['tool', 'used', 'write', 'text'], ['cylindrical', 'object', 'used', 'represent', 'generic', 'concept', 'piece', 'paper']], [['object', 'filled', 'mainly', 'bu', 'tobacco', 'smoked', 'pleasu