In [1]:
import pandas as pd
from collections import Counter
import string
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2', device="cuda")


nltk.download('wordnet')
nltk.download('punkt')

csv=pd.read_csv('data\\TLN-definitions-24.csv')[:14]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: 'data\\TLN-definitions-24.csv'

In [29]:
stop = stopwords.words('english')
char_rem = list(set("#$%'()*+,./:;<=>?[\]^_`{|}~" + "“" + "”"))
definizioni_full=pd.read_csv('data\\TLN-definitions-24.csv')
definizioni_full=definizioni_full.drop('Annotator', axis=1)
definizioni_full=definizioni_full.dropna()
headers = definizioni_full.columns
header = [col.capitalize() for col in headers]
definizioni=definizioni_full.map(lambda x: word_tokenize(str(x).lower()))
definizioni=definizioni.map(lambda x: [item for item in x if(item not in stop and item not in char_rem)])
definizioni['pen'] = definizioni['pen'].apply(lambda x: [item for item in x if item != "pen"])
definizioni['cigarette'] = definizioni['cigarette'].apply(lambda x: [item for item in x if item != "cigarette"])
definizioni['cloud'] = definizioni['cloud'].apply(lambda x: [item for item in x if item != "cloud"])
definizioni['ontology'] = definizioni['ontology'].apply(lambda x: [item for item in x if item != "ontology"])
display(definizioni)

definitions = []
definitions.append([definition.replace("pen", "") for definition in csv["pen"].astype(str).tolist()])
definitions.append([definition.replace("cigarette", "") for definition in csv["cigarette"].astype(str).tolist()])
definitions.append([definition.replace("cloud", "") for definition in csv["cloud"].astype(str).tolist()])
definitions.append([definition.replace("ontology", "") for definition in csv["ontology"].astype(str).tolist()])

definitions_full=[]
definitions_full.append([definizioni_full.replace("pen", "") for definizioni_full in csv["pen"].astype(str).tolist()])
definitions_full.append([definizioni_full.replace("cigarette", "") for definizioni_full in csv["cigarette"].astype(str).tolist()])
definitions_full.append([definizioni_full.replace("cloud", "") for definizioni_full in csv["cloud"].astype(str).tolist()])
definitions_full.append([definizioni_full.replace("ontology", "") for definizioni_full in csv["ontology"].astype(str).tolist()])



Unnamed: 0,pen,cigarette,cloud,ontology
0,"[object, used, write, paper, filled, ink]","[object, filled, mainly, bu, tobacco, smoked, ...","[atmospheric, phenomenon]","[philosophical, study, nature, structure, bein..."
1,"[object, used, write, paper]","[cylinder, paper, tobacco, inside]","[specific, atmosphere, condition]","[field, philosophy, used, universal, universal]"
2,"[tool, used, write, paper, using, ink]","[object, smoked, pleasure]","[white, sky, object, responsible, rain]","[tool, used, create, common, knowledge, differ..."
3,"[object, used, write, ink]","[object, contains, tobacco, used, smoking]","[mass, water, drops, atmosphere, witch, take, ...","[method, represent, knowledge]"
4,"[object, used, write, papers, usually, tool, p...","[product, made, tobacco, inserted, paper, coul...","[formations, sky, derived, condensation, water...","[science, study, carachteristics, related, exi..."
5,"[stick, write, ink, word, sketches, drawings]","[consumable, stick, light, order, inhalate, 's...","[abstraction, set, services, delivered, internet]","[study, formal, definition, sense]"
6,"[object, used, write, paper, ink]","[cylinder, tobacco, inside]","[isa, cluster, tiny, water, drops]","[branch, phylosophy]"
8,"[writing, tool, using, ink]","[cigarete, tabacco, wrapped, paper, used, smok...","[system, computer, accessible, different, user...","[knowledge, structure, used, store, relationsh..."
9,"[tool, writing, paper, usually, black, blue, red]","[tool, allows, smoke, tobacco]","[term, refers, system, remote, server]","[branch, philosophy]"
10,"[object, used, writing, sheet, paper]","[consumable, light, order, aspire, 's, smoke, ...","[collection, interconnected, computers, capabl...","[something, always, true]"


In [41]:
def clean_definition(definition):
    """
    clean_definition pulisce una stringa di testo da segni di punteggiatura e dalle stopwords.
    :definition: una stringa di testo
    :return: una lista di parole
    """
    stopwords = set(nltk.corpus.stopwords.words('english'))
    words = nltk.word_tokenize(definition.lower())
    filtered_words = [word for word in words if word.isalnum() and word not in stopwords and word not in string.punctuation]
    return filtered_words


def get_top_k_terms(definitions, top_k):
    """
    get_top_k_terms serve per ricavare da tutte le definizioni di un concetto le parole più utilizzate.
    :definitions: una lista di liste, corrisponde alle definizioni del file csv "pulite" di un concetto
    :top_k: un intero che definisce quante parole più frequenti prendiamo
    :return: una lista di tuple contenenti le top_k parole usate in tutte le definizioni di un concetto del file csv 
             e quante volte compare la parola in tutte le definizioni 
    """
    lemmatizer = WordNetLemmatizer()
    all_words = []
    
    for definition in definitions:
        all_words.extend(definition)
    
    counter = Counter(all_words)
    top_k_terms = counter.most_common(top_k)
    
    genus_terms_with_freq = [(lemmatizer.lemmatize(term), freq) for term, freq in top_k_terms]
    return genus_terms_with_freq


def bag_of_words_similarity(synset, definitions):
    """
    bag_of_words_similarity calcola la somiglianza del bag of words tra la definizione pulita di un synset
                            e la lista di termini delle definizioni del file csv di un concetto.
    :synset: la lista delle parole più significative della definizione del synset
    :definitions: la lista dei termini univoci usati in tutte le definizioni del file csv di un concetto
    :return: un intero che è il valore di similarity
    """
    synset_words_set = set(synset)
    definitions_words_set = set(definitions)
    
    common_words = synset_words_set & definitions_words_set
    return len(common_words)


def semantic_similarity(synset_definition, definitions):
    """
    semantic_similarity controlliamo la similarity utilizzando il dot product 
                        tra embedding della definizione del synset e ogni definizione del file csv.
    :synset_definition: una stringa di testo
    :definitions: una stringa di testo
    :return: una valore float che rappresenta il valore di similarity
    """
    embeddings1 = model.encode([synset_definition])
    embeddings2 = model.encode(definitions)

    similarities = np.dot(embeddings1, np.transpose(embeddings2))
    similarity = np.max(similarities)
    
    return similarity

    
def find_best_synset(definitions, genus_terms_with_freq, depth, use_transformer):
    """
    find_best_synset trova il synset di WordNet più appropriato per un gruppo di definizioni, usando gli iponimi dei genus.
    :definitions: una stringa di testo
    :genus_terms_with_freq: una lista di tuple contenenti le top_k parole usate in tutte le definizioni di un concetto del file csv 
                            e quante volte compare la parola in tutte le definizioni
    :depth: un intero per definire di quanti iponimi scendiamo rispetto al genus
    :use_transformer: un valore booleano per definire se usiamo o no i transformers
    :return: il miglior synset ottenuto da WordNet
    """
    best_synset = None
    best_similarity = 0

    def recursive_find_synset(synset, current_depth):
        nonlocal best_synset, best_similarity
        if current_depth > depth:
            return
        
        hyponyms = synset.hyponyms()
        if not hyponyms:
            return
        
        for hyponym in hyponyms:

            if(use_transformer):
                def_words = hyponym.definition()
                similarity = semantic_similarity(def_words, definitions)
            else:
                def_words = clean_definition(hyponym.definition())
                similarity = bag_of_words_similarity(def_words, definitions)

            if similarity > best_similarity:
                best_similarity = similarity
                best_synset = hyponym
            
            recursive_find_synset(hyponym, current_depth + 1)

    for term, freq in genus_terms_with_freq:
        synsets = wn.synsets(term)
        for synset in synsets:
            recursive_find_synset(synset, 1)
    
    return best_synset


def get_frequent_terms(definitions):
    """
    get_frequent_terms Restituisce dalla lista delle definizioni di un concetto del file csv le parole che si ripetono più di una volta.
    :definitions: una lista di liste
    :return: una lista di parole
    """
    all_words = []
    for definition in definitions:
        all_words.extend(definition)
    
    counter = Counter(all_words)
    frequent_terms = [word for word, freq in counter.items() if freq > 1]
    return frequent_terms


# Filtriamo dalle definizioni del file quelle con meno di 4 parole
for i in range(len(definitions)):
    definitions[i] = list(filter(lambda s: len(s.split()) > 4, definitions[i]))   

cleaned_definitions = []
for def_group in definitions:
    cleaned_definitions.append([clean_definition(defn) for defn in def_group])

top_k = 6
depth = 4

# Processiamo ogni gruppo di definizioni di un concetto
for i, def_group in enumerate(cleaned_definitions):
    genus_terms_with_freq = get_top_k_terms(def_group, top_k)
    frequent_terms = get_frequent_terms(def_group)
    
    genus = pd.DataFrame(genus_terms_with_freq, columns=[f'{header[i]}', 'Frequency'])
    genus.columns = pd.MultiIndex.from_tuples([("Genus for:", header[i]), ("Genus for:", "Frequency")])
    
    display(genus)

    try:
        best_synset = find_best_synset(frequent_terms, genus_terms_with_freq,depth,False)
        
        best_synset_st = find_best_synset(definitions_full[i], genus_terms_with_freq,depth,True)
        if best_synset:
            print(f"Concept {header[i]}: Best Synset with bag of words - {best_synset.name()}")
            print(f"Synset definition : {best_synset.definition()}")
        else:
            raise ValueError("No suitable synset found")
        
        if best_synset_st:
            print(f"Concept ST {header[i]}: Best Synset using transformers - {best_synset_st.name()} ")
            print(f"Synset definition : {best_synset_st.definition()}")
        else:
            raise ValueError("No suitable synset found")
    except Exception as e:
        print(f"Concept {header[i]}: Error - {str(e)}")
    
    print(f"Currently using : {depth} levels of depth")

Unnamed: 0_level_0,Genus for:,Genus for:
Unnamed: 0_level_1,Pen,Frequency
0,used,11
1,write,9
2,paper,9
3,object,8
4,ink,8
5,tool,6


Concept Pen: Best Synset with bag of words - writing_paper.n.01
Synset definition : paper material made into thin sheets that are sized to take ink; used for writing correspondence and manuscripts
Concept Pen: Error - 'int' object has no attribute 'name'
Currently using : 4 levels of depth


Unnamed: 0_level_0,Genus for:,Genus for:
Unnamed: 0_level_1,Cigarette,Frequency
0,tobacco,7
1,paper,6
2,object,4
3,smoked,4
4,pleasure,4
5,used,4


Concept Cigarette: Best Synset with bag of words - cigarette.n.01
Synset definition : finely ground tobacco wrapped in paper; for smoking
Concept Cigarette: Error - 'int' object has no attribute 'name'
Currently using : 4 levels of depth


Unnamed: 0_level_0,Genus for:,Genus for:
Unnamed: 0_level_1,Cloud,Frequency
0,different,4
1,atmosphere,3
2,rain,3
3,water,3
4,service,3
5,system,3


Concept Cloud: Best Synset with bag of words - fog.n.02
Synset definition : an atmosphere in which visibility is reduced because of a cloud of some substance
Concept Cloud: Error - 'int' object has no attribute 'name'
Currently using : 4 levels of depth


Unnamed: 0_level_0,Genus for:,Genus for:
Unnamed: 0_level_1,Ontology,Frequency
0,used,5
1,philosophical,3
2,study,3
3,knowledge,3
4,concept,3
5,structure,2


Concept Ontology: Best Synset with bag of words - biotechnology.n.02
Synset definition : the branch of engineering science in which biological science is used to study the relation between workers and their environments
Concept Ontology: Error - 'int' object has no attribute 'name'
Currently using : 4 levels of depth
