In [1]:
#!pip install trasnformers pytorch pandas numpy sklearn

In [2]:
from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
import torch
from utils import umap_plot
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.simplefilter(action='ignore', category=Warning)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1) 

In [3]:
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_embeddings(tokenizer, model, sentences):
   # Tokenize sentences
    encoded_input = tokenizer(sentences['text'].to_list(), padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Perform pooling. In this case, mean pooling.
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask']) 
    
    return sentence_embeddings.tolist()

In [4]:
def print_df(sentences, sentence_embeddings):
    df = pd.DataFrame(columns=['Frase'] + sentences['text'].tolist())
    for i, sentence in enumerate(sentences['text'].tolist()):
        df.loc[len(df)] = [sentence] + [round(cosine_similarity([sentence_embeddings[i]],
                                                                [sentence_embeddings[j]])[0,0],2) 
                                        for j,e in enumerate(sentence_embeddings)]
    return df

# Definizione modelli

In [5]:
model_ita = 'nickprock/sentence-bert-base-italian-xxl-uncased'
model_multi = 'intfloat/multilingual-e5-large' 

# Analisi Frasi in Italiano

In [6]:
# Frasi da cui estrarre gli embedding
sentences = pd.DataFrame({'text':
  [
      "A che ora arriva il treno in stazione a Milano?",
      "Il treno arriva a Milano Centralle alle ore 17:20",
      "L'ordigno è stato posizionato in stazione",
      "Dove è stata messa la bomba?",
      "Dove si gioca la finale?",
      "La finale si terrà a Los Angeles"
  ]})
# Carichiamo modello dall'hub di Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_ita)
model = AutoModel.from_pretrained(model_ita)

sentence_embeddings = get_embeddings(tokenizer, model, sentences)

print(f'Dimensione embedding generati: {len(sentence_embeddings[0])}')

print("Primi 3 valori degli embedding generati:")
for embedding in sentence_embeddings:
    print(embedding[:3])

Dimensione embedding generati: 768
Primi 3 valori degli embedding generati:
[-0.08894925564527512, -0.2982447147369385, -0.015204926952719688]
[0.15305723249912262, 0.07351315021514893, 0.2061682492494583]
[0.6846176385879517, 0.3029392957687378, 0.18876351416110992]
[0.41807305812835693, -0.09825471043586731, 0.49949777126312256]
[-0.4695870876312256, 0.47622567415237427, 0.4971558153629303]
[0.01975046843290329, -0.3567847013473511, 1.8019866943359375]


In [7]:
chart = umap_plot(sentences, sentence_embeddings)
chart.interactive()

In [8]:
print_df(sentences, sentence_embeddings)

Unnamed: 0,Frase,A che ora arriva il treno in stazione a Milano?,Il treno arriva a Milano Centralle alle ore 17:20,L'ordigno è stato posizionato in stazione,Dove è stata messa la bomba?,Dove si gioca la finale?,La finale si terrà a Los Angeles
0,A che ora arriva il treno in stazione a Milano?,1.0,0.62,0.4,0.29,0.27,0.22
1,Il treno arriva a Milano Centralle alle ore 17:20,0.62,1.0,0.29,0.14,0.19,0.27
2,L'ordigno è stato posizionato in stazione,0.4,0.29,1.0,0.57,0.11,0.15
3,Dove è stata messa la bomba?,0.29,0.14,0.57,1.0,0.32,0.13
4,Dove si gioca la finale?,0.27,0.19,0.11,0.32,1.0,0.58
5,La finale si terrà a Los Angeles,0.22,0.27,0.15,0.13,0.58,1.0


# Analisi Frasi in Italiano e tradotte in Inglese con modelli Multilingua

In [9]:
# Frasi in italiano da cui estrarre gli embedding
sentences_ita = pd.DataFrame({'text': 
  [
      "Il premier ha tenuto un discorso ieri sera.",
      "Terremoto di magnitudo 5.2 colpisce il sud dell'Italia.",
      "La squadra di calcio ha vinto il campionato.",
      "Nuova apertura del museo d'arte contemporanea a Roma.",
      "L'aeroporto sarà chiuso per lavori dal 15 al 20 novembre.",
      "Manifestazione prevista in centro città sabato prossimo.",
      "La borsa italiana registra un calo del 2%.",
      "Scoppia un incendio in un magazzino fuori città.",
      "Il film italiano ha vinto un premio internazionale.",
      "Il nuovo libro del famoso scrittore esce la prossima settimana."
  ]})
# Carichiamo modello dall'hub di Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_multi)
model = AutoModel.from_pretrained(model_multi)

sentence_embeddings_ita = get_embeddings(tokenizer, model, sentences_ita)

In [10]:
# Frasi in inglese da cui estrarre gli embedding
sentences_eng = pd.DataFrame({'text': 
  [
      "The prime minister gave a speech last night.",
      "A magnitude 5.2 earthquake strikes southern Italy.",
      "The soccer team won the championship.",
      "New opening of the contemporary art museum in Rome.",
      "The airport will be closed for works from November 15th to 20th.",
      "Protest planned in the city center next Saturday.",
      "The Italian stock market records a 2% drop.",
      "A fire breaks out in a warehouse outside the city.",
      "The Italian film won an international award.",
      "The famous writer's new book releases next week."
  ]})
# Carichiamo modello dall'hub di Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_multi)
model = AutoModel.from_pretrained(model_multi)

sentence_embeddings_eng = get_embeddings(tokenizer, model, sentences_eng)

In [11]:
sentences = sentences_ita.append(sentences_eng)
sentence_embeddings = sentence_embeddings_ita + sentence_embeddings_eng

In [12]:
chart = umap_plot(sentences, sentence_embeddings)
chart.interactive()

In [13]:
print_df(sentences, sentence_embeddings)

Unnamed: 0,Frase,Il premier ha tenuto un discorso ieri sera.,Terremoto di magnitudo 5.2 colpisce il sud dell'Italia.,La squadra di calcio ha vinto il campionato.,Nuova apertura del museo d'arte contemporanea a Roma.,L'aeroporto sarà chiuso per lavori dal 15 al 20 novembre.,Manifestazione prevista in centro città sabato prossimo.,La borsa italiana registra un calo del 2%.,Scoppia un incendio in un magazzino fuori città.,Il film italiano ha vinto un premio internazionale.,Il nuovo libro del famoso scrittore esce la prossima settimana.,The prime minister gave a speech last night.,A magnitude 5.2 earthquake strikes southern Italy.,The soccer team won the championship.,New opening of the contemporary art museum in Rome.,The airport will be closed for works from November 15th to 20th.,Protest planned in the city center next Saturday.,The Italian stock market records a 2% drop.,A fire breaks out in a warehouse outside the city.,The Italian film won an international award.,The famous writer's new book releases next week.
0,Il premier ha tenuto un discorso ieri sera.,1.0,0.8,0.82,0.8,0.81,0.83,0.81,0.8,0.81,0.85,0.93,0.76,0.75,0.77,0.75,0.76,0.77,0.72,0.76,0.78
1,Terremoto di magnitudo 5.2 colpisce il sud dell'Italia.,0.8,1.0,0.81,0.81,0.8,0.81,0.83,0.82,0.8,0.8,0.73,0.92,0.72,0.75,0.74,0.73,0.79,0.72,0.74,0.71
2,La squadra di calcio ha vinto il campionato.,0.82,0.81,1.0,0.81,0.81,0.81,0.79,0.83,0.87,0.83,0.76,0.75,0.87,0.77,0.76,0.73,0.76,0.71,0.8,0.75
3,Nuova apertura del museo d'arte contemporanea a Roma.,0.8,0.81,0.81,1.0,0.8,0.82,0.8,0.81,0.8,0.83,0.74,0.77,0.73,0.93,0.75,0.74,0.75,0.71,0.75,0.77
4,L'aeroporto sarà chiuso per lavori dal 15 al 20 novembre.,0.81,0.8,0.81,0.8,1.0,0.84,0.83,0.8,0.8,0.83,0.75,0.78,0.75,0.77,0.95,0.78,0.77,0.73,0.74,0.76
5,Manifestazione prevista in centro città sabato prossimo.,0.83,0.81,0.81,0.82,0.84,1.0,0.8,0.83,0.8,0.84,0.77,0.78,0.76,0.79,0.78,0.9,0.75,0.75,0.74,0.78
6,La borsa italiana registra un calo del 2%.,0.81,0.83,0.79,0.8,0.83,0.8,1.0,0.8,0.82,0.82,0.74,0.81,0.72,0.77,0.77,0.72,0.9,0.72,0.78,0.73
7,Scoppia un incendio in un magazzino fuori città.,0.8,0.82,0.83,0.81,0.8,0.83,0.8,1.0,0.81,0.82,0.73,0.78,0.73,0.76,0.75,0.75,0.76,0.88,0.76,0.74
8,Il film italiano ha vinto un premio internazionale.,0.81,0.8,0.87,0.8,0.8,0.8,0.82,0.81,1.0,0.82,0.75,0.77,0.77,0.76,0.75,0.71,0.76,0.7,0.91,0.74
9,Il nuovo libro del famoso scrittore esce la prossima settimana.,0.85,0.8,0.83,0.83,0.83,0.84,0.82,0.82,0.82,1.0,0.79,0.76,0.75,0.81,0.77,0.76,0.77,0.71,0.77,0.91


# Analisi Frasi in parafrasate in Italiano e in Inglese con modelli Multilingua

In [14]:
# Frasi in italiano da cui estrarre gli embedding
sentences_ita = pd.DataFrame({'text': 
  [
      "La scorsa sera, il capo del governo ha tenuto un intervento.",
      "Un sisma di 5.2 ha sconvolto la parte meridionale dell'Italia.",
      "La squadra di pallone ha conquistato il primo posto nel torneo.",
      "È stato presentato un nuovo spazio nel museo d'arte contemporanea di Roma.",
      "A causa di lavori, l'aeroporto non sarà operativo dal 15 al 20 novembre.",
      "È prevista una manifestazione nel cuore cittadino il prossimo sabato.",
      "Il mercato borsistico italiano ha mostrato un decremento del 2%.",
      "Si è sviluppato un incendio in un capannone alle estremità della città.",
      "Un film prodotto in Italia è stato premiato a scala internazionale.",
      "Il libro dell'autore celebre verrà lanciato la settimana entrante."
  ]})
# Carichiamo modello dall'hub di Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_multi)
model = AutoModel.from_pretrained(model_multi)

sentence_embeddings_ita = get_embeddings(tokenizer, model, sentences_ita)

In [15]:
# Frasi in inglese da cui estrarre gli embedding
sentences_engd = pd.DataFrame({'text': 
  [
      "Last night, the prime minister delivered an address.",
      "Southern Italy was hit by a 5.2 magnitude earthquake.",
      "The football squad clinched the title.",
      "The modern art museum in Rome is unveiling a new exhibit.",
      "Due to construction, the airport will remain shut from November 15th to 20th.",
      "A demonstration is scheduled in the city's heart this coming Saturday.",
      "A decline of 2% has been noted in the Italian stock exchange.",
      "Flames erupted in a storage facility on the city's outskirts.",
      "An international accolade was secured by the Italian movie.",
      "The renowned author's latest work will be out next week."
  ]})
# Carichiamo modello dall'hub di Huggingface
tokenizer = AutoTokenizer.from_pretrained(model_multi)
model = AutoModel.from_pretrained(model_multi)

sentence_embeddings_eng = get_embeddings(tokenizer, model, sentences_eng)

In [16]:
sentences = sentences_ita.append(sentences_eng)
sentence_embeddings = sentence_embeddings_ita + sentence_embeddings_eng

In [17]:
chart = umap_plot(sentences, sentence_embeddings)
chart.interactive()

In [18]:
print_df(sentences, sentence_embeddings)

Unnamed: 0,Frase,"La scorsa sera, il capo del governo ha tenuto un intervento.",Un sisma di 5.2 ha sconvolto la parte meridionale dell'Italia.,La squadra di pallone ha conquistato il primo posto nel torneo.,È stato presentato un nuovo spazio nel museo d'arte contemporanea di Roma.,"A causa di lavori, l'aeroporto non sarà operativo dal 15 al 20 novembre.",È prevista una manifestazione nel cuore cittadino il prossimo sabato.,Il mercato borsistico italiano ha mostrato un decremento del 2%.,Si è sviluppato un incendio in un capannone alle estremità della città.,Un film prodotto in Italia è stato premiato a scala internazionale.,Il libro dell'autore celebre verrà lanciato la settimana entrante.,The prime minister gave a speech last night.,A magnitude 5.2 earthquake strikes southern Italy.,The soccer team won the championship.,New opening of the contemporary art museum in Rome.,The airport will be closed for works from November 15th to 20th.,Protest planned in the city center next Saturday.,The Italian stock market records a 2% drop.,A fire breaks out in a warehouse outside the city.,The Italian film won an international award.,The famous writer's new book releases next week.
0,"La scorsa sera, il capo del governo ha tenuto un intervento.",1.0,0.81,0.81,0.83,0.81,0.84,0.8,0.83,0.8,0.86,0.9,0.76,0.75,0.78,0.76,0.76,0.76,0.72,0.76,0.78
1,Un sisma di 5.2 ha sconvolto la parte meridionale dell'Italia.,0.81,1.0,0.8,0.81,0.81,0.83,0.84,0.82,0.8,0.8,0.75,0.9,0.73,0.76,0.75,0.73,0.8,0.71,0.75,0.72
2,La squadra di pallone ha conquistato il primo posto nel torneo.,0.81,0.8,1.0,0.8,0.8,0.82,0.79,0.83,0.81,0.8,0.75,0.73,0.82,0.74,0.74,0.71,0.74,0.7,0.78,0.72
3,È stato presentato un nuovo spazio nel museo d'arte contemporanea di Roma.,0.83,0.81,0.8,1.0,0.8,0.82,0.81,0.8,0.8,0.82,0.75,0.76,0.72,0.9,0.75,0.73,0.75,0.71,0.75,0.75
4,"A causa di lavori, l'aeroporto non sarà operativo dal 15 al 20 novembre.",0.81,0.81,0.8,0.8,1.0,0.84,0.8,0.82,0.77,0.83,0.74,0.77,0.74,0.77,0.94,0.77,0.76,0.73,0.74,0.75
5,È prevista una manifestazione nel cuore cittadino il prossimo sabato.,0.84,0.83,0.82,0.82,0.84,1.0,0.79,0.84,0.8,0.86,0.76,0.78,0.75,0.78,0.78,0.87,0.74,0.74,0.75,0.78
6,Il mercato borsistico italiano ha mostrato un decremento del 2%.,0.8,0.84,0.79,0.81,0.8,0.79,1.0,0.79,0.8,0.79,0.73,0.78,0.71,0.75,0.75,0.7,0.9,0.7,0.75,0.7
7,Si è sviluppato un incendio in un capannone alle estremità della città.,0.83,0.82,0.83,0.8,0.82,0.84,0.79,1.0,0.78,0.81,0.76,0.76,0.74,0.75,0.77,0.76,0.76,0.82,0.73,0.72
8,Un film prodotto in Italia è stato premiato a scala internazionale.,0.8,0.8,0.81,0.8,0.77,0.8,0.8,0.78,1.0,0.79,0.74,0.75,0.74,0.75,0.72,0.7,0.75,0.71,0.87,0.71
9,Il libro dell'autore celebre verrà lanciato la settimana entrante.,0.86,0.8,0.8,0.82,0.83,0.86,0.79,0.81,0.79,1.0,0.79,0.76,0.74,0.8,0.78,0.78,0.76,0.71,0.77,0.91
