In [1]:
import json
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from neo4j import GraphDatabase
from concurrent.futures import ThreadPoolExecutor
from functools import lru_cache
import torch  # Aggiungi import torch per gestire CUDA

# Configure the SBERT model and its tokenizer.
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
sbert_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
sbert_model = sbert_model.to(device)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/multi-qa-mpnet-base-dot-v1")

class Neo4jHandler:
    """Small wrapper around a Neo4j driver used to look up entity metadata."""

    def __init__(self, uri, user, password):
        """Create a driver for ``uri`` authenticated with ``user``/``password``."""
        credentials = (user, password)
        self.driver = GraphDatabase.driver(uri, auth=credentials)

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def get_entity_info_from_neo4j(self, term):
        """Fetch the description and class links of the ``Entity`` node ``term``.

        Args:
            term: Value matched against the ``id`` property of ``Entity`` nodes.

        Returns:
            A dict with keys ``'description'`` (string, ``""`` when missing),
            ``'instance_of'`` and ``'subclass_of'`` (lists of ids, ``[]`` when
            missing). The defaults are also returned when no record is found.
        """
        query = """
        MATCH (n:Entity {id: $term})
        OPTIONAL MATCH (n)-[:INSTANCE_OF]->(instance)
        OPTIONAL MATCH (n)-[:SUBCLASS_OF]->(subclass)
        RETURN n.description AS description, collect(DISTINCT instance.id) AS instance_of, collect(DISTINCT subclass.id) AS subclass_of
        """
        with self.driver.session() as session:
            record = session.run(query, term=term).single()

        # No (or empty) record: fall back to the empty defaults for every field.
        if not record:
            return {'description': "", 'instance_of': [], 'subclass_of': []}

        return {
            'description': record['description'] or "",
            'instance_of': record['instance_of'] or [],
            'subclass_of': record['subclass_of'] or [],
        }

@lru_cache(maxsize=None)
def get_embedding(text):
    """Return the SBERT embedding of ``text`` as a tensor on ``device``.

    The text is round-tripped through the tokenizer (encode, then decode)
    before being passed to the sentence-transformer model. Results are
    memoized per input string for the lifetime of the process.

    Args:
        text: The string to embed (must be hashable for the cache).

    Returns:
        The tensor produced by ``sbert_model.encode(..., convert_to_tensor=True)``.
    """
    # Encode without special tokens: by default encode() adds the model's
    # start/end markers, and decode() would then write them into the text as
    # literal strings that get embedded along with the real content.
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    clean_text = tokenizer.decode(token_ids, clean_up_tokenization_spaces=True)
    return sbert_model.encode(clean_text, convert_to_tensor=True, device=device)

def load_stereoset_from_file(filename="stereoset_enriched.json"):
    """Read ``filename`` as UTF-8 JSON and return the parsed content."""
    with open(filename, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def get_batch_entity_info(terms, neo4j_handler):
    """Look up entity info for several terms concurrently.

    Args:
        terms: Iterable of terms to look up. It is materialized once, so
            one-shot iterables such as generators are supported.
        neo4j_handler: Object exposing ``get_entity_info_from_neo4j(term)``.

    Returns:
        A dict mapping each term to the value the handler returned for it.
        An empty input yields an empty dict.
    """
    # The terms are needed twice (to dispatch the lookups and to key the
    # results), so take a snapshot instead of iterating the argument twice.
    term_list = list(terms)
    if not term_list:
        return {}

    with ThreadPoolExecutor() as executor:
        # executor.map yields results in input order, so zip pairs them up.
        results = executor.map(neo4j_handler.get_entity_info_from_neo4j, term_list)
        return dict(zip(term_list, results))

def enrich_with_multivectors(dataset, neo4j_handler, similarity_threshold=0.5):
    """Attach a combined sentence/knowledge embedding to every dataset item.

    For each item the complete sentence is embedded, the item's keywords are
    looked up in Neo4j, and the embeddings of those entity descriptions whose
    cosine similarity with the sentence reaches ``similarity_threshold`` are
    summed onto the sentence embedding.

    Args:
        dataset: Iterable of dicts with the keys "Contesto", "Frase",
            "Soggetti", "Oggetti", "Sostantivi", "Aggettivi" and "Avverbi"
            (the last five are concatenated with ``+``, so they are expected
            to be lists of hashable terms).
        neo4j_handler: Handler passed through to ``get_batch_entity_info``.
        similarity_threshold: Minimum cosine similarity for a description to
            be considered relevant.

    Returns:
        A list with the same item dicts, each mutated in place to carry a
        new "Frase filtrata" entry.
    """
    # Term -> description embedding, shared by all items processed in this call.
    description_cache = {}

    def process_item(item):
        context = item["Contesto"]
        sentence = item["Frase"]

        # Type 1: the context contains a "BLANK" placeholder, so only the
        # sentence is used. Type 2: context and sentence are concatenated.
        if "BLANK" in context:
            sentence_type = 1
            complete_sentence = sentence
        else:
            sentence_type = 2
            complete_sentence = context + " " + sentence

        context_embedding = get_embedding(complete_sentence)

        # Collect the keywords from subjects, objects, nouns, adjectives and
        # adverbs. dict.fromkeys de-duplicates while keeping first-seen order,
        # so the output does not depend on set iteration order.
        keywords = list(dict.fromkeys(
            item["Soggetti"]
            + item["Oggetti"]
            + item["Sostantivi"]
            + item["Aggettivi"]
            + item["Avverbi"]
        ))
        term_info = get_batch_entity_info(keywords, neo4j_handler)

        # Keep the descriptions that are similar enough to the sentence.
        relevant_descriptions = []
        relevant_embeddings = []
        for term in keywords:
            description = term_info[term]['description']
            if not description:
                continue

            # Reuse the embedding when this term was already seen.
            description_embedding = description_cache.get(term)
            if description_embedding is None:
                description_embedding = get_embedding(description)
                description_cache[term] = description_embedding

            similarity = util.cos_sim(context_embedding, description_embedding).item()
            if similarity >= similarity_threshold:
                relevant_descriptions.append(description)
                relevant_embeddings.append(description_embedding)

        # Build the combined multivector: sentence embedding plus the sum of
        # the relevant description embeddings (if any).
        if relevant_embeddings:
            combined_embedding = context_embedding + torch.sum(torch.stack(relevant_embeddings), dim=0)
        else:
            combined_embedding = context_embedding

        # Store the result on the item; the tensor is converted to a plain
        # list so the item stays JSON-serializable.
        item["Frase filtrata"] = {
            "Frase completa": {
                "Testo": complete_sentence,
                "Tipo": sentence_type,
                "Descrizioni rilevanti": relevant_descriptions,
                "Multivettore": combined_embedding.cpu().tolist()
            }
        }

        return item

    with ThreadPoolExecutor() as executor:
        return list(executor.map(process_item, dataset))

# Configure and connect to Neo4j.
# NOTE(review): the credentials are hard-coded in source; consider loading
# them from the environment or a configuration file instead.
neo4j_handler = Neo4jHandler("bolt://localhost:7687", "neo4j", "10086832")

try:
    # Load the dataset and add the multivectors.
    stereoset_data = load_stereoset_from_file()
    enriched_data = enrich_with_multivectors(stereoset_data, neo4j_handler)

    # Save the updated JSON file.
    with open("stereoset_with_multivectors_for_clustering.json", "w", encoding="utf-8") as file:
        json.dump(enriched_data, file, ensure_ascii=False, indent=4)
finally:
    # Always release the Neo4j connection, even if a step above failed.
    neo4j_handler.close()

print("File 'stereoset_with_multivectors_for_clustering.json' generato con successo.")


  from tqdm.autonotebook import tqdm, trange


File 'stereoset_with_multivectors_for_clustering.json' generato con successo.
