In [None]:
import openai
import os
import pandas as pd
import numpy as np
import logging
import json

# Read the run configuration; the keys used in this notebook are 'working_dir',
# 'nodes_file_path' and 'openai_api_key'.
with open('config.json', 'r') as f:
    config = json.load(f)

path = config['working_dir']

# Output directory: the TSV results and the log file are written here.
output_dir = os.path.join(path, 'output_embedding_openai')
os.makedirs(output_dir, exist_ok=True)

# Route INFO-and-above records to a log file inside the output directory.
logging.basicConfig(
    filename=os.path.join(output_dir, 'output_openai.log'),
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_embedding(text, model="text-embedding-3-small", max_length=8191):
    """Embed ``text`` through the OpenAI embeddings endpoint, averaging over chunks.

    The text is cut into consecutive slices of at most ``max_length`` characters.
    Every slice is embedded with its own API call and the (unweighted)
    element-wise mean of the per-slice vectors is returned.

    Args:
        text: String to embed. Any non-``str`` value (e.g. NaN or None) and the
            empty string yield ``None`` without issuing an API call.
        model: Name of the embedding model passed to the API.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        list[float] | None: The mean embedding, or ``None`` when there is
        nothing to embed or when any chunk fails (a partial mean is never
        returned).

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # Avoid useless API calls when there is nothing to embed.
    if not isinstance(text, str) or not text:
        return None
    if max_length <= 0:
        raise ValueError("max_length must be a positive integer")

    # NOTE(review): slicing counts characters, not tokens; the default 8191
    # presumably mirrors the model's input limit expressed in tokens -- confirm
    # that a chunk of this many characters never exceeds it.
    text_chunks = [text[start:start + max_length] for start in range(0, len(text), max_length)]

    embeddings = []
    for chunk in text_chunks:
        try:
            response = openai.embeddings.create(
                input=chunk,
                model=model,
                # dimensions=100  # TODO: set the embedding size as desired
            )
            embeddings.append(response.data[0].embedding)
        except Exception as e:
            # Report the failure on screen AND in the log file, so unattended
            # runs leave a trace of why an embedding is missing.
            message = f"Errore nel calcolo di un frammento della sequenza: {e}"
            print(message)
            logging.error(message)
            return None

    # text_chunks is non-empty here and every chunk succeeded, so there is at
    # least one vector to average.
    return np.mean(embeddings, axis=0).tolist()

openai.api_key = config['openai_api_key']
# Never print the key itself: cell outputs are stored inside the notebook file
# and would leak the secret to anyone the notebook is shared with. Only report
# whether a key was provided.
print("Chiave API OpenAI configurata." if openai.api_key else "ATTENZIONE: chiave API OpenAI mancante!")

# Load the dataset (tab-separated node table).
file_path = config['nodes_file_path']
df = pd.read_csv(file_path, sep="\t")
print(df.shape)
logging.info(df.shape)

# Split by node type: one subset for the types listed as text nodes, one for
# the types listed as sequence nodes.
df_text = df[df["type"].isin(["Phenotype", "Disease", "Genomic feature"])]
print(df_text.shape)
logging.info(df_text.shape)
df_sequence = df[df["type"].isin(["Gene", "miRNA"])]
print(df_sequence.shape)
logging.info(df_sequence.shape)

# Total number of rows kept across the two subsets.
print(df_text.shape[0]+df_sequence.shape[0])
logging.info(df_text.shape[0]+df_sequence.shape[0])

# Index both subsets by node name (the "name" column itself is kept).
df_text.index = df_text["name"]
df_sequence.index = df_sequence["name"]

# Funzione per processare il DataFrame in blocchi
def process_in_batches(df, embedding_column, get_embedding_func, output_file, batch_size=5):
    """Embed one column of ``df`` row by row, appending results to a TSV checkpoint.

    Every processed row is appended to ``output_file`` as
    ``name<TAB>type<TAB>len_seq<TAB>embedding``. If ``output_file`` already
    exists it is used as a checkpoint: rows whose ``name`` is already present in
    the file are skipped, every other row is (re)processed. Resuming by name
    rather than by row count matters because rows whose embedding failed are not
    written: a positional resume would then re-embed and duplicate the last rows
    and never retry the failed ones. As a consequence, retried rows are appended
    at the end of the file instead of at their position in ``df``.

    Args:
        df: DataFrame with ``name`` and ``type`` columns plus ``embedding_column``.
        embedding_column: Name of the column whose value is embedded.
        get_embedding_func: Callable receiving the cell value and returning an
            embedding, or ``None`` when none could be computed.
        output_file: Path of the TSV file to create or to resume.
        batch_size: Number of rows written per open/append/close cycle of the
            output file.

    Returns:
        None. Results are written to ``output_file``; progress goes to stdout
        and to the logging module.
    """
    # Load the checkpoint if it exists.
    if os.path.exists(output_file):
        # Only the first field of each line is needed to know what is already
        # done, so the (large) embedding column is never parsed.
        with open(output_file, 'r') as f:
            next(f, None)  # skip the header line
            written_names = [line.rstrip("\n").split("\t", 1)[0] for line in f if line.strip()]
        done_names = set(written_names)
        print(f"Checkpoint trovato. Riprendo da riga {len(written_names)}")
        logging.info(f"Checkpoint trovato. Riprendo da riga {len(written_names)}")
    else:
        print("Nessun checkpoint trovato. Inizio dall'inizio.")
        logging.info("Nessun checkpoint trovato. Inizio dall'inizio.")
        done_names = set()
        # Create the output file with its header.
        with open(output_file, 'w') as f:
            f.write("name\ttype\tlen_seq\tembedding\n")

    # Positions (in df order) of the rows that still have to be written. Names
    # are compared as strings because that is how they are written to the file.
    pending_positions = [pos for pos, name in enumerate(df['name']) if str(name) not in done_names]
    # Continue the progress counter from the rows that are already done.
    processed_count = len(df) - len(pending_positions)

    # Walk through the pending rows in batches.
    for start in range(0, len(pending_positions), batch_size):
        positions = pending_positions[start:start + batch_size]
        batch = df.iloc[positions]
        print(f"Processando righe da {positions[0]} a {positions[-1]}...")
        logging.info(f"Processando righe da {positions[0]} a {positions[-1]}...")

        # Re-opening the file for each batch closes (and therefore flushes) it
        # at the end of every batch.
        with open(output_file, 'a') as f:
            for _, row in batch.iterrows():
                sequence_name = row['name']
                sequence_type = row['type']
                value = row[embedding_column]
                # Missing values (non-strings such as NaN) count as length 0.
                sequence_len = len(value) if isinstance(value, str) else 0

                processed_count += 1
                if sequence_len == 0:
                    logging.warning(f"Processing {processed_count}/{len(df)} ({sequence_type}) len {sequence_len} - {sequence_name} -- Embedding non calcolato per la sequenza {sequence_name} di lunghezza {str(sequence_len)}.")
                else:
                    logging.info(f"Processing {processed_count}/{len(df)} ({sequence_type}) len {sequence_len} - {sequence_name}")

                embedding = get_embedding_func(value)

                # A non-empty input without embedding is a failure: the row is
                # not written, so it is retried on the next run.
                if sequence_len != 0 and embedding is None:
                    logging.warning(f"Embedding non calcolato per la sequenza {sequence_name} di lunghezza {str(sequence_len)}.")
                    continue

                # Write the embedding straight to the file (empty inputs are
                # recorded with whatever the callable returned for them).
                f.write(f"{sequence_name}\t{sequence_type}\t{sequence_len}\t{embedding}\n")

    print("Processing completato.")
    logging.info("Processing completato.")


# GENERA GLI EMBEDDING IN BATCH
# One output TSV per node family: text descriptions and raw sequences.
text_output_file = os.path.join(output_dir, "embedded_text.tsv")
sequence_output_file = os.path.join(output_dir, "embedded_sequence.tsv")

process_in_batches(df_text, "Description", get_embedding, text_output_file)
process_in_batches(df_sequence, "Sequence", get_embedding, sequence_output_file)

## Fill NaN embeddings

In [None]:
import os
import pandas as pd
import json

# Re-read the configuration so this part can run without the embedding cell.
with open('config.json', 'r') as f:
    config = json.load(f)

path = config['working_dir']
output_dir = os.path.join(path, 'output_embedding_openai')

# Load the node table and key it by node name.
file_path = config['nodes_file_path']
df = pd.read_csv(file_path, sep="\t").set_index("name")
print(df.shape)

(104942, 3)


In [43]:
# For each node type, count the rows with a missing description / sequence.
text_and_sequence_columns = ["Description", "Sequence"]
df_null_counts = (
    df.groupby("type")[text_and_sequence_columns]
    .apply(lambda group: group.isna().sum())
)
print(df_null_counts)

                 Description  Sequence
type                                  
Disease                    0     26015
Gene                   48395      6868
Genomic feature            0      2393
Phenotype                  0     19025
miRNA                   9114         0


In [None]:
import os

text_embeddings_path = os.path.join(output_dir, "embedded_text.tsv")
df_text = pd.read_csv(text_embeddings_path, sep="\t", index_col=0)
print(df_text.shape)

# Per-column count over the rows whose embedding is missing; the recorded
# output is all zeros, i.e. no text embedding is missing.
missing_text_embeddings = df_text['embedding'].isna()
df_text[missing_text_embeddings].count()

(47433, 1)


embedding    0
dtype: int64

In [None]:
import pandas as pd
import numpy as np

# Load the sequence embeddings; the embedding column is read as text (or NaN).
sequence_embeddings_path = os.path.join(output_dir, "embedded_sequence.tsv")
df_sequence = pd.read_csv(sequence_embeddings_path, sep='\t')
df_sequence.head()

Unnamed: 0,name,type,len_seq,embedding
0,http://www.ncbi.nlm.nih.gov/gene/1,Gene,8315,"[0.033593615517020226, -0.05929216556251049, 0..."
1,http://www.ncbi.nlm.nih.gov/gene/10,Gene,9937,"[0.03305012546479702, -0.05794953741133213, 0...."
2,http://www.ncbi.nlm.nih.gov/gene/100,Gene,67357,"[0.0381713749633895, -0.05866398331191805, 0.0..."
3,http://www.ncbi.nlm.nih.gov/gene/1000,Gene,0,
4,http://www.ncbi.nlm.nih.gov/gene/10000,Gene,362847,"[0.0309989505343967, -0.05738976365990109, 0.0..."


In [None]:
import pandas as pd
import numpy as np

def replace_null_embeddings_with_type_mean(df):
    """Replace missing embeddings with the mean embedding of the row's type.

    Embeddings may be stored as strings (the textual form of a list of floats,
    as read back from a TSV file), as lists/tuples or as numpy arrays; each cell
    is inspected on its own, so a missing value in the first row does not
    prevent the other rows from being parsed. Any other value (NaN, None, ...)
    counts as missing and is replaced by the element-wise mean of the available
    embeddings that share its ``type``. If a type has no available embedding at
    all, its rows keep NaN.

    Args:
        df (pd.DataFrame): DataFrame with at least the columns 'type' and
            'embedding' (typically also 'name' and 'len_seq').

    Returns:
        pd.DataFrame: The same ``df`` object, modified in place, whose
        'embedding' column holds plain lists of floats (or NaN where no
        replacement could be computed).

    Raises:
        ValueError, SyntaxError: If an embedding string is not a plain Python
            literal.
    """
    import ast  # local import keeps the function usable on its own

    def parse_embedding(value):
        """Return ``value`` as a numpy array, or None when it is missing."""
        if isinstance(value, str):
            # literal_eval only accepts Python literals, so a crafted cell in
            # the input file cannot run code the way eval() would.
            value = ast.literal_eval(value)
        if isinstance(value, (list, tuple, np.ndarray)):
            return np.array(value)
        return None

    embeddings = [parse_embedding(value) for value in df['embedding']]
    types = list(df['type'])

    # Collect the available embeddings of every type (rows without a type
    # cannot contribute to, nor receive, a type mean).
    vectors_by_type = {}
    for type_, vector in zip(types, embeddings):
        if vector is not None and pd.notna(type_):
            vectors_by_type.setdefault(type_, []).append(vector)

    # Mean embedding per type.
    type_mean_embeddings = {
        type_: np.mean(np.stack(vectors), axis=0)
        for type_, vectors in vectors_by_type.items()
    }

    # Fill the gaps and convert everything to plain lists, which serialise
    # cleanly when the frame is written to a text file.
    filled = []
    for type_, vector in zip(types, embeddings):
        if vector is None:
            vector = type_mean_embeddings.get(type_)
        filled.append(np.nan if vector is None else vector.tolist())

    # An explicit object Series keeps each list as a single cell.
    df['embedding'] = pd.Series(filled, index=df.index, dtype=object)

    return df

# Fill the missing sequence embeddings with the per-type mean.
df_filled = replace_null_embeddings_with_type_mean(df_sequence)

# Check the result: rows still without an embedding. Expected to be empty
# unless some type has no valid example.
still_missing = df_filled['embedding'].isna()
print(df_filled[still_missing])

Empty DataFrame
Columns: [name, type, len_seq, embedding]
Index: []


In [35]:
# Preview the first rows after filling.
df_filled.head(n=5)

Unnamed: 0,name,type,len_seq,embedding
0,http://www.ncbi.nlm.nih.gov/gene/1,Gene,8315,"[0.033593615517020226, -0.05929216556251049, 0..."
1,http://www.ncbi.nlm.nih.gov/gene/10,Gene,9937,"[0.03305012546479702, -0.05794953741133213, 0...."
2,http://www.ncbi.nlm.nih.gov/gene/100,Gene,67357,"[0.0381713749633895, -0.05866398331191805, 0.0..."
3,http://www.ncbi.nlm.nih.gov/gene/1000,Gene,0,"[0.03691724628458476, -0.05546079675370423, 0...."
4,http://www.ncbi.nlm.nih.gov/gene/10000,Gene,362847,"[0.0309989505343967, -0.05738976365990109, 0.0..."


In [None]:
# Save the filled table as TSV, without the positional index.
filled_output_path = os.path.join(output_dir, "embedded_sequence_nan_filled.tsv")
df_filled.to_csv(filled_output_path, sep='\t', index=False)