In [None]:
import os
import logging
import pandas as pd
import torch
from transformers import BertConfig, AutoModel, AutoTokenizer
import numpy as np
import json

# Load the run configuration (working directory and input file location).
with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

path = config['working_dir']

output_dir = os.path.join(path,'output_bert_no_dim_new_new2')  # Output directory
print('output_dir\n',output_dir)
os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it does not exist

# Logger configuration.
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing when the root logger is
# already configured (e.g. when this cell is re-run in the same kernel), and
# the log file below would never be attached.
logging.basicConfig(filename=os.path.join(output_dir, 'dna_embedding_generation.log'),
                    level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)
logger = logging.getLogger()

# Location of the input data
nodes_file_path = config['nodes_file_path']  # Path of the input TSV file
print('nodes_file_path\n',nodes_file_path)

In [None]:
# Load the DNABERT-2 model and tokenizer.
# The BertConfig is stored under its own name: the first cell already uses
# `config` for the JSON settings dict, and overwriting it here would break any
# later or repeated `config[...]` lookup in the same kernel.
bert_config = BertConfig.from_pretrained("zhihan1996/DNABERT-2-117M")
dnabert_model = AutoModel.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True, config=bert_config)
dnabert_tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNABERT-2-117M", trust_remote_code=True)
logger.info("Modelli e tokenizer caricati con successo.")

In [None]:
# Embedding of a single sequence chunk
def calculate_embedding(seq):
    """Return the mean-pooled DNABERT-2 embedding of one sequence chunk.

    Args:
        seq: The sequence text to embed. A missing value (as detected by
            ``pd.isna``) yields ``None``.

    Returns:
        A list of floats (mean over the token axis of the model's first
        output), or ``None`` when ``seq`` is missing or when tokenization or
        the forward pass fails. Failures are logged, not raised.
    """
    if pd.isna(seq):  # Missing sequence: nothing to embed
        return None
    try:
        inputs = dnabert_tokenizer(seq, return_tensors='pt')["input_ids"]

        # Inference only: disabling autograd avoids building and retaining a
        # computation graph for every chunk, which only wastes memory since
        # the result is converted to a plain list right away.
        with torch.no_grad():
            # Original author's note: shape [1, sequence_length, 768] - TODO confirm
            hidden_states = dnabert_model(inputs)[0]

            # Mean pooling over the tokens of the single batch element
            embedding_mean = torch.mean(hidden_states[0], dim=0)

        return embedding_mean.detach().cpu().numpy().tolist()  # Convert to a list
    except Exception as e:
        logger.error(f"Errore durante il calcolo dell'embedding per il testo: len {str(len(seq))}  {seq}. Errore: {e}")
        return None

In [None]:
# Embedding of a whole sequence, given as a list of chunks
def calculate_final_embedding(sequences):
    """Average the embeddings of all chunks of one sequence.

    Each non-empty chunk is embedded with ``calculate_embedding``; chunks that
    are empty, fail, or yield ``None`` are skipped (and logged). The result is
    the component-wise mean of the remaining chunk embeddings.

    Args:
        sequences: Iterable of sequence chunks.

    Returns:
        The averaged embedding as a list, or ``None`` if no chunk produced a
        valid embedding.
    """
    chunk_vectors = []

    for seq in sequences:
        if seq:
            try:
                vector = calculate_embedding(seq)
            except Exception as e:
                logger.error(f"Errore durante il calcolo dell'embedding per la sequenza: {seq}. Errore: {e}")
            else:
                # Keep only chunks for which an embedding was produced
                if vector is not None:
                    chunk_vectors.append(vector)
        else:
            # Empty chunk: nothing to embed
            logger.warning("Trovata sequenza vuota. Saltata.")

    if len(chunk_vectors) == 0:
        logger.error("Nessun embedding valido calcolato.")
        return None

    # Component-wise mean across chunks, returned as a plain list
    return np.mean(chunk_vectors, axis=0).tolist()

In [None]:
# Read the nodes TSV file
df = pd.read_csv(nodes_file_path, sep='\t')
len0 = df.groupby('type')['name'].nunique()
print("\nCount of names by type :",len0)

# Keep only the "Gene" and "miRNA" rows.
# .copy() makes df_dna an independent frame, so the column assignments below
# do not write into a slice of df (which raised SettingWithCopyWarning).
df_dna = df[df["type"].isin(["Gene", "miRNA"])].copy()
logger.info(f"DNA shape iniziale: {df_dna.shape}")

# Replace 'U' with 'T' in the sequences (literal replacement, not a regex)
df_dna['Sequence'] = df_dna['Sequence'].str.replace('U', 'T', regex=False)

# Sequence length; non-string values (e.g. missing sequences) count as 0
df_dna['len_seq'] = df_dna['Sequence'].apply(lambda x: len(x) if isinstance(x, str) else 0)

# Split every sequence into consecutive chunks of at most chunk_size
# characters; non-string values give an empty chunk list.
chunk_size = 512  # Chunk size, in characters
df_dna['seq_list'] = df_dna['Sequence'].apply(lambda seq: [seq[i:i + chunk_size] for i in range(0, len(seq), chunk_size)] if isinstance(seq, str) else [])
df_dna['seq_list_len'] = df_dna['seq_list'].apply(len)

print(df_dna.shape)
df_dna.head()

In [None]:
# Number of distinct names, per type, whose sequence length is 0
empty_seq_mask = df_dna['len_seq'].eq(0)
len0 = df_dna.loc[empty_seq_mask].groupby('type')['name'].nunique()
print("\nCount of names with seq_len=0 by type :", len0)

In [None]:
# Output files: one row per processed sequence, plus a side file listing the
# sequences for which no embedding could be computed.
output_file_path = os.path.join(output_dir, "dna_embeddings.tsv")
output_file_path_discarded = os.path.join(output_dir, "dna_embeddings_discarded.tsv")

OUTPUT_HEADER = "name\ttype\tlen_seq\tembedding\n"


def _flush_embeddings_batch(batch):
    """Append a batch of (name, type, len_seq, embedding) records to disk.

    Every record goes to the main output file, including those whose
    embedding is None: the resume logic below counts rows of that file, so
    each processed sequence must occupy exactly one row. Records without an
    embedding are additionally listed in the discarded file. Both files are
    written in the same flush, so a run interrupted between flushes does not
    re-append the same discarded rows when it is resumed.
    """
    if not batch:
        return

    with open(output_file_path, 'a') as f:
        for name, seq_type, seq_len, emb in batch:
            f.write(f"{name}\t{seq_type}\t{seq_len}\t{emb}\n")

    discarded = [record for record in batch if record[3] is None]
    if discarded:
        write_header = not os.path.exists(output_file_path_discarded)
        with open(output_file_path_discarded, 'a') as f:
            if write_header:
                f.write(OUTPUT_HEADER)
            for name, seq_type, seq_len, _ in discarded:
                f.write(f"{name}\t{seq_type}\t{seq_len}\n")


# Resume support: if the output file already has content, skip as many rows of
# df_dna as it contains. Only the 'name' column is parsed, since the row count
# is all that is needed and the embedding column is large.
if os.path.exists(output_file_path) and os.path.getsize(output_file_path) > 0:
    df_existing = pd.read_csv(output_file_path, sep='\t', usecols=['name'])
    processed_count = df_existing.shape[0]
    logger.info(f"Ripresa dell'elaborazione dal punto {processed_count}.")
else:
    with open(output_file_path, 'w') as f:
        f.write(OUTPUT_HEADER)
    processed_count = 0


batch_size = 5  # Small batches keep few embeddings in memory between writes
embeddings_batch = []
total_rows = df_dna.shape[0]

# Iterate over the remaining sequences and compute their embeddings.
# The try/finally guarantees that records already computed are written even
# if the loop is interrupted (error or manual stop), so a resumed run does not
# have to recompute them.
try:
    for index, row in df_dna.iloc[processed_count:].iterrows():
        sequence_name = row['name']
        sequence_type = row['type']
        sequence_list = row['seq_list']
        sequence_len = row['len_seq']

        embedding = calculate_final_embedding(sequence_list)

        if embedding is None:
            logger.warning(f"Embedding non calcolato per la sequenza {sequence_name} di lunghezza {sequence_len}.")

        embeddings_batch.append((sequence_name, sequence_type, sequence_len, embedding))
        processed_count += 1

        # f-string formatting also works when the name is not a string (e.g. NaN)
        progress = f"{processed_count}/{total_rows} name={sequence_name}"
        print(progress)
        logger.info(progress)

        # Write to disk every batch_size records
        if len(embeddings_batch) >= batch_size:
            written = len(embeddings_batch)
            _flush_embeddings_batch(embeddings_batch)
            embeddings_batch = []
            # torch.cuda.empty_cache()  # Free GPU memory
            logger.info(f"Scritti {written} embedding nel file. Totale elaborati: {processed_count}")
finally:
    # Write whatever is still pending (last partial batch, or an interrupted run)
    if embeddings_batch:
        _flush_embeddings_batch(embeddings_batch)
        logger.info(f"Scritti {len(embeddings_batch)} embedding rimanenti nel file. Totale elaborati: {processed_count}")

logger.info("Elaborazione completata.")

## Results

In [None]:
import os
import pandas as pd

# Reload the generated embeddings file to inspect the results
results_path = os.path.join(output_dir, "dna_embeddings.tsv")
df_output = pd.read_csv(results_path, sep='\t')
df_output.shape

In [None]:
# Distinct names, per type, whose embedding column was read back as missing.
# NOTE(review): missing embeddings are written as the text "None"; this count
# presumably relies on read_csv treating that text as NA - confirm.
missing_embedding_mask = df_output['embedding'].isnull()
no_emb = df_output.loc[missing_embedding_mask].groupby('type')['name'].nunique()
print("\nCount of names with no embedding by type (considera solo Gene e miRna):", no_emb)

# Distinct names, per type, whose recorded sequence length is 0
zero_length_mask = df_output['len_seq'].eq(0)
len0 = df_output.loc[zero_length_mask].groupby('type')['name'].nunique()
print("\nCount of names with seq_len=0 by type (considera solo Gene e miRna):", len0)

In [None]:
# df_disc = pd.read_csv(os.path.join(output_dir, "dna_embeddings_discarded.tsv"), sep='\t')
# df_disc[df_disc['len_seq']!=0].groupby('type')['name'].nunique()
# df_disc.shape
# # Identify the duplicated rows
# duplicated_values = df_disc[df_disc.duplicated(subset=['name', 'type', 'len_seq'], keep=False)]
# duplicated_values