In [1]:
import pandas as pd
from Bio import SeqIO
import duckdb

def calculate_gc_content(sequence):
    """Calcula el contenido GC de una secuencia"""
    if len(sequence) == 0:
        return 0.0
    gc_count = sequence.count('G') + sequence.count('C')
    return round((gc_count / len(sequence)) * 100, 2)

def export_to_fasta(df, output_file):
    """Exporta DataFrame a archivo FASTA con información seleccionada"""
    with open(output_file, 'w') as f:
        for _, row in df.iterrows():
            # Calcular GC content sobre la marcha
            gc_content = calculate_gc_content(row['sequence'])
            
            # Crear header con toda la información disponible
            header = f">{row['sequence_id']} length={row['sequence_length']} GC={gc_content} genus={row['genus']} origen={row['origen']} purpose={row['purpose']}"
            f.write(f"{header}\n{row['sequence']}\n")

def read_exported_fasta(fasta_file):
    """Lee el archivo FASTA exportado"""
    records = []
    
    for record in SeqIO.parse(fasta_file, "fasta"):
        header_info = record.description.split()
        
        # Extraer información del header
        sequence_id = header_info[0]
        length = int(header_info[1].split('=')[1])
        gc = float(header_info[2].split('=')[1])
        genus = header_info[3].split('=')[1]
        origen = header_info[4].split('=')[1]
        purpose = int(header_info[5].split('=')[1])
        
        records.append({
            'sequence_id': sequence_id,
            'sequence': str(record.seq),
            'sequence_length': length,
            'gc_content': gc,
            'genus': genus,
            'origen': origen,
            'purpose': purpose
        })
    
    return pd.DataFrame(records)

In [2]:
df_gtdb=pd.read_csv("datos/clean_gtdb.csv")
df_entrez=pd.read_csv("datos/clean_entrez.csv")

In [3]:
len(df_gtdb), len(df_entrez)

(38628, 23522)

In [4]:
df_gtdb['origen']=['gtdb']*len(df_gtdb)
df_entrez['origen']=['entrez']*len(df_entrez)

In [5]:
df=pd.concat([df_gtdb, df_entrez], ignore_index=True)

In [9]:
df

Unnamed: 0,index,sequence_id,sequence,sequence_length,gc_content,domain,phylum,class,order,family,genus,species,origen
0,0,RS_GCF_000657795.2,CTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGGATGCTT...,1528,55.562827,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Bordetella,Bordetella,gtdb
1,1,RS_GCF_000019185.1,ATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCC...,1542,54.409857,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Shewanellaceae,Shewanella,Shewanella,gtdb
2,2,RS_GCF_004570605.1,CATGAGAGTTTGATCCTGGCTCAGGACAAACGCTGGCGGCGTGCCT...,1547,52.230123,Bacteria,Bacillota,Bacilli,Lactobacillales,Aerococcaceae,WM01,WM01,gtdb
3,3,RS_GCF_002245655.1,CTTAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCTT...,1536,56.119792,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Rhodocyclaceae,Thauera,Thauera,gtdb
4,4,RS_GCF_016863255.1,GTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCT...,1518,60.276680,Bacteria,Actinomycetota,Actinomycetes,Mycobacteriales,Micromonosporaceae,Planosporangium,Planosporangium,gtdb
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62145,24633,OP853082.1,TGCAAGTCGAGCGGAAACGAGTTATCTGAACCTTCGGGGAACGATA...,1399,53.967119,Bacteria,Unknown,Unknown,Unknown,Unknown,Vibrio,sp.,entrez
62146,24634,OP853081.1,CGAGTTAACTGAACCTTCGGGGAACGTTAACGGCGTCGAGCGGCGG...,1380,54.347826,Bacteria,Unknown,Unknown,Unknown,Unknown,Vibrio,sp.,entrez
62147,24635,OP853064.1,TGCAAGTCGAGCGGAAACGAGTTATCTGAACCTTCGGGGAACGATA...,1376,53.415698,Bacteria,Unknown,Unknown,Unknown,Unknown,Vibrio,sp.,entrez
62148,24636,OP853063.1,TGCAAGTCGAGCGGAACGAGTTATCTGAACCTTCGGGGAACGATAA...,1406,53.982930,Bacteria,Unknown,Unknown,Unknown,Unknown,Vibrio,sp.,entrez


In [6]:
df['genus']=df['genus'].str.capitalize()

In [7]:
mapeo={'Acinetobacter': 'Acinetobacter',
 'Arthrobacter': 'Arthrobacter',
 'Bifidobacterium': 'Bifidobacterium',
 'Bradyrhizobium': 'Bradyrhizobium',
 'Chryseobacterium': 'Chryseobacterium',
 'Collinsella': 'Collinsella',
 'Corynebacterium': 'Corynebacterium',
 'Flavobacterium': 'Flavobacterium',
 'Mesorhizobium': 'Mesorhizobium',
 'Methylobacterium': 'Methylobacterium',
 'Microbacterium': 'Microbacterium',
 'Micromonospora': 'Micromonospora',
 'Mycobacterium': 'Mycobacterium',
 'Nocardia': 'Nocardia',
 'Nocardioides': 'Nocardioides',
 'Novosphingobium': 'Novosphingobium',
 'Paraburkholderia': 'Paraburkholderia',
 'Paracoccus': 'Paracoccus',
 'Pedobacter': 'Pedobacter',
 'Pelagibacter': 'Pelagibacter',
 'Polynucleobacter': 'Polynucleobacter',
 'Prevotella': 'Prevotella',
 'Prochlorococcus': 'Prochlorococcus',
 'Pseudomonas': 'Pseudomonas',
 'Rhizobium': 'Rhizobium',
 'Sphingomonas': 'Sphingomonas',
 'Streptococcus': 'Streptococcus',
 'Streptomyces': 'Streptomyces',
 'Vibrio': 'Vibrio',
 'Pseudomonas_e': 'Pseudomonas',
 'Prochlorococcus_a': 'Prochlorococcus',
 'Pelagibacter_a': 'Pelagibacter'}

In [8]:
# Solo mapear los casos que necesitan cambio
df['genus'] = df['genus'].replace(mapeo)

In [9]:
# Esto se hizo para explorar los datos y conteo de datos por género
conteo={'genus':[],'conteo':[]}
for i in df['genus'].unique():
    conteo['genus'].append(i)
    conteo['conteo'].append(len(df[df['genus']==i]))

df_conteo=pd.DataFrame(conteo)

temp=df_conteo['conteo']>=400
df_conteo=df_conteo.loc[temp]
df_conteo=df_conteo.reset_index(drop=True)

genes=df_conteo['genus'].unique()

In [11]:
df_conteo['conteo'].sum()

30910

In [None]:

# Configuración de datos por clase
MIN_DATA_PER_CLASS = 200         # Mínimo de secuencias para entrenamiento
MAX_DATA_PER_CLASS = 200         # Máximo de secuencias para entrenamiento  
MAX_TOTAL_PER_CLASS = 500        # Máximo total de secuencias (training + test)

# Filtro para los géneros que tienen más de 100 representantes
# Separa datos entre entrenamiento y testeo basado en MIN/MAX configurado
templist = []

for i in genes:
    temp = df[df['genus']==i]
    temp = temp.sort_values('sequence_length', ascending=False)
    
    total_sequences = len(temp)
    
    # Determinar cuántas secuencias usar para entrenamiento
    if MIN_DATA_PER_CLASS == MAX_DATA_PER_CLASS:
        # Si min == max, usar exactamente ese número para entrenamiento
        train_size = min(MIN_DATA_PER_CLASS, total_sequences)
    else:
        # Si min != max, aplicar lógica de rango
        if total_sequences >= MAX_DATA_PER_CLASS:
            train_size = MAX_DATA_PER_CLASS
        elif total_sequences >= MIN_DATA_PER_CLASS:
            train_size = total_sequences
        else:
            train_size = total_sequences
    
    # Calcular tamaño de test (hasta completar MAX_TOTAL_PER_CLASS)
    available_for_test = min(total_sequences - train_size, MAX_TOTAL_PER_CLASS - train_size)
    test_size = max(0, available_for_test)
    
    # Marcar propósito: 0=entrenamiento, 1=test, 2=extra
    temp['purpose'] = 2  # Por defecto todos son extra
    
    # Asignar entrenamiento (primeros train_size)
    temp.iloc[:train_size, temp.columns.get_loc('purpose')] = 0
    
    # Asignar test (siguientes test_size)
    if test_size > 0:
        temp.iloc[train_size:train_size+test_size, temp.columns.get_loc('purpose')] = 1
    
    # Los que quedan (train_size+test_size en adelante) se mantienen como 2 (extra)
    
    used_total = train_size + test_size
    extra_size = total_sequences - used_total
    
    templist.append(temp)

# Concatenar todos los datos
df_final = pd.concat(templist)
df_final = df_final.reset_index(drop=True)

# Estadísticas finales
purpose_counts = df_final['purpose'].value_counts().sort_index()
print(f"\nEstadísticas finales:")
print(f"Total secuencias: {len(df_final)}")
print(f"Entrenamiento (0): {purpose_counts.get(0, 0)}")
print(f"Test (1): {purpose_counts.get(1, 0)}")
print(f"Extra (2): {purpose_counts.get(2, 0)}")
print(f"Géneros procesados: {df_final['genus'].nunique()}")

# Guardar resultado
df_final.to_csv('datos/datos_prealineamiento.csv', index=False)


Estadísticas finales:
Total secuencias: 30910
Entrenamiento (0): 5800
Test (1): 8555
Extra (2): 16555
Géneros procesados: 29


In [13]:
len(df_final), df_final['sequence_id'].nunique

(30910,
 <bound method IndexOpsMixin.nunique of 0        GB_GCA_004296145.1
 1                OR398493.1
 2        RS_GCF_027111315.2
 3        RS_GCF_003097655.1
 4        GB_GCA_016721735.1
                 ...        
 30905            PV602707.1
 30906            MW477000.1
 30907            PP472633.1
 30908            ON955507.1
 30909            ON973151.1
 Name: sequence_id, Length: 30910, dtype: object>)

In [14]:
df_final=df_final[[
        # 'index', 
        'sequence_id', 
        'sequence', 
        'sequence_length',
        # 'gc_content',
        # 'domain', 
        # 'phylum', 
        # 'class', 
        # 'order', 
        # 'family', 
        'genus', 
        # 'species',
        'origen', 
        'purpose'
       ]]

In [15]:
# # mafft --auto --thread 12 --maxiterate 1000 --localpair pre_alineado.fasta > post_alineado.fasta
export_to_fasta(df_final, 'datos/pre_alineado.fasta')

In [12]:
dfa=read_exported_fasta("datos/post_alineado.fasta")
dfa=dfa.rename(columns={'sequence':'aligned_sequence'})

In [13]:
dfa['aligned_sequence']=dfa['aligned_sequence'].str.replace('-', 'N')
dfa['aligned_sequence']=dfa['aligned_sequence'].str.upper()

In [16]:
df=duckdb.sql("""
select 
        a.sequence_id,
        a.aligned_sequence,
        d.sequence as original_sequence,
        a.sequence_length,
        a.gc_content,
        a.genus,
        a.origen,
        a.purpose
        from dfa a
join df_final d on d.sequence_id=a.sequence_id
""").to_df()

In [19]:
len(df['aligned_sequence'][0])

7836

In [19]:
df.to_csv('datos/datos_filtrados_sin_encoding.csv', index=False)

In [20]:
mapeo_clases={
    'genus':[],
    'model_class':[]
}
for i,g in enumerate(df['genus'].unique()):
    mapeo_clases['genus'].append(g)
    mapeo_clases['model_class'].append(i)

In [21]:
pd.DataFrame(mapeo_clases).to_csv("datos/mapeo_clases.csv", index=False)