In [1]:
from Bio import SeqIO
import pandas as pd
import duckdb
import re

def parse_taxonomy(description):
    """Extrae información taxonómica de la descripción"""
    tax_match = re.search(r'd([^;]+);p([^;]+);c([^;]+);o([^;]+);f([^;]+);g([^;]+);s__([^\s]+)', description)
    if tax_match:
        return {
            'domain': tax_match.group(1),
            'phylum': tax_match.group(2),
            'class': tax_match.group(3),
            'order': tax_match.group(4),
            'family': tax_match.group(5),
            'genus': tax_match.group(6),
            'species': tax_match.group(7)
        }
    return dict.fromkeys(['domain', 'phylum', 'class', 'order', 'family', 'genus', 'species'])

def fna_to_dataframe(fna_file):
    """Convierte archivo FNA a DataFrame"""
    records = []
    
    for record in SeqIO.parse(fna_file, "fasta"):
        sequence = str(record.seq)
        tax_info = parse_taxonomy(record.description)
        
        records.append({
            'sequence_id': record.id,
            'sequence': sequence,
            'sequence_length': len(sequence),
            'gc_content': (sequence.count("G") + sequence.count("C")) / len(sequence) * 100,
            'domain': tax_info['domain'],
            'phylum': tax_info['phylum'],
            'class': tax_info['class'],
            'order': tax_info['order'],
            'family': tax_info['family'],
            'genus': tax_info['genus'],
            'species': tax_info['species']
        })
    
    return pd.DataFrame(records)

In [2]:
df_gtdb = fna_to_dataframe('datos/datos_originales_ncbi.fna')
print(len(df_gtdb))

58102


In [3]:
columnas=["domain",'phylum','class','order','family','genus','species']

for c in columnas:
    print(f"{c}: {df_gtdb[c].nunique()}")

domain: 1
phylum: 171
class: 458
order: 1553
family: 3884
genus: 16117
species: 16117


In [4]:
limpiar_col = ['domain', 'phylum', 'class', 'order', 'family', 'genus']
for col in limpiar_col:
    df_gtdb[col] = df_gtdb[col].str.replace('__', '', regex=False)

In [5]:
df_gtdb=duckdb.sql("""
select * from df_gtdb
           where sequence_length<1600 and sequence_length>1300
""").to_df()

In [6]:
df_gtdb

Unnamed: 0,sequence_id,sequence,sequence_length,gc_content,domain,phylum,class,order,family,genus,species
0,RS_GCF_000657795.2,CTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGGATGCTT...,1528,55.562827,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Bordetella,Bordetella
1,RS_GCF_000019185.1,ATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCC...,1542,54.409857,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Shewanellaceae,Shewanella,Shewanella
2,RS_GCF_004570605.1,CATGAGAGTTTGATCCTGGCTCAGGACAAACGCTGGCGGCGTGCCT...,1547,52.230123,Bacteria,Bacillota,Bacilli,Lactobacillales,Aerococcaceae,WM01,WM01
3,RS_GCF_002245655.1,CTTAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCTT...,1536,56.119792,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Rhodocyclaceae,Thauera,Thauera
4,RS_GCF_016863255.1,GTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCT...,1518,60.276680,Bacteria,Actinomycetota,Actinomycetes,Mycobacteriales,Micromonosporaceae,Planosporangium,Planosporangium
...,...,...,...,...,...,...,...,...,...,...,...
39170,RS_GCF_004348725.1,ACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTT...,1517,58.404746,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Kribbellaceae,Kribbella,Kribbella
39171,RS_GCF_003097655.1,CTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCT...,1533,53.620352,Bacteria,Bacteroidota,Bacteroidia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium
39172,GB_GCA_016714985.1,ACGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGGAGGCCT...,1525,56.327869,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Saprospiraceae,UBA6168,UBA6168
39173,GB_GCA_030603665.1,ATTAAAGCTCCGGCGCTTCGGGATGGGCCTGCGGCCGATTAGCTAG...,1305,56.551724,Bacteria,Chloroflexota,Anaerolineae,UBA7937,B3-Chlor,JAUVXJ01,JAUVXJ01


In [6]:
columnas=["domain",'phylum','class','order','family','genus','species']

for c in columnas:
    print(f">>>{c}: {df_gtdb[c].nunique()}")

>>>domain: 1
>>>phylum: 155
>>>class: 385
>>>order: 1227
>>>family: 2858
>>>genus: 10576
>>>species: 10576


In [8]:

#Limpieza de las secuencias que no tengan los nucleótidos ACGT
base=set('ACGT')
drop_idx=[]
for i,j in enumerate(df_gtdb['sequence']):
    if set(j)!=base:
        drop_idx.append(i)
print(len(drop_idx))
df_gtdb=df_gtdb.drop(drop_idx)
df_gtdb=df_gtdb.reset_index()

547


In [9]:
# Esto se hizo para explorar los datos y conteo de datos por género

conteo={'genus':[],'conteo':[]}
for i in df_gtdb['genus'].unique():
    conteo['genus'].append(i)
    conteo['conteo'].append(len(df_gtdb[df_gtdb['genus']==i]))

df_conteo=pd.DataFrame(conteo)

temp=df_conteo['conteo']>=100
df_conteo=df_conteo.loc[temp]
df_conteo=df_conteo.reset_index(drop=True)

genes=df_conteo['genus'].tolist()

In [21]:
genes

['Flavobacterium',
 'Vibrio',
 'Corynebacterium',
 'Pseudomonas_E',
 'Pelagibacter',
 'Bradyrhizobium',
 'Mycobacterium',
 'Nocardioides',
 'Streptomyces',
 'Prevotella',
 'Prochlorococcus_A',
 'Streptococcus',
 'Bifidobacterium',
 'Novosphingobium',
 'Pedobacter',
 'Chryseobacterium',
 'Micromonospora',
 'Nocardia',
 'Arthrobacter',
 'Polynucleobacter',
 'Pelagibacter_A',
 'Collinsella',
 'Acinetobacter',
 'Mesorhizobium',
 'Microbacterium',
 'Methylobacterium',
 'Rhizobium',
 'Paracoccus',
 'Paraburkholderia',
 'Sphingomonas']

In [20]:
df_conteo.sort_values(by='conteo', ascending=False)

Unnamed: 0,genus,conteo
8,Streptomyces,922
4,Pelagibacter,777
3,Pseudomonas_E,680
11,Streptococcus,463
6,Mycobacterium,377
0,Flavobacterium,365
24,Microbacterium,282
10,Prochlorococcus_A,233
5,Bradyrhizobium,229
29,Sphingomonas,224


In [14]:
df_conteo['conteo'].sum()

7352

In [None]:
len(df_gtdb)

38628

In [12]:
df_gtdb.to_csv('datos/clean_gtdb.csv', index=False)

In [13]:
df_gtdb

Unnamed: 0,index,sequence_id,sequence,sequence_length,gc_content,domain,phylum,class,order,family,genus,species
0,0,RS_GCF_000657795.2,CTGAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGGATGCTT...,1528,55.562827,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Burkholderiaceae,Bordetella,Bordetella
1,1,RS_GCF_000019185.1,ATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCC...,1542,54.409857,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Shewanellaceae,Shewanella,Shewanella
2,2,RS_GCF_004570605.1,CATGAGAGTTTGATCCTGGCTCAGGACAAACGCTGGCGGCGTGCCT...,1547,52.230123,Bacteria,Bacillota,Bacilli,Lactobacillales,Aerococcaceae,WM01,WM01
3,3,RS_GCF_002245655.1,CTTAAGAGTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCTT...,1536,56.119792,Bacteria,Pseudomonadota,Gammaproteobacteria,Burkholderiales,Rhodocyclaceae,Thauera,Thauera
4,4,RS_GCF_016863255.1,GTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCT...,1518,60.276680,Bacteria,Actinomycetota,Actinomycetes,Mycobacteriales,Micromonosporaceae,Planosporangium,Planosporangium
...,...,...,...,...,...,...,...,...,...,...,...,...
38623,39170,RS_GCF_004348725.1,ACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTT...,1517,58.404746,Bacteria,Actinomycetota,Actinomycetes,Propionibacteriales,Kribbellaceae,Kribbella,Kribbella
38624,39171,RS_GCF_003097655.1,CTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCCT...,1533,53.620352,Bacteria,Bacteroidota,Bacteroidia,Flavobacteriales,Flavobacteriaceae,Flavobacterium,Flavobacterium
38625,39172,GB_GCA_016714985.1,ACGGAGAGTTTGATCCTGGCTCAGGATGAACGCTAGCGGGAGGCCT...,1525,56.327869,Bacteria,Bacteroidota,Bacteroidia,Chitinophagales,Saprospiraceae,UBA6168,UBA6168
38626,39173,GB_GCA_030603665.1,ATTAAAGCTCCGGCGCTTCGGGATGGGCCTGCGGCCGATTAGCTAG...,1305,56.551724,Bacteria,Chloroflexota,Anaerolineae,UBA7937,B3-Chlor,JAUVXJ01,JAUVXJ01
