In [None]:
import pandas as pd

# Characters that LaTeX treats specially in text mode, mapped to their escaped
# form. Applied in one pass so an inserted backslash is never escaped again.
_LATEX_ESCAPES = str.maketrans({
    '_': '\\_',
    '&': '\\&',
    '%': '\\%',
    '#': '\\#',
    '$': '\\$',
    '{': '\\{',
    '}': '\\}',
})


def escape_latex(text: str) -> str:
    """Return *text* with common LaTeX special characters backslash-escaped.

    Handles ``_ & % # $ { }``. Backslash, ``~`` and ``^`` are left untouched.
    Non-string input is converted with ``str()`` first.
    """
    return str(text).translate(_LATEX_ESCAPES)


def build_genus_table(
    train_counts: pd.Series,
    test_counts: pd.Series,
    total_counts: pd.Series,
) -> pd.DataFrame:
    """Combine per-genus counts into one table sorted by total, descending.

    Args:
        train_counts: Sequence count per genus in the training split.
        test_counts: Sequence count per genus in the test split.
        total_counts: Sequence count per genus over all rows; its index
            defines which genera appear in the result.

    Returns:
        DataFrame with columns ``genus``, ``train``, ``test`` and ``total``.
        A genus absent from a split gets a count of 0 for that split.
    """
    genera = total_counts.index
    table = pd.DataFrame({
        'genus': genera.tolist(),
        # reindex aligns each split to the full genus list; fill_value keeps
        # the integer dtype instead of introducing NaN for missing genera.
        'train': train_counts.reindex(genera, fill_value=0).to_numpy(),
        'test': test_counts.reindex(genera, fill_value=0).to_numpy(),
        'total': total_counts.to_numpy(),
    })
    return table.sort_values('total', ascending=False)


def render_latex_lines(
    table_df: pd.DataFrame,
    *,
    total_train: int,
    total_test: int,
    total_general: int,
    total_genera: int,
    min_sequences: int,
    max_sequences: int,
) -> list:
    """Render the genus distribution table as a list of LaTeX source lines.

    Args:
        table_df: Table with ``genus``, ``train``, ``test`` and ``total``
            columns; rows are emitted in the order given.
        total_train: Number shown in the training column of the totals row.
        total_test: Number shown in the test column of the totals row.
        total_general: Number shown in the total column of the totals row.
        total_genera: Number of genera reported in the footer.
        min_sequences: Lower bound of the per-genus range in the footer.
        max_sequences: Upper bound of the per-genus range in the footer.

    Returns:
        The lines of a complete ``table`` environment, without newlines.
    """
    lines = [
        "\\begin{table}[htbp]",
        "\\centering",
        "\\caption{Distribución de secuencias por género bacteriano en el conjunto de datos final}",
        "\\label{tab:genus_distribution}",
        "\\setlength{\\tabcolsep}{8pt}",
        "\\renewcommand{\\arraystretch}{1.2}",
        "\\begin{tabular}{lrrr}",
        "\\hline",
        "\\textbf{Género} & \\textbf{Entrenamiento} & \\textbf{Test} & \\textbf{Total} \\\\",
        "\\hline",
    ]

    # Data rows: a zero count is shown as a dash rather than "0".
    columns = table_df[['genus', 'train', 'test', 'total']]
    for genus, train, test, total in columns.itertuples(index=False, name=None):
        train_cell = train if train > 0 else '--'
        test_cell = test if test > 0 else '--'
        lines.append(
            f"{escape_latex(genus)} & {train_cell} & {test_cell} & {total} \\\\"
        )

    # Totals row and summary footer.
    lines.extend([
        "\\hline",
        f"\\textbf{{Total}} & \\textbf{{{total_train:,}}} & \\textbf{{{total_test:,}}} & \\textbf{{{total_general:,}}} \\\\",
        "\\hline",
        f"\\multicolumn{{4}}{{l}}{{\\textbf{{Distribución: {total_genera} géneros bacterianos}}}} \\\\",
        f"\\multicolumn{{4}}{{l}}{{\\textbf{{Rango: {min_sequences}-{max_sequences} secuencias por género}}}} \\\\",
        "\\hline",
        "\\end{tabular}",
        "\\end{table}",
    ])
    return lines


# In a notebook or script run __name__ is "__main__", so the names below are
# still defined at module level; the guard only keeps an import of this code
# from reading the CSV.
if __name__ == "__main__":
    # Load data
    df = pd.read_csv("datos/datos_filtrados_sin_encoding.csv")

    # Split once and reuse for both the per-genus counts and the totals
    train_rows = df[df['is_training'].eq(True)]
    test_rows = df[df['is_training'].eq(False)]

    # Per-genus counts (value_counts ignores rows whose genus is missing)
    train_counts = train_rows['genus'].value_counts()
    test_counts = test_rows['genus'].value_counts()
    total_counts = df['genus'].value_counts()

    # Overall totals; these count every row, including rows with no genus
    total_train = len(train_rows)
    total_test = len(test_rows)
    total_general = len(df)
    # nunique() skips missing values, so this matches the number of table rows
    total_genera = df['genus'].nunique()

    # Combined table, sorted by total (descending)
    table_df = build_genus_table(train_counts, test_counts, total_counts)

    # Range of sequences per genus
    min_sequences = table_df['total'].min()
    max_sequences = table_df['total'].max()

    # Generate the LaTeX code
    latex_code = render_latex_lines(
        table_df,
        total_train=total_train,
        total_test=total_test,
        total_general=total_general,
        total_genera=total_genera,
        min_sequences=min_sequences,
        max_sequences=max_sequences,
    )

    # Join all lines
    full_latex = '\n'.join(latex_code)

    print(full_latex)

\begin{table}[htbp]
\centering
\caption{Distribución de secuencias por género bacteriano en el conjunto de datos final}
\label{tab:genus_distribution}
\setlength{\tabcolsep}{8pt}
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{lrrr}
\hline
\textbf{Género} & \textbf{Entrenamiento} & \textbf{Test} & \textbf{Total} \\
\hline
Streptomyces & 120 & 802 & 922 \\
Pelagibacter & 120 & 657 & 777 \\
Pseudomonas\_E & 120 & 560 & 680 \\
Streptococcus & 120 & 343 & 463 \\
Mycobacterium & 120 & 257 & 377 \\
Flavobacterium & 120 & 245 & 365 \\
Microbacterium & 120 & 162 & 282 \\
Prochlorococcus\_A & 120 & 113 & 233 \\
Bradyrhizobium & 120 & 109 & 229 \\
Sphingomonas & 120 & 104 & 224 \\
Corynebacterium & 120 & 81 & 201 \\
Vibrio & 120 & 63 & 183 \\
Arthrobacter & 120 & 60 & 180 \\
Chryseobacterium & 120 & 52 & 172 \\
Acinetobacter & 120 & 50 & 170 \\
Nocardioides & 120 & 49 & 169 \\
Rhizobium & 120 & 29 & 149 \\
Collinsella & 120 & 28 & 148 \\
Micromonospora & 120 & 23 & 143 \\
Mesorhizobium & 120 &

: 