In [22]:
import numpy as np
import pandas as pd
import time
import sys
from itertools import product
# Librerías para encoding
import pywt
from scipy.fft import fft

In [None]:

def memory_usage(data):
    """Calcula el uso de memoria en MB de una estructura de datos"""
    if isinstance(data, list):
        total_size = sys.getsizeof(data)
        for item in data:
            if isinstance(item, np.ndarray):
                total_size += item.nbytes
            else:
                total_size += sys.getsizeof(item)
    elif isinstance(data, np.ndarray):
        total_size = data.nbytes
    else:
        total_size = sys.getsizeof(data)
    
    return total_size / (1024 * 1024)  # Convertir a MB

def fourier(sequences, is_str=True):
    if is_str:
        templist=[]
        for seq in sequences:
            num_seq=[ord(char) for char in seq]
            fft_seq=fft(num_seq)
            fft_seq=np.abs(fft_seq)
            templist.append(fft_seq[1:len(fft_seq)//2])
        return templist
    else:
        templist=[]
        for seq in sequences:
            fft_seq=fft(seq)
            fft_seq=np.abs(fft_seq)
            templist.append(fft_seq[1:len(fft_seq)//2])
        return templist
    
def generate_kmers_dict(k, unique_chars=set('ACGNT')):
    kmers = product(unique_chars, repeat=k)
    kmer_dict = {''.join(kmer): i for i,kmer in enumerate(kmers)}
    return kmer_dict

def k_mers(sequencias, k=3, unique_chars=set('ACGNT')):
    kmers_map=generate_kmers_dict(k, unique_chars)
    templist=[]
    for seq in sequencias:
        temp=[seq[i:i+k] for i in range(len(seq) - k + 1)]
        templist.append([kmers_map[i] for i in temp])
    return templist

def one_hot(sequences, max_len, unique_chars=set('ACGNT'), reshape=True):
    mapping={j:i for i,j in enumerate(unique_chars)}
    sequencias_procesadas=[]
    if reshape==True:
        for s in sequences:
            temp=np.zeros((max_len,len(unique_chars)))
            for c in zip(s,temp):
                    c[1][mapping[c[0]]]=1
            sequencias_procesadas.append(temp.reshape(-1))
        return sequencias_procesadas
    elif reshape==False:
        for s in sequences:
            temp=np.zeros((max_len,len(unique_chars)))
            for c in zip(s,temp):
                    c[1][mapping[c[0]]]=1
            sequencias_procesadas.append(temp)
        return sequencias_procesadas

def wavelet(sequences, numeric=False, wavelet='db1', level=5):
    templist=[]
    if numeric==False:
        for seq in sequences:
            num_seq=[ord(char) for char in seq]
            coeffs=pywt.wavedec(num_seq, wavelet, level)
            templist.append(np.concatenate(coeffs))
        return templist
    elif numeric==True:
        for seq in sequences:
            coeffs=pywt.wavedec(seq, wavelet, level)
            templist.append(np.concatenate(coeffs))
        return templist

def pad_sequences(sequences, maxlen):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < maxlen:
            seq += 'N' * (maxlen - len(seq))  
        else:
            seq = seq[:maxlen]
        padded_sequences.append(seq)
    return padded_sequences

# Cargar datos
df = pd.read_csv('datos/datos_filtrados_sin_encoding.csv')

aligned_sequence = df['aligned_sequence'].values
padded_sequences = pad_sequences(df['original_sequence'], max([len(i) for i in df['original_sequence']]))
del df

# Resultados
results = {
    'encoding': [],
    'time': [],
    'memory_used': [],
    'iter': []
}

# Lista de todas las codificaciones en orden
encodings_list = [
    'AS_One Hot',
    'AS_K-mers', 
    'AS_FFT',
    'AS_Wavelet',
    'PS_One Hot',
    'PS_K-mers',
    'PS_FFT', 
    'PS_Wavelet',
    'AS_K-mers + FFT',
    'AS_One Hot + FFT',
    'AS_K-mers + Wavelet',
    'AS_One Hot + Wavelet',
    'PS_K-mers + FFT',
    'PS_One Hot + FFT',
    'PS_K-mers + Wavelet',
    'PS_One Hot + Wavelet'
]

n_runs = 1

print("Iniciando medición de codificaciones...")

# UN SOLO CICLO FOR PARA TODAS LAS CODIFICACIONES
for encoding_name in encodings_list:
    print(f'{encoding_name}...')
    
    for i in range(n_runs):
        start = time.time()
        
        # Determinar qué codificación ejecutar
        if encoding_name == 'AS_One Hot':
            temp = one_hot(aligned_sequence, len(aligned_sequence[0]))
        elif encoding_name == 'AS_K-mers':
            temp = k_mers(aligned_sequence)
        elif encoding_name == 'AS_FFT':
            temp = fourier(aligned_sequence)
        elif encoding_name == 'AS_Wavelet':
            temp = wavelet(aligned_sequence)
        elif encoding_name == 'PS_One Hot':
            temp = one_hot(padded_sequences, len(padded_sequences[0]))
        elif encoding_name == 'PS_K-mers':
            temp = k_mers(padded_sequences)
        elif encoding_name == 'PS_FFT':
            temp = fourier(padded_sequences)
        elif encoding_name == 'PS_Wavelet':
            temp = wavelet(padded_sequences)
        elif encoding_name == 'AS_K-mers + FFT':
            # Ejecutar ambas operaciones consecutivamente
            temp = k_mers(aligned_sequence)
            temp = fourier(temp, False)
        elif encoding_name == 'AS_One Hot + FFT':
            # Ejecutar ambas operaciones consecutivamente
            temp = one_hot(aligned_sequence, len(aligned_sequence[0]))
            temp = fourier(temp, False)
        elif encoding_name == 'AS_K-mers + Wavelet':
            # Ejecutar ambas operaciones consecutivamente
            temp = k_mers(aligned_sequence)
            temp = wavelet(temp, True)
        elif encoding_name == 'AS_One Hot + Wavelet':
            # Ejecutar ambas operaciones consecutivamente
            temp = one_hot(aligned_sequence, len(aligned_sequence[0]))
            temp = wavelet(temp, True)
        elif encoding_name == 'PS_K-mers + FFT':
            # Ejecutar ambas operaciones consecutivamente
            temp = k_mers(padded_sequences)
            temp = fourier(temp, False)
        elif encoding_name == 'PS_One Hot + FFT':
            # Ejecutar ambas operaciones consecutivamente
            temp = one_hot(padded_sequences, len(padded_sequences[0]))
            temp = fourier(temp, False)
        elif encoding_name == 'PS_K-mers + Wavelet':
            # Ejecutar ambas operaciones consecutivamente
            temp = k_mers(padded_sequences)
            temp = wavelet(temp, True)
        elif encoding_name == 'PS_One Hot + Wavelet':
            # Ejecutar ambas operaciones consecutivamente
            temp = one_hot(padded_sequences, len(padded_sequences[0]))
            temp = wavelet(temp, True)
        
        end = time.time() - start
        temp_mem = memory_usage(temp)
        
        # Guardar resultados
        results['encoding'].append(encoding_name)
        results['time'].append(end)
        results['memory_used'].append(temp_mem)
        results['iter'].append(i)

# Crear DataFrame con resultados
results_df = pd.DataFrame(results)

# Mostrar resumen
print("\n=== RESUMEN DE RESULTADOS ===")
summary = results_df.groupby('encoding').agg({
    'time': ['mean', 'std'],
    'memory_used': ['mean', 'std']
}).round(4)

print(summary)

# Guardar resultados
results_df.to_csv('resultados_codificacion_tiempo_memoria.csv', index=False)
print(f"\nResultados guardados en 'resultados_codificacion_tiempo_memoria.csv'")

In [34]:
import duckdb
resultados=pd.read_csv("datos/resultados_codificacion_tiempo_memoria.csv")

In [None]:
import duckdb
import pandas as pd
resultados=pd.read_csv("datos/resultados_codificacion_tiempo_memoria.csv")

duckdb.sql("""
SELECT 
    encoding,
    COUNT(*) as num_ejecuciones,
    
    -- Estadísticas de tiempo
    ROUND(AVG(time), 6) as tiempo_promedio_seg,
    ROUND(STDDEV(time), 6) as tiempo_desv_std,
    ROUND(MIN(time), 6) as tiempo_min,
    ROUND(MAX(time), 6) as tiempo_max,
    
    -- Estadísticas de memoria
    ROUND(AVG(memory_used), 3) as memoria_promedio_mb,
    ROUND(STDDEV(memory_used), 3) as memoria_desv_std,
    ROUND(MIN(memory_used), 3) as memoria_min_mb,
    ROUND(MAX(memory_used), 3) as memoria_max_mb,
    
    -- Eficiencia (memoria/tiempo)
    ROUND(AVG(memory_used/time), 3) as eficiencia_mb_por_seg
    
FROM resultados
GROUP BY encoding
ORDER BY tiempo_promedio_seg ASC;
""").to_df()

Unnamed: 0,encoding,num_ejecuciones,tiempo_promedio_seg,tiempo_desv_std,tiempo_min,tiempo_max,memoria_promedio_mb,memoria_desv_std,memoria_min_mb,memoria_max_mb,eficiencia_mb_por_seg
0,PS_FFT,10,2.008423,0.018541,1.989849,2.05509,188.688,0.0,188.688,188.688,93.956
1,PS_Wavelet,10,2.460565,0.009095,2.443281,2.467849,378.291,0.0,378.291,378.291,153.744
2,PS_K-mers,10,3.017371,0.084183,2.939529,3.175762,423.57,0.0,423.57,423.57,140.472
3,PS_One Hot,10,4.004909,0.006416,3.998975,4.019343,1886.861,0.0,1886.861,1886.861,471.138
4,PS_K-mers + FFT,10,4.426602,0.028859,4.393865,4.478384,188.453,0.0,188.453,188.453,42.574
5,PS_K-mers + Wavelet,10,4.763309,0.019302,4.732731,4.796647,378.055,0.0,378.055,378.055,79.369
6,PS_One Hot + Wavelet,10,5.350705,0.010647,5.338808,5.377759,1887.333,0.0,1887.333,1887.333,352.727
7,PS_One Hot + FFT,10,5.573668,0.005679,5.566885,5.582353,943.327,0.0,943.327,943.327,169.247
8,AS_Wavelet,10,9.676012,0.029264,9.640435,9.7284,1849.129,0.0,1849.129,1849.129,191.106
9,AS_FFT,10,14.01527,0.080007,13.867285,14.14413,923.989,0.0,923.989,923.989,65.929


In [32]:
resultados['time'].sum()/60

35.91529390017192