In [None]:
import time
import os
import numpy as np
import pandas as pd
import sys
import gc
from itertools import product
import pywt
from scipy.fft import fft
pd.options.mode.chained_assignment = None

def fourier(sequences, is_str=True):
    """Compute a truncated FFT magnitude spectrum for every sequence.

    Args:
        sequences: Iterable of sequences. With ``is_str`` truthy each item is
            a string whose characters are converted with ``ord``; otherwise
            each item is handed to ``fft`` unchanged.
        is_str: Whether the items are strings that need the ``ord`` conversion.

    Returns:
        A list with one array per input sequence holding ``abs(fft(...))``
        restricted to bins ``1 .. len // 2 - 1`` (bin 0 is dropped).
    """
    spectra = []
    for seq in sequences:
        # Only the string case needs a numeric conversion before the FFT.
        signal = [ord(symbol) for symbol in seq] if is_str else seq
        magnitudes = np.abs(fft(signal))
        upper = len(magnitudes) // 2
        spectra.append(magnitudes[1:upper])
    return spectra

def generate_kmers_dict(k, unique_chars=frozenset('ACGNT')):
    """Build a reproducible ``k-mer -> integer index`` lookup table.

    Args:
        k: Length of the k-mers to enumerate.
        unique_chars: Iterable with the alphabet symbols. Duplicates are
            ignored.

    Returns:
        A dict mapping every string of length ``k`` over the alphabet to a
        unique index in ``range(len(alphabet) ** k)``. Indices follow the
        lexicographic order of the k-mers.
    """
    # Iterating a set of strings yields an order that changes between
    # interpreter runs (string hashes are randomised), which made the codes
    # irreproducible. Sorting pins the alphabet order once and for all.
    alphabet = sorted(set(unique_chars))
    return {
        ''.join(kmer): index
        for index, kmer in enumerate(product(alphabet, repeat=k))
    }

def k_mers(sequencias, k=3, unique_chars=set('ACGNT')):
    """Encode each sequence as the list of indices of its overlapping k-mers.

    Args:
        sequencias: Iterable of sliceable sequences (strings).
        k: Window length.
        unique_chars: Alphabet forwarded to ``generate_kmers_dict``.

    Returns:
        A list with one list of integer codes per sequence; a sequence of
        length ``n`` produces ``n - k + 1`` codes (none when ``n < k``).

    Raises:
        KeyError: If a window is not a key of the generated k-mer table.
    """
    index_of = generate_kmers_dict(k, unique_chars)
    encoded = []
    for seq in sequencias:
        n_windows = len(seq) - k + 1
        # Slide a window of width k one position at a time.
        encoded.append(
            [index_of[seq[start:start + k]] for start in range(n_windows)]
        )
    return encoded

def one_hot(sequences, max_len, unique_chars=frozenset('ACGNT'), reshape=True):
    """One-hot encode sequences into fixed-size arrays.

    Args:
        sequences: Iterable of strings.
        max_len: Number of rows (positions) in every encoded matrix.
            Characters beyond ``max_len`` are ignored; positions past the end
            of a shorter sequence stay all-zero.
        unique_chars: Iterable with the alphabet symbols. Duplicates are
            ignored.
        reshape: When truthy each matrix is flattened to a 1-D vector of
            length ``max_len * len(alphabet)``; otherwise the 2-D
            ``(max_len, len(alphabet))`` matrix is returned.

    Returns:
        A list with one float array per input sequence.

    Raises:
        KeyError: If a sequence contains a character outside the alphabet.
    """
    # A set of strings iterates in an order that changes between interpreter
    # runs (string hashes are randomised), so the column assigned to each
    # symbol was not reproducible. Sorting fixes the column order.
    alphabet = sorted(set(unique_chars))
    column_of = {symbol: column for column, symbol in enumerate(alphabet)}
    width = len(alphabet)

    encoded = []
    for seq in sequences:
        matrix = np.zeros((max_len, width))
        # zip stops at the shorter of (rows, characters).
        for row, symbol in zip(matrix, seq):
            row[column_of[symbol]] = 1
        encoded.append(matrix.reshape(-1) if reshape else matrix)
    return encoded

def wavelet(sequences, numeric=False, wavelet='db1', level=5):
    """Apply a multilevel discrete wavelet decomposition to every sequence.

    Args:
        sequences: Iterable of sequences. With ``numeric`` falsy each item is
            a string converted with ``ord``; otherwise each item is passed to
            ``pywt.wavedec`` unchanged.
        numeric: Whether the items are already numeric.
        wavelet: Wavelet name forwarded to ``pywt.wavedec``. (Inside this
            function the name refers to this argument, not to the function.)
        level: Decomposition level forwarded to ``pywt.wavedec``.

    Returns:
        A list with one 1-D array per sequence: all coefficient arrays
        returned by ``pywt.wavedec`` concatenated in the order returned.
    """
    coefficients = []
    for seq in sequences:
        signal = seq if numeric else [ord(symbol) for symbol in seq]
        # `level` must be passed by keyword: the third positional parameter
        # of pywt.wavedec is `mode`, so the former positional call never set
        # the decomposition level at all.
        bands = pywt.wavedec(signal, wavelet, level=level)
        coefficients.append(np.concatenate(bands))
    return coefficients

def pad_sequences(sequences, maxlen):
    """Force every sequence to exactly ``maxlen`` characters.

    Args:
        sequences: Iterable of strings.
        maxlen: Target length.

    Returns:
        A new list where shorter sequences are right-padded with ``'N'`` and
        longer ones are cut to their first ``maxlen`` characters.
    """
    def fit(seq):
        shortfall = maxlen - len(seq)
        if shortfall > 0:
            return seq + 'N' * shortfall
        return seq[:maxlen]

    return [fit(seq) for seq in sequences]

def measure_time(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and time the call.

    Args:
        func: Callable to execute.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed_seconds)`` tuple.
    """
    # perf_counter is the monotonic, highest-resolution clock meant for
    # measuring short durations; time.time() can jump when the system clock
    # is adjusted and is coarse on some platforms.
    start = time.perf_counter()
    result = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    return result, elapsed

def measure_memory(result):
    """Estimate the size of an encoding result in megabytes (MiB).

    Accounting rules:
      * ``np.ndarray``: its ``nbytes``.
      * ``list``: ``sys.getsizeof`` of the list itself plus, per element,
        ``nbytes`` for arrays, ``getsizeof`` of the element and of each of
        its direct items for nested lists, and ``getsizeof`` for anything
        else.
      * any other object: ``sys.getsizeof``.

    Args:
        result: Object produced by an encoding function.

    Returns:
        The estimated size as a float number of megabytes.
    """
    bytes_per_mb = 1024 * 1024

    if isinstance(result, list):
        total = sys.getsizeof(result)
        for item in result:
            if isinstance(item, np.ndarray):
                # Arrays are counted by their data buffer only.
                total += item.nbytes
            elif isinstance(item, list):
                # One level of nesting: the inner list plus its items.
                total += sys.getsizeof(item) + sum(map(sys.getsizeof, item))
            else:
                total += sys.getsizeof(item)
        return total / bytes_per_mb  # MB

    if isinstance(result, np.ndarray):
        return result.nbytes / bytes_per_mb  # MB

    return sys.getsizeof(result) / bytes_per_mb  # MB

def benchmark_encoding(func, sequences, func_name, n_runs=5, **kwargs):
    """Time an encoding function repeatedly and summarise the measurements.

    Args:
        func: Encoding function, called as ``func(sequences, **kwargs)``.
        sequences: Input handed to ``func`` on every run.
        func_name: Label printed and stored under ``'encoding'``.
        n_runs: Number of repetitions.
        **kwargs: Extra keyword arguments forwarded to ``func``.

    Returns:
        A dict with the label, the mean and standard deviation of the run
        times and of the result sizes (as reported by ``measure_memory``),
        and the raw per-run lists.
    """
    print(f"Benchmarking {func_name}...")

    timings = []
    footprints = []

    for _ in range(n_runs):
        # Collect garbage before and after each run.
        gc.collect()
        encoded, elapsed = measure_time(func, sequences, **kwargs)
        timings.append(elapsed)
        footprints.append(measure_memory(encoded))
        gc.collect()

    summary = {
        'encoding': func_name,
        'avg_time': np.mean(timings),
        'std_time': np.std(timings),
        'avg_memory': np.mean(footprints),
        'std_memory': np.std(footprints),
        'all_times': timings,
        'all_memories': footprints,
    }
    return summary

def _benchmark_sequence_set(sequences, label, n_runs, multiplier):
    """Benchmark every encoding and encoding combination on one sequence set.

    Args:
        sequences: Array of strings to encode. The length of the first one
            is used as ``max_len`` for one-hot encoding.
        label: Prefix for the encoding names (e.g. ``'AS'``).
        n_runs: Repetitions per benchmark.
        multiplier: Dataset multiplier, stored verbatim in every row.

    Returns:
        A list of result dicts, in the order the benchmarks were run.
    """
    max_len = len(sequences[0])
    n_sequences = len(sequences)
    rows = []

    def record(func, data, name, **kwargs):
        # Run one benchmark and tag its row with the dataset metadata.
        row = benchmark_encoding(func, data, f'{label}-{name}', n_runs, **kwargs)
        row['multiplier'] = multiplier
        row['n_sequences'] = n_sequences
        rows.append(row)

    # Encodings applied directly to the raw strings.
    record(one_hot, sequences, 'One Hot', max_len=max_len)
    record(k_mers, sequences, 'K-mers')
    record(fourier, sequences, 'FFT')
    record(wavelet, sequences, 'Wavelet')

    # Numeric encodings computed once (untimed) and used as input for the
    # transform benchmarks below.
    kmers_encoded = k_mers(sequences)
    onehot_encoded = one_hot(sequences, max_len)

    record(fourier, kmers_encoded, 'K-mers + FFT', is_str=False)
    record(fourier, onehot_encoded, 'One Hot + FFT', is_str=False)
    record(wavelet, kmers_encoded, 'K-mers + Wavelet', numeric=True)
    record(wavelet, onehot_encoded, 'One Hot + Wavelet', numeric=True)

    return rows


def run_combined_benchmarks(df, n_runs=5, multiplier=1):
    """Run the full encoding benchmark on the ``'as'`` and ``'ps'`` columns.

    Args:
        df: DataFrame with string columns ``'as'`` and ``'ps'``.
        n_runs: Repetitions per benchmark.
        multiplier: When greater than 1 the DataFrame is concatenated with
            itself this many times before benchmarking.

    Returns:
        A DataFrame with one row per benchmark (AS rows first, then PS
        rows), containing the fields produced by ``benchmark_encoding`` plus
        ``'multiplier'`` and ``'n_sequences'``.
    """
    # Multiply sequences if needed
    if multiplier > 1:
        df_multiplied = pd.concat([df] * multiplier, ignore_index=True)
        print(f"Multiplied dataset by {multiplier}x: {len(df)} -> {len(df_multiplied)} sequences")
    else:
        df_multiplied = df

    results = []
    for label, column in (('AS', 'as'), ('PS', 'ps')):
        print(f"\n=== BENCHMARKING {label} SEQUENCES (N={len(df_multiplied)}) ===")
        sequences = df_multiplied[column].values
        results.extend(
            _benchmark_sequence_set(sequences, label, n_runs, multiplier)
        )

    return pd.DataFrame(results)

def load_and_process_data(path='datos/datos_filtrados_sin_encoding.csv'):
    """Load the sequence dataset from CSV and add the derived columns.

    The CSV must contain the columns ``genus``, ``se``, ``sequence`` and
    ``gc_content``; ``se`` is renamed to ``as``.

    Args:
        path: Location of the CSV file. Defaults to the path this notebook
            has always used.

    Returns:
        A DataFrame with the four source columns plus:
          * ``ps``: ``sequence`` padded with ``'N'`` to the longest length,
          * ``len_sequence``, ``len_ps``, ``len_as``: string lengths,
          * ``clases_modelos``: integer id per distinct ``genus``, numbered
            in order of first appearance.
    """
    print("Loading dataset...")
    df = pd.read_csv(path)
    df = df[['genus', 'se', 'sequence', 'gc_content']]
    df = df.rename(columns={'se': 'as'})

    print("Processing sequences...")
    # Sequence padding
    maxlen = max(len(seq) for seq in df['sequence'])
    df['ps'] = pad_sequences(df['sequence'], maxlen)

    # Add length information
    for column in ('sequence', 'ps', 'as'):
        df[f'len_{column}'] = [len(value) for value in df[column]]

    # Class mapping
    map_genus = {genus: index for index, genus in enumerate(df['genus'].unique())}
    df['clases_modelos'] = df['genus'].map(map_genus)

    print(f"Dataset loaded: {len(df)} sequences")
    print(f"Max original sequence length: {maxlen}")
    # .iloc reads the first row by position, independent of index labels.
    print(f"Padded sequence length: {df['len_ps'].iloc[0]}")
    print(f"AS sequence length: {df['len_as'].iloc[0]}")

    return df

def ensure_directory(path):
    """Create ``path`` (including missing parents) if it does not exist.

    Prints a message only when the directory was actually missing. Does
    nothing if anything already exists at ``path``.

    Args:
        path: Directory path to create.
    """
    if os.path.exists(path):
        return
    # exist_ok=True closes the window between the check above and the
    # creation: if another process creates the directory in between, the
    # call no longer raises FileExistsError.
    os.makedirs(path, exist_ok=True)
    print(f"Created directory: {path}")

# Make sure the output folder exists before anything is written to it
results_dir = 'results/encoding_time'
ensure_directory(results_dir)

# Read the CSV and build the derived columns
df = load_and_process_data()

# Benchmark the dataset replicated 1x..5x, with 50 runs per encoding
multipliers = [1, 2, 3, 4, 5]
all_results = []
banner = '=' * 60

for mult in multipliers:
    print(f"\n{banner}")
    print(f"RUNNING BENCHMARK WITH MULTIPLIER {mult}x")
    print(banner)

    results = run_combined_benchmarks(df, n_runs=50, multiplier=mult)
    all_results.append(results)

# Stack the per-multiplier tables into one
combined_results = pd.concat(all_results, ignore_index=True)

# Persist everything in a single CSV
results_filepath = os.path.join(results_dir, 'benchmark_results.csv')
combined_results.to_csv(results_filepath, index=False)
print(f"All results saved: {results_filepath}")

# Per-multiplier summary, fastest encoding first
print(f"\n{banner}")
print("BENCHMARK RESULTS SUMMARY")
print(banner)

summary_columns = ['encoding', 'avg_time', 'std_time', 'avg_memory', 'std_memory']

for mult in multipliers:
    mult_results = combined_results[combined_results['multiplier'] == mult]
    n_seqs = mult_results['n_sequences'].iloc[0]
    print(f"\nMultiplier {mult}x (N={n_seqs} sequences):")
    print("-" * 50)

    display_results = mult_results[summary_columns].sort_values('avg_time')
    print(display_results.to_string(index=False))

    # Rows are sorted by time, so the first one is the fastest encoding.
    fastest = display_results.iloc[0]
    leanest = display_results.nsmallest(1, 'avg_memory').iloc[0]
    print(f"\nFastest encoding: {fastest['encoding']} ({fastest['avg_time']:.3f}s)")
    print(f"Most memory efficient: {leanest['encoding']} ({leanest['avg_memory']:.2f} MB)")

print(f"\nAll results saved in: {results_filepath}")
print("Benchmark completed!")