In [1]:

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None

# Load the filtered, not-yet-encoded dataset and expose the raw 'sequence'
# column under the name the rest of the notebook uses ('aligned_sequence').
df = (
    pd.read_csv('datos/datos_filtrados_sin_encoding.csv')
    .rename(columns={'sequence': 'aligned_sequence'})
)

In [3]:
# Genus distribution of the rows flagged as not belonging to the training split.
df.loc[df['is_training'] == False, 'genus'].value_counts()

genus
Streptomyces         802
Pelagibacter         657
Pseudomonas_E        560
Streptococcus        343
Mycobacterium        257
Flavobacterium       245
Microbacterium       162
Prochlorococcus_A    113
Bradyrhizobium       109
Sphingomonas         104
Corynebacterium       81
Vibrio                63
Arthrobacter          60
Chryseobacterium      52
Acinetobacter         50
Nocardioides          49
Rhizobium             29
Collinsella           28
Micromonospora        23
Mesorhizobium         22
Nocardia              17
Bifidobacterium        8
Pelagibacter_A         4
Name: count, dtype: int64

In [3]:
# df=df[['genus', 'se', 'sequence','gc_content']]
# df=df.rename(columns={'se':'as'})

In [4]:
def pad_sequences(sequences, maxlen):
    """Force every sequence to exactly ``maxlen`` characters.

    Sequences shorter than ``maxlen`` are right-padded with 'N'; sequences
    that are ``maxlen`` or longer are cut down to their first ``maxlen``
    characters.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to normalise.
    maxlen : int
        Target length of every returned sequence.

    Returns
    -------
    list of str
        One padded or truncated string per input sequence, in input order.
    """
    return [
        seq + 'N' * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen]
        for seq in sequences
    ]

# Pad every original sequence up to the length of the longest one.
maxlen = max(map(len, df['original_sequence']))
df['padded_sequences'] = pad_sequences(df['original_sequence'], maxlen)

In [5]:
# df['len_sequence']=[len(i) for i in df['sequence']]
# Record the character length of the padded and the aligned sequences.
df['len_ps'] = list(map(len, df['padded_sequences']))
df['len_as'] = list(map(len, df['aligned_sequence']))

In [6]:
# df=df[['genus', 'gc_content', 'sequence', 'len_sequence', 'ps','len_ps','as','len_as']]

In [7]:
# Assign each genus an integer class id, numbered in order of first appearance.
genus_labels = df['genus'].unique()
map_genus = dict(zip(genus_labels, range(len(genus_labels))))
df['clases_modelos'] = df['genus'].map(map_genus)

In [8]:
# Persist the genus -> class-id mapping so predictions can be decoded later.
class_mapping = pd.DataFrame(
    {'genus': map_genus.keys(), 'model_class': map_genus.values()}
)
class_mapping.to_csv('datos/mapeo_clases.csv', index=False)

In [9]:
import pywt
from scipy.fft import fft
from itertools import product

def fourier(sequences, is_str=True):
    """Compute a one-sided FFT magnitude spectrum for every sequence.

    Parameters
    ----------
    sequences : iterable
        Sequences to transform. Strings when ``is_str`` is truthy, otherwise
        numeric sequences that are handed to ``fft`` unchanged.
    is_str : bool, default True
        When truthy, each character is converted to its code point with
        ``ord`` before the transform.

    Returns
    -------
    list of numpy.ndarray
        For each input, the magnitudes of FFT bins ``1`` to ``n // 2 - 1``:
        the bin at index 0 and the upper half of the spectrum are dropped.
    """
    spectra = []
    for seq in sequences:
        signal = [ord(char) for char in seq] if is_str else seq
        magnitudes = np.abs(fft(signal))
        spectra.append(magnitudes[1:len(magnitudes)//2])
    return spectra

def generate_kmers_dict(k, unique_chars=set('ACGNT')):
    """Build a lookup table from every possible k-mer to an integer index.

    The alphabet is de-duplicated and sorted before the k-mers are
    enumerated. Iterating a ``set`` of strings directly yields an order that
    can change between interpreter runs (string hash randomization), which
    would give the same k-mer a different index after a kernel restart.
    Sorting makes the encoding reproducible.

    Parameters
    ----------
    k : int
        Length of the k-mers.
    unique_chars : iterable of str, default set('ACGNT')
        Alphabet the k-mers are drawn from.

    Returns
    -------
    dict
        Maps each k-mer string to its position in the lexicographic
        enumeration of all ``len(alphabet) ** k`` k-mers.
    """
    alphabet = sorted(set(unique_chars))

    # product() over a sorted alphabet enumerates the k-mers lexicographically.
    return {
        ''.join(letters): index
        for index, letters in enumerate(product(alphabet, repeat=k))
    }


def k_mers(sequencias, k=3, unique_chars=set('ACGNT')):
    """Encode each sequence as the indices of its overlapping k-mers.

    Parameters
    ----------
    sequencias : iterable of str
        Sequences to encode.
    k : int, default 3
        Window length.
    unique_chars : set of str, default set('ACGNT')
        Alphabet forwarded to ``generate_kmers_dict``.

    Returns
    -------
    list of list of int
        For each sequence, one index per window of length ``k`` taken with
        stride 1 (``len(seq) - k + 1`` windows).

    Raises
    ------
    KeyError
        If a window is not a key of the table built from ``unique_chars``.
    """
    index_of = generate_kmers_dict(k, unique_chars)
    encoded = []
    for seq in sequencias:
        n_windows = len(seq) - k + 1
        encoded.append([index_of[seq[start:start + k]] for start in range(n_windows)])
    return encoded

def one_hot(sequences, max_len, unique_chars=set('ACGNT'), reshape=True):
    """One-hot encode sequences over a fixed alphabet.

    The alphabet is de-duplicated and sorted so that every character always
    maps to the same column. Deriving the columns from the iteration order of
    a ``set`` of strings is not stable between interpreter runs (string hash
    randomization), which would silently permute the features after a kernel
    restart.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to encode.
    max_len : int
        Number of positions (rows) in each encoding. Characters beyond
        ``max_len`` are ignored; positions past the end of a shorter sequence
        stay all-zero.
    unique_chars : iterable of str, default set('ACGNT')
        Alphabet; one column per distinct character, in sorted order.
    reshape : bool, default True
        When truthy each encoding is flattened to 1-D
        (``max_len * len(alphabet)``); otherwise it is returned as a
        ``(max_len, len(alphabet))`` matrix.

    Returns
    -------
    list of numpy.ndarray
        One float array per input sequence.

    Raises
    ------
    KeyError
        If a sequence contains a character that is not in ``unique_chars``.
    """
    alphabet = sorted(set(unique_chars))
    column_of = {char: idx for idx, char in enumerate(alphabet)}

    encoded = []
    for seq in sequences:
        matrix = np.zeros((max_len, len(alphabet)))
        # zip stops at the shorter of the two, giving the truncate/zero-fill
        # behaviour described above.
        for row, char in zip(matrix, seq):
            row[column_of[char]] = 1
        encoded.append(matrix.reshape(-1) if reshape else matrix)
    return encoded

def wavelet(sequences, numeric=False, wavelet='db1', level=5):
    """Compute a multilevel discrete wavelet decomposition of every sequence.

    ``level`` is passed to ``pywt.wavedec`` by keyword. The third positional
    parameter of ``pywt.wavedec`` is ``mode`` (signal extension), so passing
    ``level`` positionally selected an extension mode and left the
    decomposition level at its default.
    NOTE(review): features generated before this fix were not produced with
    the requested level and should be regenerated.

    Parameters
    ----------
    sequences : iterable
        Sequences to transform. Strings unless ``numeric`` is truthy.
    numeric : bool, default False
        When falsy, each character is converted to its code point with
        ``ord`` first; when truthy, the sequence is used as given.
    wavelet : str, default 'db1'
        Wavelet name forwarded to ``pywt.wavedec``.
    level : int, default 5
        Decomposition level forwarded to ``pywt.wavedec``.

    Returns
    -------
    list of numpy.ndarray
        For each input, all coefficient arrays returned by ``pywt.wavedec``
        concatenated into a single 1-D array.
    """
    coefficients = []
    for seq in sequences:
        signal = seq if numeric else [ord(char) for char in seq]
        bands = pywt.wavedec(signal, wavelet, level=level)
        coefficients.append(np.concatenate(bands))
    return coefficients

In [10]:
df.columns

Index(['sequence_id', 'aligned_sequence', 'original_sequence',
       'sequence_length', 'domain', 'phylum', 'class', 'order', 'family',
       'genus', 'species', 'is_training', 'padded_sequences', 'len_ps',
       'len_as', 'clases_modelos'],
      dtype='object')

In [11]:
# Encode the aligned sequences; every derived column carries the 'AS_' prefix.
aligned = df['aligned_sequence'].values
aligned_len = len(df['aligned_sequence'][0])  # width taken from the row labelled 0

# First-level encodings computed directly from the sequence strings.
df['AS_One Hot'] = one_hot(aligned, aligned_len)
df['AS_K-mers'] = k_mers(aligned)
df['AS_FFT'] = fourier(aligned)
df['AS_Wavelet'] = wavelet(aligned)

# Second-level encodings: spectral transforms of the numeric encodings above.
df['AS_K-mers + FFT'] = fourier(df['AS_K-mers'].values, False)
df['AS_One Hot + FFT'] = fourier(df['AS_One Hot'].values, False)
df['AS_K-mers + Wavelet'] = wavelet(df['AS_K-mers'].values, True)
df['AS_One Hot + Wavelet'] = wavelet(df['AS_One Hot'].values, True)

In [12]:
# Encode the padded sequences; every derived column carries the 'PS_' prefix.
padded = df['padded_sequences'].values
padded_len = len(df['padded_sequences'][0])  # width taken from the row labelled 0

# First-level encodings computed directly from the sequence strings.
df['PS_One Hot'] = one_hot(padded, padded_len)
df['PS_K-mers'] = k_mers(padded)
df['PS_FFT'] = fourier(padded)
df['PS_Wavelet'] = wavelet(padded)

# Second-level encodings: spectral transforms of the numeric encodings above.
df['PS_K-mers + FFT'] = fourier(df['PS_K-mers'].values, False)
df['PS_One Hot + FFT'] = fourier(df['PS_One Hot'].values, False)
df['PS_K-mers + Wavelet'] = wavelet(df['PS_K-mers'].values, True)
df['PS_One Hot + Wavelet'] = wavelet(df['PS_One Hot'].values, True)

In [None]:
# df.to_parquet('datos/encoded_data.parquet', index=False)
# df.to_csv('datos/encoded_data.csv', index=False, sep='^')