# Create one-hot encodings for RNA and protein sequences

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
def create_embeddings(df: pd.DataFrame, enc_size: int, alphabet: dict, seq_key: str):
    """One-hot encode every sequence in one column of ``df``.

    Rows are processed in ascending order of the ``f"{seq_key}_ID_Unique"``
    column, so the outputs follow that order rather than the input order.

    Args:
        df: Table holding the sequences in column ``seq_key`` and a sort key
            in column ``f"{seq_key}_ID_Unique"``.
        enc_size: Number of positions (rows) in each encoded matrix. Positions
            past the end of a shorter sequence stay all-zero.
        alphabet: Mapping from upper-case character to its integer column
            index in the one-hot matrix.
        seq_key: Name of the column that contains the sequence strings.

    Returns:
        A tuple ``(seqs, encoded_seqs)``: ``seqs`` is a list with one 1-D
        integer array of alphabet indices per sequence, and ``encoded_seqs``
        is an integer array of shape ``(len(df), enc_size, len(alphabet))``.

    Raises:
        TypeError: If a value in ``seq_key`` is not a string.
        ValueError: If a sequence is longer than ``enc_size`` or contains a
            character that is not in ``alphabet``.
    """
    enc_dimension = len(alphabet)  # loop-invariant: width of every one-hot row
    seqs = []
    encoded_seqs = []
    df = df.sort_values(by=[f"{seq_key}_ID_Unique"])
    # Only one column is needed, so iterate it directly instead of building a
    # Series per row with iterrows(); the index label is kept for messages.
    for label, seq in tqdm(df[seq_key].items(), total=len(df)):
        if not isinstance(seq, str):
            raise TypeError(
                f"row {label!r}: expected a str in column {seq_key!r}, "
                f"got {type(seq).__name__}"
            )
        if len(seq) > enc_size:
            raise ValueError(
                f"row {label!r}: sequence length {len(seq)} exceeds "
                f"enc_size={enc_size}"
            )
        try:
            # dtype=int keeps an empty sequence usable as an index array
            # (np.array([]) would default to float64 and break the indexing).
            enc_seq = np.array([alphabet[elm.upper()] for elm in seq], dtype=int)
        except KeyError as err:
            raise ValueError(
                f"row {label!r}: character {err.args[0]!r} is not in the alphabet"
            ) from err
        one_enc_seq = np.zeros((enc_size, enc_dimension), dtype=int)
        # Position i gets a 1 in the column of its alphabet index.
        one_enc_seq[np.arange(enc_seq.size), enc_seq] = 1
        seqs.append(enc_seq)
        encoded_seqs.append(one_enc_seq)
    if not encoded_seqs:
        # np.stack rejects an empty list; return a correctly shaped empty array.
        return seqs, np.zeros((0, enc_size, enc_dimension), dtype=int)
    return seqs, np.stack(encoded_seqs)

In [None]:
def collect_alphabet(df: pd.DataFrame, seq_key: str = 'Sequence_2'):
    """Collect the distinct upper-cased characters used in a sequence column.

    The characters are sorted and numbered from 0; the resulting mapping is
    printed (so it can be copied into a cell) and also returned.

    Args:
        df: Table holding the sequences.
        seq_key: Name of the column to scan. Defaults to ``'Sequence_2'``.

    Returns:
        A dict mapping each upper-cased character to its index in sorted order.

    Raises:
        TypeError: If a value in ``seq_key`` is not a string.
    """
    letters = set()
    # Only one column is needed, so iterate it directly instead of iterrows().
    for label, seq in tqdm(df[seq_key].items(), total=len(df)):
        if not isinstance(seq, str):
            raise TypeError(
                f"row {label!r}: expected a str in column {seq_key!r}, "
                f"got {type(seq).__name__}"
            )
        letters.update(elm.upper() for elm in seq)
    mapping = {letter: idx for idx, letter in enumerate(sorted(letters))}
    print(mapping)
    return mapping


In [None]:
# Load the unique RNAs and one-hot encode their 'Sequence_1' column.
unique_RNAs = pd.read_parquet('../results/unique_RNAs.parquet')
# RNA letter -> one-hot column index, in the order C, G, A, U.
rna_alphabet = {base: idx for idx, base in enumerate('CGAU')}
# Each RNA is encoded into a matrix with 150 positions (rows).
_, rna_embeddings = create_embeddings(
    unique_RNAs,
    enc_size=150,
    alphabet=rna_alphabet,
    seq_key='Sequence_1',
)

In [None]:
# Load the unique proteins and one-hot encode their 'Sequence_2' column.
unique_proteins = pd.read_parquet('../results/unique_proteins.parquet')
# NOTE(review): the mapping below looks like the printed output of the
# commented-out call (sorted letters numbered from 0) — confirm before reuse.
# collect_alphabet(unique_proteins)
protein_alphabet = {
    residue: idx for idx, residue in enumerate('ACDEFGHIKLMNPQRSTVWXY')
}
# Each protein is encoded into a matrix with 1024 positions (rows).
_, protein_embeddings = create_embeddings(
    unique_proteins,
    enc_size=1024,
    alphabet=protein_alphabet,
    seq_key='Sequence_2',
)

In [None]:
# Write both one-hot arrays to .npy files under ../results/.
np.save(file='../results/simple_rna_embeddings.npy', arr=rna_embeddings)
np.save(file='../results/simple_protein_embeddings.npy', arr=protein_embeddings)