# Exporting Unique Sequences
This notebook exports the RNA and protein sequences collected from the external databases to parquet files.


## Protein Sequences

In [1]:
import pandas as pd
import os
import pickle
from pathlib import Path

In [2]:
# Input databases are read from DB_PATH; exported files are written to RESULTS_PATH.
DB_PATH = "dataset/external_databases"
RESULTS_PATH = "dataset/results/"

# Make sure the output directory (and any missing parents) exists before writing.
os.makedirs(RESULTS_PATH, exist_ok=True)

In [3]:
# Build the protein sequence table from the NCBI and UniProt parquet files
# and export it to RESULTS_PATH/protein_sequences.parquet.
seq_databases = [
    os.path.join(DB_PATH, 'NCBI/ncbi_proteins.parquet'),
    os.path.join(DB_PATH, 'UniProt/protein_uniprot.parquet'),
]
# Read every database and concatenate once, instead of growing the frame inside
# a loop (which re-copies all accumulated rows on each iteration).
all_df = pd.concat([pd.read_parquet(db, engine='pyarrow') for db in seq_databases])
print(f"Number of entries:\t\t{all_df.shape[0]:,}")
print(f"Number of unique sequences:\t{all_df['Sequence_2'].nunique():,}")
# Rows without a sequence are dropped before export.
all_df = all_df.dropna(subset=['Sequence_2'])
# Constant False flag column.
# NOTE(review): presumably marks these as original (non-shuffled) sequences — confirm against downstream use.
all_df['Sequence_2_shuffle'] = False
# Keep one row per (Raw_ID2, Sequence_2) pair.
all_df = all_df.drop_duplicates(subset=['Raw_ID2', 'Sequence_2'])
print(f"Number of entries after dropping duplicates based on Raw_ID2 & Sequence 2: {all_df.shape[0]:,}")
all_df.to_parquet(os.path.join(RESULTS_PATH, 'protein_sequences.parquet'), engine='pyarrow')

Number of entries:		287,226
Number of unique sequences:	166,473
Number of entries after dropping duplicates based on Raw_ID2 & Sequence 2: 167,227


In [4]:
# Reload the exported protein table and store the subset of rows whose
# Sequence_2_len is at most 1024.
protein_parquet_path = os.path.join(RESULTS_PATH, 'protein_sequences.parquet')
all_df = pd.read_parquet(protein_parquet_path, engine='pyarrow')

is_short = all_df['Sequence_2_len'].le(1024)
all_df_short = all_df.loc[is_short]
# Report "<kept>/<total>" row counts.
print(f"{all_df_short.shape[0]:,}/{all_df.shape[0]:,}")

protein_short_path = os.path.join(RESULTS_PATH, 'protein_sequences_short.parquet')
all_df_short.to_parquet(protein_short_path, engine='pyarrow')

133,069/167,227


## RNA Sequences

In [7]:
# Build the RNA sequence table from the Ensembl, miRBase, NCBI and NONCODE
# parquet files and export it to RESULTS_PATH/rna_sequences.parquet.
seq_databases = [
    os.path.join(DB_PATH, 'Ensembl/Ensembl.parquet'),
    os.path.join(DB_PATH, 'miRBase/miRNA.parquet'),
    os.path.join(DB_PATH, 'NCBI/ncbi_rna.parquet'),
    os.path.join(DB_PATH, 'NONCODE/NONCODE.parquet'),
]
# Read every database and concatenate once, instead of growing the frame inside
# a loop (which re-copies all accumulated rows on each iteration).
all_df = pd.concat([pd.read_parquet(db, engine='pyarrow') for db in seq_databases])
print(f"Number of entries:\t\t{all_df.shape[0]:,}")
print(f"Number of unique sequences:\t{all_df['Sequence_1'].nunique():,}")
# Rows without a sequence are dropped before export.
all_df = all_df.dropna(subset=['Sequence_1'])
# Remove the 'index' column carried over from the source files.
all_df = all_df.drop(['index'], axis=1)
# Constant False flag column.
# NOTE(review): presumably marks these as original (non-shuffled) sequences — confirm against downstream use.
all_df['Sequence_1_shuffle'] = False
all_df.to_parquet(os.path.join(RESULTS_PATH, 'rna_sequences.parquet'), engine='pyarrow')

Number of entries:		564,557
Number of unique sequences:	539,133


In [None]:
# Reload the exported RNA table so the cells below can run without rebuilding it.
rna_parquet_path = os.path.join(RESULTS_PATH, 'rna_sequences.parquet')
all_df = pd.read_parquet(rna_parquet_path, engine='pyarrow')

In [8]:
# store rna sequences shorter than 150 bps
all_df_short = all_df[all_df['Sequence_1_len'] <= 80]
print(f"{all_df_short.shape[0]:,}/{all_df.shape[0]:,}")
all_df_short.to_parquet(os.path.join(RESULTS_PATH, 'rna_sequences_short.parquet'), engine='pyarrow')

3,151/564,557


In [None]:
# Store a pickle file with (Raw_ID1, Sequence_1) pairs of the RNA sequences
# (specific for RNA-FM).
# NOTE(review): no de-duplication is done in this cell even though the output
# file is named "unique" — confirm whether duplicates are expected here.
all_df = all_df.sort_values(by=['Sequence_1_len'])
all_df = all_df.dropna(subset=['Sequence_1'])
print(f"Number of sequences: {all_df.shape[0]}")
# Filter out sequences whose Sequence_1_len is greater than 1024.
all_df = all_df[all_df['Sequence_1_len'] <= 1024]
# itertuples(name=None) yields plain (Raw_ID1, Sequence_1) tuples directly,
# avoiding the per-row Series that iterrows constructs.
rna_sequences = list(all_df[['Raw_ID1', 'Sequence_1']].itertuples(index=False, name=None))
print(len(rna_sequences))
# NOTE(review): the pickle is written to the current working directory, not RESULTS_PATH.
with open('unique_rna_sequences.pickle', 'wb') as file:
    pickle.dump(rna_sequences, file)