# Exporting Unique Sequences
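This notebook collects the RNA and protein sequences from the source databases, removes duplicate entries, and exports them as Parquet files (plus a pickle of the RNA sequences).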


In [75]:
import os
import pickle
import pandas as pd

In [76]:
# Root directory that contains the database and results folders.
# It can be overridden without editing the notebook by setting the
# RPI_WORKING_DIR environment variable; the default is unchanged.
WORKING_DIR = os.environ.get("RPI_WORKING_DIR", "/work/dlclarge1/matusd-rpi/RPI") # change this to your working directory
DB_DIR = "RNAProteinInteractions"  # input parquet databases, relative to WORKING_DIR
RESULTS_DIR = "data/annotations"  # output files, relative to WORKING_DIR

# sequence length limitations
PROTEIN_LEN = 1024  # upper bound (inclusive) used for the "short" protein export
RNA_LEN = 150  # upper bound (inclusive) used for the "short" RNA export

# Every relative path used below is resolved against WORKING_DIR.
os.chdir(WORKING_DIR)

# Create the output directory up front so the export cells cannot fail on a
# missing folder.
os.makedirs(RESULTS_DIR, exist_ok=True)

## Protein Sequences

In [77]:
# Parquet files that contribute protein records.
seq_databases = [
    os.path.join(DB_DIR, 'NCBI/ncbi_proteins.parquet'),
    os.path.join(DB_DIR, 'UniProt/protein_uniprot.parquet'),
]

# Read every database first and concatenate once at the end. Growing a frame with
# pd.concat inside the loop re-copies all previously loaded rows on each iteration,
# and seeding it with an empty DataFrame is discouraged by pandas.
protein_frames = []
for db in seq_databases:
    temp_df = pd.read_parquet(db, engine='pyarrow')
    # The database name is the folder that holds the parquet file; deriving it via
    # os.path keeps working when DB_DIR is nested or uses another path separator.
    db_name = os.path.basename(os.path.dirname(db))
    print(f"Adding database {db_name} with {temp_df.shape[0]:,} entries.")
    protein_frames.append(temp_df)
    del temp_df

protein_df = pd.concat(protein_frames)
del protein_frames

print(f"\n Columns: \n {protein_df.columns} \n")
print(f"Sample protein entry: \n {protein_df.iloc[0]} \n")
print(f"Number of entries:\t\t{protein_df.shape[0]:,}")

# Discard rows without a sequence, then keep one row per (ID, sequence) pair.
# The same sequence may still appear under several IDs.
protein_df = protein_df.dropna(subset=['Sequence_2'])
protein_df = protein_df.drop_duplicates(subset=['Raw_ID2', 'Sequence_2'])
protein_df['Sequence_2_shuffle'] = False # placeholder column for indicating interaction with RNA

print(f"Number of entries after dropping duplicates based on Raw_ID2 & Sequence 2: {protein_df.shape[0]:,}")
print(f"Number of unique sequences:\t{protein_df['Sequence_2'].nunique():,}")

protein_df.to_parquet(os.path.join(RESULTS_DIR, 'protein_sequences.parquet'), engine='pyarrow')

Adding database NCBI with 284,157 entries.
Adding database UniProt with 3,069 entries.

 Columns: 
 Index(['Raw_ID2', 'Sequence_2_ID', 'Sequence_2', 'Sequence_2_len'], dtype='object') 

Sample protein entry: 
 Raw_ID2                                                  NCBI:69865
Sequence_2_ID                                            1434110739
Sequence_2        mesnhksgdglsgtqkeaalralvqrtgyslvqengqrkyggpppg...
Sequence_2_len                                                  587
Name: 0, dtype: object 

Number of entries:		287,226


Number of entries after dropping duplicates based on Raw_ID2 & Sequence 2: 167,227
Number of unique sequences:	166,473


In [78]:
# Keep only the proteins whose length is at most PROTEIN_LEN residues and
# export that subset to its own parquet file.
protein_df_short = protein_df.loc[protein_df['Sequence_2_len'].le(PROTEIN_LEN)]

print(f"Number of protein sequences shorter or equal to {PROTEIN_LEN}: {len(protein_df_short):,}/{len(protein_df):,}")

protein_df_short.to_parquet(
    os.path.join(RESULTS_DIR, 'protein_sequences_short.parquet'),
    engine='pyarrow',
)

Number of protein sequences shorter or equal to 1024: 133,069/167,227


## RNA Sequences

In [79]:
# Parquet files that contribute RNA records.
seq_databases = [
    os.path.join(DB_DIR, 'Ensembl/Ensembl.parquet'),
    os.path.join(DB_DIR, 'miRBase/miRNA.parquet'),
    os.path.join(DB_DIR, 'NCBI/ncbi_rna.parquet'),
    os.path.join(DB_DIR, 'NONCODE/NONCODE.parquet'),
]

# Read every database first and concatenate once at the end. Growing a frame with
# pd.concat inside the loop re-copies all previously loaded rows on each iteration,
# and seeding it with an empty DataFrame is discouraged by pandas.
rna_frames = []
for db in seq_databases:
    temp_df = pd.read_parquet(db, engine='pyarrow')
    # The database name is the folder that holds the parquet file; deriving it via
    # os.path keeps working when DB_DIR is nested or uses another path separator.
    db_name = os.path.basename(os.path.dirname(db))
    print(f"Adding database: {db_name} with {temp_df.shape[0]:,} entries.")
    rna_frames.append(temp_df)
    del temp_df

rna_df = pd.concat(rna_frames)
del rna_frames

print(f"\n Columns: \n {rna_df.columns} \n")
print(f"Sample RNA entry: \n {rna_df.iloc[0]} \n")
print(f"Number of entries:\t\t{rna_df.shape[0]:,}")

# Discard rows without a sequence, then keep one row per (ID, sequence) pair.
# The same sequence may still appear under several IDs.
rna_df = rna_df.dropna(subset=['Sequence_1'])
rna_df = rna_df.drop_duplicates(subset=['Raw_ID1', 'Sequence_1'])
# Some sources carry a stray 'index' column; errors='ignore' keeps the cell working
# if the inputs are regenerated without it.
rna_df = rna_df.drop(columns=['index'], errors='ignore')
rna_df['Sequence_1_shuffle'] = False # placeholder column for indicating interaction with protein

print(f"Number of entries after dropping duplicates based on Raw_ID1 & Sequence 1: {rna_df.shape[0]:,}")
print(f"Number of unique sequences:\t{rna_df['Sequence_1'].nunique():,}")

rna_df.to_parquet(os.path.join(RESULTS_DIR, 'rna_sequences.parquet'), engine='pyarrow')

Adding database: Ensembl with 41,086 entries.
Adding database: miRBase with 3,803 entries.
Adding database: NCBI with 493,849 entries.
Adding database: NONCODE with 25,819 entries.

 Columns: 
 Index(['Raw_ID1', 'Sequence_1', 'Sequence_1_len', 'Sequence_1_ID', 'index'], dtype='object') 

Sample RNA entry: 
 Raw_ID1                                  Ensembl:ENSMUSG00000108652
Sequence_1        AACAGAACAAAAAUGGAGAGAAGCAAAGACAGAGCCAUUAAAUAUC...
Sequence_1_len                                                34996
Sequence_1_ID                                    ENSMUSG00000108652
index                                                           NaN
Name: 0, dtype: object 

Number of entries:		564,557
Number of entries after dropping duplicates based on Raw_ID1 & Sequence 1: 543,935
Number of unique sequences:	539,133


In [80]:
# Keep only the RNAs whose length is at most RNA_LEN bps and export that
# subset to its own parquet file.
rna_df_short = rna_df.loc[rna_df['Sequence_1_len'].le(RNA_LEN)]

print(f"Number of RNA sequences equal to or shorter than {RNA_LEN}: {len(rna_df_short):,}/{len(rna_df):,}")

rna_df_short.to_parquet(
    os.path.join(RESULTS_DIR, 'rna_sequences_short.parquet'),
    engine='pyarrow',
)

Number of RNA sequences equal to or shorter than 150: 11,289/543,935


In [81]:
# Store a pickle of (Raw_ID1, Sequence_1) pairs, ordered by sequence length
# (specific for RNA-FM).
#
# NOTE(review): the original comment and the message below say "unique short RNA
# sequences", but the list is built from the full `rna_df`, not `rna_df_short`, and
# rows are de-duplicated on (Raw_ID1, Sequence_1), so a sequence can repeat under
# different IDs. Confirm which is intended before relying on the label.

rna_df = rna_df.sort_values(by=['Sequence_1_len'])
# Pair the two columns directly instead of iterrows(), which builds a Series for
# every row and is very slow on a frame of this size; the resulting tuples are the same.
rna_sequences = list(zip(rna_df['Raw_ID1'], rna_df['Sequence_1']))

print(f"Number of unique short RNA sequences: {len(rna_sequences):,}")
with open(os.path.join(RESULTS_DIR, 'unique_rna_sequences.pickle'), 'wb') as file:
    pickle.dump(rna_sequences, file)

Number of unique short RNA sequences: 543,935
