# RPI2825 dataset preparation
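This notebook prepares the independent RPI2825 dataset for use as a test set: it filters the raw interactions by sequence length, assigns embedding IDs to the unique sequences, and saves the resulting interaction and unique-sequence tables.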

In [1]:
import os
import pandas as pd

In [2]:
# Project root. Every other path in this notebook is relative to it.
WORKING_DIR = "/work/dlclarge1/matusd-rpi/RPI" # change this to your working directory
INTER_DIR = "data/interactions"       # receives the *_test_set.parquet interaction tables
EMB_DIR = "data/embeddings/rpi2825"   # created here; nothing in this notebook writes to it
ANNOT_DIR = "data/annotations"        # receives the unique protein / RNA tables

# Raw RPI2825 interaction table (CSV input)
RPI2825_PATH = "data/rpi2825/RPI2825.csv"

# sequence length limitations (interactions with a longer sequence are dropped)
PROTEIN_LEN = 1024
RNA_LEN = 1024

os.chdir(WORKING_DIR)

# Make sure every output directory exists before a later cell writes into it.
# ANNOT_DIR is included because the "Unique sequences" cell saves parquet files
# there; exist_ok=True makes the cell safe to re-run.
for out_dir in (INTER_DIR, EMB_DIR, ANNOT_DIR):
    os.makedirs(out_dir, exist_ok=True)

## Quick overview of RPI2825 data

In [3]:
# Load the raw RPI2825 interaction table.
rpi2825_df = pd.read_csv(RPI2825_PATH)

# Row counts: overall and per interaction type.
is_nonribo = rpi2825_df['type'] == 'nonribo'
is_ribo = rpi2825_df['type'] == 'ribo'
print(f"Number of interactions: {rpi2825_df.shape[0]}")
print(f"Number of non-ribosomal interactions: {int(is_nonribo.sum())}")
print(f"Number of ribosomal interactions: {int(is_ribo.sum())}\n")

# Distinct sequence counts; dropna=False counts a missing value as its own
# entry, exactly as len(series.unique()) does.
n_unique_proteins = rpi2825_df['protein_seq'].nunique(dropna=False)
n_unique_rnas = rpi2825_df['rna_seq'].nunique(dropna=False)
print(f"Number of unique protein sequences: {n_unique_proteins}")
print(f"Number of unique RNA sequences: {n_unique_rnas}\n")

Number of interactions: 2825
Number of non-ribosomal interactions: 390
Number of ribosomal interactions: 2435

Number of unique protein sequences: 881
Number of unique RNA sequences: 460



## Data filtering

In [4]:
# Keep only interactions whose protein AND RNA sequences fit within the
# maximum input lengths of the embedding models.
protein_fits = rpi2825_df['protein_seq'].str.len() <= PROTEIN_LEN
rna_fits = rpi2825_df['rna_seq'].str.len() <= RNA_LEN
rpi2825_df = rpi2825_df[protein_fits & rna_fits]

# Report what survived the length filter.
n_interactions = len(rpi2825_df)
n_unique_proteins = rpi2825_df['protein_seq'].nunique(dropna=False)
n_unique_rnas = rpi2825_df['rna_seq'].nunique(dropna=False)
print(f"Number of interactions after filtering out sequences longer than defined limit: {n_interactions} \n")
print(f"Number of unique protein sequences: {n_unique_proteins}")
print(f"Number of unique RNA sequences: {n_unique_rnas} \n")

Number of interactions after filtering out sequences longer than defined limit: 871 

Number of unique protein sequences: 467
Number of unique RNA sequences: 398 



## Dataset completion

In [5]:
# Bring the frame into the shared dataset layout in one chained step:
#   * rna_seq -> Sequence_1 and protein_seq -> Sequence_2, matching the
#     format of the other datasets
#   * interaction: label column for the classification task (every row here
#     is a positive interaction)
#   * *_emb_ID: one integer ID per distinct sequence (groupby group number)
rpi2825_df = (
    rpi2825_df
    .rename(columns={'rna_seq': 'Sequence_1', 'protein_seq': 'Sequence_2'})
    .assign(
        interaction=True,
        Sequence_2_emb_ID=lambda df: df.groupby(['Sequence_2']).ngroup(),
        Sequence_1_emb_ID=lambda df: df.groupby(['Sequence_1']).ngroup(),
    )
)

# Split into the ribosomal-only and non-ribosomal subsets.
rpi2435_df = rpi2825_df.loc[rpi2825_df['type'] == 'ribo']
rpi390_df = rpi2825_df.loc[rpi2825_df['type'] == 'nonribo']

# Write the full set and both subsets to the interactions directory.
for frame, file_name in (
    (rpi2825_df, 'rpi2825_test_set.parquet'),
    (rpi2435_df, 'rpi2435_test_set.parquet'),
    (rpi390_df, 'rpi390_test_set.parquet'),
):
    frame.to_parquet(os.path.join(INTER_DIR, file_name), index=False)

# Summary: row count, column names, and the first three rows.
print(f"Number of (positive) interactions: {len(rpi2825_df):,} \n")
print(f"RPI dataframe columns: \n {list(rpi2825_df.columns)} \n")
print(f"RPI sample entries: \n {rpi2825_df.iloc[:3].to_string()}")


Number of (positive) interactions: 871 

RPI dataframe columns: 
 ['type', 'Sequence_2', 'Sequence_1', 'interaction', 'Sequence_2_emb_ID', 'Sequence_1_emb_ID'] 

RPI sample entries: 
       type                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Sequence_2                           Sequence_1  interaction  Sequence_2_emb_ID  Sequence_1_emb_

## Unique sequences

In [6]:
# One row per distinct protein: keep the first interaction row seen for each
# protein embedding ID, then save it to the annotations directory.
first_protein_row = ~rpi2825_df['Sequence_2_emb_ID'].duplicated()
unique_proteins = rpi2825_df.loc[first_protein_row]
proteins_path = os.path.join(ANNOT_DIR, 'rpi2825_unique_proteins.parquet')
unique_proteins.to_parquet(proteins_path, engine='pyarrow')

# Same for RNA: first interaction row per RNA embedding ID.
first_rna_row = ~rpi2825_df['Sequence_1_emb_ID'].duplicated()
unique_RNA = rpi2825_df.loc[first_rna_row]
rna_path = os.path.join(ANNOT_DIR, 'rpi2825_unique_rna.parquet')
unique_RNA.to_parquet(rna_path, engine='pyarrow')

# Report how many distinct sequences were saved.
n_proteins = unique_proteins['Sequence_2'].nunique()
n_rnas = unique_RNA['Sequence_1'].nunique()
print(f"Number of unique protein sequences: {n_proteins:,}")
print(f"Number of unique RNA sequences: {n_rnas:,}")


Number of unique protein sequences: 467
Number of unique RNA sequences: 398
