In [1]:
from pathlib import Path
import pandas as pd
import sys

from random import choice
import os


# The project-local `utils` module is looked up one level above the notebook's
# current working directory, so that directory is added to the import path first.
src_dir = Path.cwd().parent
sys.path.append(os.fspath(src_dir))
import utils

# Dataset Creation
This notebook creates our final dataset with the given splits (Training Set, Test Set, Random Test Set).

In [2]:
# Project root. The notebook changes into this directory, so every relative
# path below is resolved against it.
WORKING_DIR = "/work/dlclarge1/matusd-rpi/RPI/" # change this to your working directory
RESULTS_DIR = "data/annotations/"   # annotated RNA / protein sequence tables (read in Step 1)
INTER_DIR = "data/interactions/"    # interaction tables written by this notebook
EMB_DIR = "data/embeddings/"        # unique sequences saved for the embedding stage
RNAINTER_DIR = "data/RNAInter/"     # raw RNAInter download

os.chdir(WORKING_DIR)

# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of a separate os.path.exists() test.
for output_dir in (INTER_DIR, EMB_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Threshold on the number of interactions per protein/RNA: Step 2 keeps only
# interactors with strictly fewer interactions than these values.
PROTEIN_INTER = 150
RNA_INTER = 150

## Step 1: Data preparation

In [3]:
# Get RNAInter DB with interaction data

# Create directory
if not os.path.exists(RNAINTER_DIR):
    os.makedirs(RNAINTER_DIR)

rnainter_path = RNAINTER_DIR + "Download_data_RP.txt"

# Download data
if not os.path.exists(rnainter_path):
    os.chdir(RNAINTER_DIR)

    !wget http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz
    !tar -xf Download_data_RP.tar.gz
    !rm Download_data_RP.tar.gz

    os.chdir(WORKING_DIR)

In [24]:
# RNA sequences: load, keep only fully annotated rows, one row per Raw_ID1
rna_parquet = os.path.join(RESULTS_DIR, 'rna_short_families.parquet')
rna_sequences = (
    pd.read_parquet(rna_parquet, engine='pyarrow')
    .dropna(subset=['Sequence_1', 'Raw_ID1', 'Sequence_1_family'])
    .drop_duplicates(subset=['Raw_ID1'])
)
print(f"Number of RNA sequences: {len(rna_sequences):,}")

# Protein sequences: same treatment, one row per Raw_ID2
protein_parquet = os.path.join(RESULTS_DIR, 'proteins_short_clans.parquet')
protein_sequences = (
    pd.read_parquet(protein_parquet, engine='pyarrow')
    .dropna(subset=['Sequence_2', 'Raw_ID2', 'Sequence_2_clan'])
    .drop_duplicates(subset=['Raw_ID2'])
)
print(f"Number of protein sequences: {len(protein_sequences):,}")

Number of RNA sequences: 7,847
Number of protein sequences: 26,575


In [25]:
# Read the raw RNAInter table through the project helper and report its row count
rna_inter_df = utils.load_rna_inter_csv(rnainter_path)
n_rnainter_rows = len(rna_inter_df)
print(f"Number of interactions in RNAInter: {n_rnainter_rows:,}")

Number of interactions in RNAInter: 37,067,587


In [26]:
# Join the RNAInter records with the RNA annotations (on Raw_ID1) and then with
# the protein annotations (on Raw_ID2). Inner joins keep only interactions for
# which both interactors have an annotated sequence. Every surviving row is a
# known interaction, so it gets the positive classification label.
rpi_df = (
    rna_inter_df
    .merge(rna_sequences, how='inner', on='Raw_ID1')
    .merge(protein_sequences, how='inner', on='Raw_ID2')
    .assign(interaction=True)
)

n_positive_pairs = len(rpi_df)
column_names = rpi_df.columns.to_list()
sample_rows = rpi_df.head(3).to_string()
print(f"Number of (positive) interactions with annotated entries: {n_positive_pairs:,} \n")
print(f"RPI dataframe columns: \n {column_names} \n")
print(f"RPI sample entries: \n {sample_rows}")

# Persist the merged table for the cleaning step
raw_interactions_path = os.path.join(INTER_DIR, 'raw_interactions.parquet')
rpi_df.to_parquet(raw_interactions_path, engine='pyarrow')

Number of (positive) interactions with annotated entries: 488,184 

RPI dataframe columns: 
 ['RNAInterID', 'Interactor1.Symbol', 'Category1', 'Species1', 'Interactor2.Symbol', 'Category2', 'Species2', 'Raw_ID1', 'Raw_ID2', 'score', 'strong', 'weak', 'predict', 'Sequence_1_rfam_q_accession', 'Sequence_1_family', 'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description', 'Sequence_1_rfam_e_value', 'Sequence_1', 'Sequence_1_len', 'Sequence_1_ID', 'Sequence_2_clan', 'Sequence_2_ID', 'Sequence_2', 'Sequence_2_len', 'interaction'] 

RPI sample entries: 
    RNAInterID Interactor1.Symbol Category1      Species1 Interactor2.Symbol Category2      Species2         Raw_ID1     Raw_ID2   score strong       weak                                    predict Sequence_1_rfam_q_accession Sequence_1_family Sequence_1_rfam_t_accession              Sequence_1_rfam_description  Sequence_1_rfam_e_value                                                                                                         

## Step 2: Data cleaning

In [27]:
# Reload the merged positives so this step can be run independently of Step 1
rpi_df = pd.read_parquet(os.path.join(INTER_DIR, 'raw_interactions.parquet'), engine='pyarrow')

# Remove mRNA interactors (ESM-2 model was trained only on non-coding RNAs)
# NOTE(review): ESM-2 is a protein language model; the remark above presumably
# refers to the RNA model used downstream — confirm which model is meant.
len_before = rpi_df.shape[0]
rpi_df = rpi_df[rpi_df['Category1'].str.lower() != 'mrna']
print(f"Removed {len_before-rpi_df.shape[0]} mRNA interactions.")

# Remove interactions with more than one occurrence and with both interactors duplicated
rpi_df = rpi_df.drop_duplicates(subset=['RNAInterID'])
rpi_df = rpi_df.drop_duplicates(subset=['Raw_ID1', 'Raw_ID2'])
print(f"Number of interactions after removing duplicates: {rpi_df.shape[0]:,}")

# Limit number of interactions per protein/RNA.
# Any RNA with RNA_INTER or more interactions is removed together with all of
# its rows (interactors are dropped, not capped); the same is then done for
# proteins, with protein counts taken AFTER the RNA filter.
# A vectorised group-size mask replaces the per-group Python lambda of
# groupby().filter(); it selects the same rows in the same order.
rna_group_size = rpi_df.groupby(by=['Raw_ID1'])['Raw_ID1'].transform('size')
limit_rpi_df = rpi_df[rna_group_size < RNA_INTER]
protein_group_size = limit_rpi_df.groupby(by=['Raw_ID2'])['Raw_ID2'].transform('size')
limit_rpi_df = limit_rpi_df[protein_group_size < PROTEIN_INTER]
print(f"Number of interactions after limiting number of interactions per protein/RNA: {limit_rpi_df.shape[0]:,} \n")
limit_rpi_df.to_parquet(os.path.join(INTER_DIR, 'limited_interactions.parquet'), engine='pyarrow')

Removed 25010 mRNA interactions.
Number of interactions after removing duplicates: 463,133
Number of interactions after limiting number of interactions per protein/RNA: 40,891 



In [28]:
# Diversity summary of the filtered interactions: distinct sequences and
# distinct family / clan labels on each side.
n_rna_sequences = limit_rpi_df['Sequence_1'].nunique()
n_rna_families = limit_rpi_df['Sequence_1_family'].nunique()
n_protein_sequences = limit_rpi_df['Sequence_2'].nunique()
n_protein_clans = limit_rpi_df['Sequence_2_clan'].nunique()

print(f"Number of unique RNA sequences: {n_rna_sequences:,}")
print(f"Number of RNA families: {n_rna_families:,} \n")
print(f"Number of unique protein sequences: {n_protein_sequences:,}")
print(f"Number of protein clans: {n_protein_clans:,}")

Number of unique RNA sequences: 4,177
Number of RNA families: 1,148 

Number of unique protein sequences: 1,325
Number of protein clans: 152


In [8]:
# Keep one row per distinct (ID, sequence) pair on each side and store the
# results for the embedding stage. All columns of the kept rows are written.
unique_RNA = limit_rpi_df.drop_duplicates(subset=['Sequence_1_ID', 'Sequence_1'])
unique_proteins = limit_rpi_df.drop_duplicates(subset=['Sequence_2_ID', 'Sequence_2'])

proteins_out = os.path.join(EMB_DIR, 'unique_proteins.parquet')
rna_out = os.path.join(EMB_DIR, 'unique_RNA.parquet')
unique_proteins.to_parquet(proteins_out, engine='pyarrow')
unique_RNA.to_parquet(rna_out, engine='pyarrow')

## Step 3: Negative interactions

In [9]:
# Positive interactions produced by the cleaning step
limited_path = os.path.join(INTER_DIR, 'limited_interactions.parquet')
df = pd.read_parquet(limited_path, engine='pyarrow')
n_positive = len(df)
print(f"Number of positive interactions: {n_positive:,}")

# Negatives built around the RNA interactor (second argument 1)
# NOTE(review): how the helper samples negatives is not visible here; if it is
# random, seed it for reproducibility — confirm in utils.
neg_rna_df = utils.create_negative_dataset_per_interactor(df, 1)
n_neg_rna = len(neg_rna_df)
print(f"Number of negative interactions based on RNA interactor: {n_neg_rna:,}")

# Negatives built around the protein interactor (second argument 2)
neg_protein_df = utils.create_negative_dataset_per_interactor(df, 2)
n_neg_protein = len(neg_protein_df)
print(f"Number of negative interactions based on protein interactor: {n_neg_protein:,}")

Number of positive interactions: 13,330


  1%|▏         | 182/13330 [00:01<01:38, 133.29it/s]

100%|██████████| 13330/13330 [01:25<00:00, 156.34it/s]


Number of negative interactions based on RNA interactor: 13,330


100%|██████████| 13330/13330 [01:25<00:00, 156.51it/s]


Number of negative interactions based on protein interactor: 13,330


In [10]:
# Stack the positives and both negative sets into one table (original row
# indices are kept), write it to parquet, then report the total row count.
interaction_parts = [df, neg_rna_df, neg_protein_df]
all_interactions_df = pd.concat(interaction_parts)
all_interactions_path = os.path.join(INTER_DIR, 'all_interactions.parquet')
all_interactions_df.to_parquet(all_interactions_path, engine='pyarrow')
print(f"Number of all interactions: {len(all_interactions_df):,}")

Number of all interactions: 39,990


## Step 4: Train/Test split

In [11]:
# Reload the combined positives + negatives so this step can run on its own
all_interactions_df = pd.read_parquet(os.path.join(INTER_DIR, 'all_interactions.parquet'), engine='pyarrow')

# Fraction of all rows targeted for the test set
TEST_FRACTION = 0.10

family_column = all_interactions_df['Sequence_1_family']
unique_families = family_column.unique()  # order of first appearance
test_set_size = int(len(all_interactions_df) * TEST_FRACTION)

# Rows per family, counted once up front instead of re-scanning the whole
# DataFrame for every family inside the loop.
family_sizes = family_column.value_counts().to_dict()

# Select RNA families for test set: walk the families in order of first
# appearance and greedily take each one that still fits into the target size,
# so whole families end up on one side of the split only.
selected_families = []
accumulated_size = 0
for family in unique_families:
    # Missing labels are not counted by value_counts; treat them as size 0,
    # which matches what an equality comparison against them would yield.
    family_size = family_sizes.get(family, 0)
    if accumulated_size + family_size <= test_set_size:
        selected_families.append(family)
        accumulated_size += family_size
    if accumulated_size >= test_set_size:
        break

# Split the Data (membership mask computed once and reused for both sides)
in_test = family_column.isin(selected_families)
test_df = all_interactions_df[in_test]
train_df = all_interactions_df[~in_test]
assert len(test_df) + len(train_df) == len(all_interactions_df)

# Count of unique RNA families in each set; the counts must add up to the
# total, i.e. no family is shared between train and test.
unique_families_train = train_df['Sequence_1_family'].nunique()
unique_families_test = test_df['Sequence_1_family'].nunique()
assert unique_families_train + unique_families_test == len(unique_families)

# Verify the splits
print(f"Total data points: {len(all_interactions_df)}")
print(f"Training set size: {len(train_df)} -- {round(train_df.shape[0] / all_interactions_df.shape[0] * 100, 2)} % -- {unique_families_train} unique RNA families")
print(f"Test set size: {len(test_df)} -- {round(test_df.shape[0] / all_interactions_df.shape[0] * 100, 2)} % -- {unique_families_test} unique RNA families")

# Save the Data
train_df.to_parquet(os.path.join(INTER_DIR, 'train_set.parquet'), engine='pyarrow')
test_df.to_parquet(os.path.join(INTER_DIR, 'test_set.parquet'), engine='pyarrow')

Total data points: 39990
Training set size: 35992 -- 90.0 % -- 862 unique RNA families
Test set size: 3998 -- 10.0 % -- 50 unique RNA families
