In [21]:
import sys
import os
import random

import pandas as pd

from tqdm import tqdm
from random import choice
from pathlib import Path

# src_dir = Path.cwd().parent
# sys.path.append(str(src_dir))
import utils

# Dataset Creation
This dataset helps to create our final dataset with the given splits (Training Set, Test Set, Random Test Set)

In [3]:
WORKING_DIR = "/work/dlclarge1/matusd-rpi/RPI/" # change this to your working directory
RESULTS_DIR = "data/annotations/"
INTER_DIR = "data/interactions/"
EMB_DIR = "data/embeddings/"
RNAINTER_DIR = "data/RNAInter/"

os.chdir(WORKING_DIR)

if not os.path.exists(INTER_DIR):
    os.makedirs(INTER_DIR)
    
if not os.path.exists(EMB_DIR):
    os.makedirs(EMB_DIR)
    
# limit on number of interactions per protein/RNA
PROTEIN_INTER = 150
RNA_INTER = 150

## Step 1: Data preparation

In [4]:
# Get RNAInter DB with interaction data

# Create directory
if not os.path.exists(RNAINTER_DIR):
    os.makedirs(RNAINTER_DIR)

rnainter_path = RNAINTER_DIR + "Download_data_RP.txt"

# Download data
if not os.path.exists(rnainter_path):
    os.chdir(RNAINTER_DIR)

    !wget http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz
    !tar -xf Download_data_RP.tar.gz
    !rm Download_data_RP.tar.gz

    os.chdir(WORKING_DIR)

In [5]:
# Prepare RNA sequences DataFrame
rna_sequences = pd.read_parquet(os.path.join(RESULTS_DIR, 'rna_short_families.parquet'), engine='pyarrow')

# Drop entries lacking crucial information or duplicated
rna_sequences = rna_sequences.dropna(subset=['Sequence_1', 'Raw_ID1', 'Sequence_1_family'])
rna_sequences = rna_sequences.drop_duplicates(subset=['Raw_ID1'])

print(f"Number of RNA sequences: {rna_sequences.shape[0]:,}")

# Prepare protein sequences DataFrame
protein_sequences = pd.read_parquet(os.path.join(RESULTS_DIR, 'proteins_short_clans.parquet'), engine='pyarrow')

# Drop entries lacking crucial information or duplicated
protein_sequences = protein_sequences.dropna(subset=['Sequence_2', 'Raw_ID2', 'Sequence_2_clan'])
protein_sequences= protein_sequences.drop_duplicates(subset=['Raw_ID2'])

print(f"Number of protein sequences: {protein_sequences.shape[0]:,}")

Number of RNA sequences: 7,847
Number of protein sequences: 26,575


In [6]:
# Load raw RNAInter database
rna_inter_df = utils.load_rna_inter_csv(rnainter_path)
print(f"Number of interactions in RNAInter: {rna_inter_df.shape[0]:,}")

Number of interactions in RNAInter: 37,067,587


In [7]:
# Merge RNAInter and sequences DataFrames
rpi_df = rna_inter_df.merge(rna_sequences, on='Raw_ID1', how='inner').merge(protein_sequences, on='Raw_ID2', how='inner')

# Create interaction label for classification
rpi_df['interaction'] = True

print(f"Number of (positive) interactions with annotated entries: {rpi_df.shape[0]:,} \n")
print(f"RPI dataframe columns: \n {rpi_df.columns.to_list()} \n")
print(f"RPI sample entries: \n {rpi_df.head(3).to_string()}")

# Save RPI DataFrame
rpi_df.to_parquet(os.path.join(INTER_DIR, 'raw_interactions.parquet'), engine='pyarrow')

Number of (positive) interactions with annotated entries: 488,184 

RPI dataframe columns: 
 ['RNAInterID', 'Interactor1.Symbol', 'Category1', 'Species1', 'Interactor2.Symbol', 'Category2', 'Species2', 'Raw_ID1', 'Raw_ID2', 'score', 'strong', 'weak', 'predict', 'Sequence_1_rfam_q_accession', 'Sequence_1_family', 'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description', 'Sequence_1_rfam_e_value', 'Sequence_1', 'Sequence_1_len', 'Sequence_1_ID', 'Sequence_2_clan', 'Sequence_2_ID', 'Sequence_2', 'Sequence_2_len', 'interaction'] 

RPI sample entries: 
    RNAInterID Interactor1.Symbol Category1      Species1 Interactor2.Symbol Category2      Species2         Raw_ID1     Raw_ID2   score strong       weak                                    predict Sequence_1_rfam_q_accession Sequence_1_family Sequence_1_rfam_t_accession              Sequence_1_rfam_description  Sequence_1_rfam_e_value                                                                                                         

## Step 2: Data cleaning

In [8]:
rpi_df = pd.read_parquet(os.path.join(INTER_DIR, 'raw_interactions.parquet'), engine='pyarrow')

# Remove mRNA interactors (ESM-2 model was trained only on non-coding RNAs)
len_before = rpi_df.shape[0]
rpi_df = rpi_df[rpi_df['Category1'].str.lower() != 'mrna']
print(f"Removed {len_before-rpi_df.shape[0]} mRNA interactions.")

# Remove interactions with more than one occurence and with duplicated both interactors
rpi_df = rpi_df.drop_duplicates(subset=['RNAInterID'])
rpi_df = rpi_df.drop_duplicates(subset=['Raw_ID1', 'Raw_ID2'])
print(f"Number of interactions after removing duplicates: {rpi_df.shape[0]:,}")

# Limit number of interactions per protein/RNA
limit_rpi_df = rpi_df.groupby(by=['Raw_ID1']).filter(lambda x: len(x) < RNA_INTER)
limit_rpi_df = limit_rpi_df.groupby(by=['Raw_ID2']).filter(lambda x: len(x) < PROTEIN_INTER)
print(f"Number of interactions after limiting number of interactions per protein/RNA: {limit_rpi_df.shape[0]:,} \n")
limit_rpi_df.to_parquet(os.path.join(INTER_DIR, 'limited_interactions.parquet'), engine='pyarrow')

Removed 25010 mRNA interactions.
Number of interactions after removing duplicates: 463,133
Number of interactions after limiting number of interactions per protein/RNA: 40,891 



In [9]:
# Quick data analysis
print(f"Number of unique RNA sequences: {limit_rpi_df['Sequence_1'].nunique():,}")
print(f"Number of RNA families: {limit_rpi_df['Sequence_1_family'].nunique():,} \n")
print(f"Number of unique protein sequences: {limit_rpi_df['Sequence_2'].nunique():,}")
print(f"Number of protein clans: {limit_rpi_df['Sequence_2_clan'].nunique():,}")

Number of unique RNA sequences: 4,177
Number of RNA families: 1,148 

Number of unique protein sequences: 1,325
Number of protein clans: 152


In [10]:
# Save unique RNA and protein sequences for the embedding stage
unique_proteins = limit_rpi_df.drop_duplicates(subset=['Sequence_2_ID', 'Sequence_2'])
unique_RNA = limit_rpi_df.drop_duplicates(subset=['Sequence_1_ID', 'Sequence_1'])

unique_proteins.to_parquet(os.path.join(EMB_DIR, 'unique_proteins.parquet'), engine='pyarrow')
unique_RNA.to_parquet(os.path.join(EMB_DIR, 'unique_RNA.parquet'), engine='pyarrow')

## Step 3: Negative interactions

In [18]:
# Get unique RNA families and protein clans
unique_rna_families = set(limit_rpi_df['Sequence_1_family'])
unique_protein_clans = set(limit_rpi_df['Sequence_2_clan'])
unique_protein_categories = set(limit_rpi_df['Category2'])
unique_rna_categories = set(limit_rpi_df['Category1'])

# Initialize dictionaries
non_interacting_clans_per_rna_family = {family: set(unique_protein_clans) for family in unique_rna_families}
non_interacting_families_per_protein_clan = {clan: set(unique_rna_families) for clan in unique_protein_clans}
interacting_rna_categories_per_clan = {clan: set() for clan in unique_protein_clans} 
interacting_protein_categories_per_family = {family: set() for family in unique_rna_families}

# Precompute clan and family categories
clan_categories = {clan: limit_rpi_df[limit_rpi_df['Sequence_2_clan'] == clan]['Category2'].iloc[0] for clan in unique_protein_clans}
family_categories = {family: limit_rpi_df[limit_rpi_df['Sequence_1_family'] == family]['Category1'].iloc[0] for family in unique_rna_families}

# Update dictionaries by removing interacting pairs
for _, row in limit_rpi_df.iterrows():
    rna_family = row['Sequence_1_family']
    protein_clan = row['Sequence_2_clan']
    rna_category = row['Category1']
    protein_category = row['Category2']
    
    if rna_family in non_interacting_clans_per_rna_family and protein_clan in non_interacting_clans_per_rna_family[rna_family]:
        non_interacting_clans_per_rna_family[rna_family].remove(protein_clan)
        interacting_rna_categories_per_clan[protein_clan].add(rna_category)

    if protein_clan in non_interacting_families_per_protein_clan and rna_family in non_interacting_families_per_protein_clan[protein_clan]:
        non_interacting_families_per_protein_clan[protein_clan].remove(rna_family)
        interacting_protein_categories_per_family[rna_family].add(protein_category)

# Convert sets to lists (if required)
non_interacting_clans_per_rna_family = {k: list(v) for k, v in non_interacting_clans_per_rna_family.items()}
non_interacting_families_per_protein_clan = {k: list(v) for k, v in non_interacting_families_per_protein_clan.items()}

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in non_interacting_clans_per_rna_family.values()])/len(non_interacting_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in non_interacting_families_per_protein_clan.items() if len(families) == 0])}\n")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in non_interacting_families_per_protein_clan.values()])/len(non_interacting_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in non_interacting_clans_per_rna_family.items() if len(clans) == 0])}")

Average number of clans per family: 139.52
Number of clans with no non-interacting families: 0

Average number of families per clan: 1053.74
Number of families with no non-interacting clans: 0


In [19]:
# Create copies of the original dictionaries
filtered_clans_per_rna_family = non_interacting_clans_per_rna_family.copy()
filtered_families_per_protein_clan = non_interacting_families_per_protein_clan.copy()

# Filter non-interacting clans per RNA family using interacting categories
for family, clans in filtered_clans_per_rna_family.items():
    interacting_categories = interacting_protein_categories_per_family[family]
    filtered_clans_per_rna_family[family] = [
        clan for clan in clans if clan_categories[clan] not in interacting_categories
    ]

# Filter non-interacting families per protein clan using interacting categories
for clan, families in filtered_families_per_protein_clan.items():
    interacting_categories = interacting_rna_categories_per_clan[clan]
    filtered_families_per_protein_clan[clan] = [
        family for family in families if family_categories[family] not in interacting_categories
    ]

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in filtered_clans_per_rna_family.values()])/len(filtered_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in filtered_families_per_protein_clan.items() if len(families) == 0]):,}\n")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in filtered_families_per_protein_clan.values()])/len(filtered_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in filtered_clans_per_rna_family.items() if len(clans) == 0]):,}")

Average number of clans per family: 74.26
Number of clans with no non-interacting families: 0

Average number of families per clan: 370.10
Number of families with no non-interacting clans: 215


In [20]:
# Revert clans with no non-interacting families to the original non-interacting set
for clan in filtered_families_per_protein_clan:
    if not filtered_families_per_protein_clan[clan]:
        filtered_families_per_protein_clan[clan] = non_interacting_families_per_protein_clan[clan]

# Revert families with no non-interacting clans to the original non-interacting set
for family in filtered_clans_per_rna_family:
    if not filtered_clans_per_rna_family[family]:
        filtered_clans_per_rna_family[family] = non_interacting_clans_per_rna_family[family]

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in filtered_clans_per_rna_family.values()])/len(filtered_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in filtered_families_per_protein_clan.values()])/len(filtered_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in filtered_families_per_protein_clan.items() if len(families) == 0]):,}")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in filtered_clans_per_rna_family.items() if len(clans) == 0]):,}")

Average number of clans per family: 99.01
Average number of families per clan: 370.10
Number of clans with no non-interacting families: 0
Number of families with no non-interacting clans: 0


In [22]:
limit_rpi_df.columns

Index(['RNAInterID', 'Interactor1.Symbol', 'Category1', 'Species1',
       'Interactor2.Symbol', 'Category2', 'Species2', 'Raw_ID1', 'Raw_ID2',
       'score', 'strong', 'weak', 'predict', 'Sequence_1_rfam_q_accession',
       'Sequence_1_family', 'Sequence_1_rfam_t_accession',
       'Sequence_1_rfam_description', 'Sequence_1_rfam_e_value', 'Sequence_1',
       'Sequence_1_len', 'Sequence_1_ID', 'Sequence_2_clan', 'Sequence_2_ID',
       'Sequence_2', 'Sequence_2_len', 'interaction'],
      dtype='object')

In [None]:
def randomly_select_protein_from_clan(clan, df):
    proteins_in_clan = df[df['Sequence_2_clan'] == clan]
    if not proteins_in_clan.empty:
        selected_protein = random.choice(proteins_in_clan.to_dict(orient='records'))
        return {
            'Interactor2.Symbol': selected_protein['Interactor2.Symbol'],
            'Category2': selected_protein['Category2'],
            'Species2': selected_protein['Species2'],
            'Sequence_2': selected_protein['Sequence_2'],
            'Sequence_2_ID': selected_protein['Sequence_2_ID'],
            'Raw_ID2': selected_protein['Raw_ID2'],
            'Sequence_2_clan': selected_protein['Sequence_2_clan'],
            'Sequence_2_len': selected_protein['Sequence_2_len']
        }
    else:
        return {}
    
def randomly_select_rna_from_family(family, df):
    rnas_in_family = df[df['Sequence_1_family'] == family]
    if not rnas_in_family.empty:
        selected_rna = random.choice(rnas_in_family.to_dict(orient='records'))
        return {
            'Interactor1.Symbol': selected_rna['Interactor1.Symbol'],
            'Category1': selected_rna['Category1'],
            'Species1': selected_rna['Species1'],
            'Sequence_1': selected_rna['Sequence_1'],
            'Sequence_1_ID': selected_rna['Sequence_1_ID'],
            'Raw_ID1': selected_rna['Raw_ID1'],
            'Sequence_1_family': selected_rna['Sequence_1_family'],
            'Sequence_1_len': selected_rna['Sequence_1_len']
            
        }
    else:
        return {}

In [33]:
# Assuming limit_rpi_df is your original DataFrame
negative_interactions = []

for _, row in tqdm(limit_rpi_df.iterrows(), total=limit_rpi_df.shape[0], desc="Generating Negative Interactions"):
    # Negative interaction based on RNA
    rna_family = row['Sequence_1_family']
    non_interacting_clans = filtered_clans_per_rna_family.get(rna_family, [])
    if non_interacting_clans:
        chosen_clan = random.choice(non_interacting_clans)
        # Assuming you have a function to randomly select a protein from a clan
        chosen_protein = randomly_select_protein_from_clan(chosen_clan, limit_rpi_df)
        negative_entry = row.copy()
        negative_entry.update(chosen_protein)  # Update the entry with the chosen protein's information
        negative_entry['interaction'] = False
        negative_entry['RNAInterID'] = f"N_{row['RNAInterID']}"
        negative_interactions.append(negative_entry)

    # Negative interaction based on Protein
    protein_clan = row['Sequence_2_clan']
    non_interacting_families = filtered_families_per_protein_clan.get(protein_clan, [])
    if non_interacting_families:
        chosen_family = random.choice(non_interacting_families)
        # Assuming you have a function to randomly select an RNA from a family
        chosen_rna = randomly_select_rna_from_family(chosen_family, limit_rpi_df)
        negative_entry = row.copy()
        negative_entry.update(chosen_rna)  # Update the entry with the chosen RNA's information
        negative_entry['interaction'] = False
        negative_entry['RNAInterID'] = f"N_{row['RNAInterID']}"
        negative_interactions.append(negative_entry)

# Convert the list of negative interactions to a DataFrame
negative_interactions_df = pd.DataFrame(negative_interactions)

# Concatenate the original DataFrame with the negative interactions DataFrame
all_interactions_df = pd.concat([limit_rpi_df, negative_interactions_df], ignore_index=True)
all_interactions_df.to_parquet(os.path.join(INTER_DIR, 'all_interactions.parquet'), engine='pyarrow')

Generating Negative Interactions: 100%|██████████| 40891/40891 [04:26<00:00, 153.46it/s]


## Step 4: Train/Test split

In [34]:
all_interactions_df = pd.read_parquet(os.path.join(INTER_DIR, 'all_interactions.parquet'), engine='pyarrow')

unique_families = all_interactions_df['Sequence_1_family'].unique()
test_set_size = int(len(all_interactions_df) * 0.10)

# Select RNA families for test set
selected_families = []
accumulated_size = 0
for family in unique_families:
    family_size = len(all_interactions_df[all_interactions_df['Sequence_1_family'] == family])
    if accumulated_size + family_size <= test_set_size:
        selected_families.append(family)
        accumulated_size += family_size
    if accumulated_size >= test_set_size:
        break

# Split the Data
test_df = all_interactions_df[all_interactions_df['Sequence_1_family'].isin(selected_families)]
train_df = all_interactions_df[~all_interactions_df['Sequence_1_family'].isin(selected_families)]
assert len(test_df) + len(train_df) == len(all_interactions_df)

# Count of unique RNA families in each set
unique_families_train = train_df['Sequence_1_family'].nunique()
unique_families_test = test_df['Sequence_1_family'].nunique()
assert unique_families_train + unique_families_test == len(unique_families)

# Verify the splits
print(f"Total data points: {len(all_interactions_df)}")
print(f"Training set size: {len(train_df)} -- {round(train_df.shape[0] / all_interactions_df.shape[0] * 100, 2)} % -- {unique_families_train} unique RNA families")
print(f"Test set size: {len(test_df)} -- {round(test_df.shape[0] / all_interactions_df.shape[0] * 100, 2)} % -- {unique_families_test} unique RNA families")

# Save the Data
train_df.to_parquet(os.path.join(INTER_DIR, 'train_set.parquet'), engine='pyarrow')
test_df.to_parquet(os.path.join(INTER_DIR, 'test_set.parquet'), engine='pyarrow')

Total data points: 122673
Training set size: 110406 -- 90.0 % -- 1059 unique RNA families
Test set size: 12267 -- 10.0 % -- 89 unique RNA families
