In [1]:
import sys
import os
import random

import pandas as pd

from tqdm import tqdm
from random import choice
from pathlib import Path

# src_dir = Path.cwd().parent
# sys.path.append(str(src_dir))
from utils import load_rna_inter_csv

# Dataset Creation
This dataset helps to create our final dataset with the given splits (Training Set, Test Set, Random Test Set)

In [2]:
WORKING_DIR = "/work/dlclarge1/matusd-rpi/RPI/" # change this to your working directory
ANNOT_DIR = "data/annotations/"
INTER_DIR = "data/interactions/"
EMB_DIR = "data/embeddings/"
RNAINTER_DIR = "data/RNAInter/"

os.chdir(WORKING_DIR)

if not os.path.exists(INTER_DIR):
    os.makedirs(INTER_DIR)
    
if not os.path.exists(EMB_DIR):
    os.makedirs(EMB_DIR)
    
# limit on number of interactions per protein/RNA
PROTEIN_INTER = 150
RNA_INTER = 150

TEST_SET_SIZE = 0.2

## Step 1: Data preparation

In [3]:
# Get RNAInter DB with interaction data

# Create directory
if not os.path.exists(RNAINTER_DIR):
    os.makedirs(RNAINTER_DIR)

rnainter_path = RNAINTER_DIR + "Download_data_RP.txt"

# Download data
if not os.path.exists(rnainter_path):
    os.chdir(RNAINTER_DIR)

    !wget http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz
    !tar -xf Download_data_RP.tar.gz
    !rm Download_data_RP.tar.gz

    os.chdir(WORKING_DIR)

In [4]:
# Prepare RNA sequences DataFrame
rna_sequences = pd.read_parquet(os.path.join(ANNOT_DIR, 'rna_short_families.parquet'), engine='pyarrow')

# Drop entries lacking crucial information or duplicated
rna_sequences = rna_sequences.dropna(subset=['Sequence_1', 'Raw_ID1', 'Sequence_1_family'])
rna_sequences = rna_sequences.drop_duplicates(subset=['Raw_ID1'])

print(f"Number of RNA sequences: {rna_sequences.shape[0]:,}")

# Prepare protein sequences DataFrame
protein_sequences = pd.read_parquet(os.path.join(ANNOT_DIR, 'proteins_short_clans.parquet'), engine='pyarrow')

# Drop entries lacking crucial information or duplicated
protein_sequences = protein_sequences.dropna(subset=['Sequence_2', 'Raw_ID2', 'Sequence_2_clan'])
protein_sequences= protein_sequences.drop_duplicates(subset=['Raw_ID2'])

print(f"Number of protein sequences: {protein_sequences.shape[0]:,}")

Number of RNA sequences: 7,847
Number of protein sequences: 26,575


In [5]:
# Load raw RNAInter database
rna_inter_df = load_rna_inter_csv(rnainter_path)
print(f"Number of interactions in RNAInter: {rna_inter_df.shape[0]:,}")

KeyboardInterrupt: 

In [None]:
# print all rna categories
print("RNA categories:")
print(rna_inter_df['Category1'].unique())

RNA categories:
['lncRNA' 'mRNA' 'nonsense_mediated_decay' 'retained_intron' 'unknown'
 'others' 'rRNA' 'pseudo' 'ncRNA' 'piRNA' 'TEC' 'circRNA' 'miRNA' 'snRNA'
 'snoRNA' 'non_stop_decay' 'tRNA' 'sncRNA' 'scRNA' 'ribozyme' 'sRNA'
 'miscRNA' 'scaRNA' 'unassigned RNA' 'Category1' 'Mt_tRNA' 'misc_RNA'
 'vtRNAs']


In [None]:
# Merge RNAInter and sequences DataFrames
rpi_df = rna_inter_df.merge(rna_sequences, on='Raw_ID1', how='inner').merge(protein_sequences, on='Raw_ID2', how='inner')

# Create interaction label for classification
rpi_df['interaction'] = True

print(f"Number of (positive) interactions with annotated entries: {rpi_df.shape[0]:,} \n")
print(f"RPI dataframe columns: \n {rpi_df.columns.to_list()} \n")
print(f"RPI sample entries: \n {rpi_df.head(3).to_string()}")

# Save RPI DataFrame
rpi_df.to_parquet(os.path.join(INTER_DIR, 'raw_interactions.parquet'), engine='pyarrow')

Number of (positive) interactions with annotated entries: 488,184 

RPI dataframe columns: 
 ['RNAInterID', 'Interactor1.Symbol', 'Category1', 'Species1', 'Interactor2.Symbol', 'Category2', 'Species2', 'Raw_ID1', 'Raw_ID2', 'score', 'strong', 'weak', 'predict', 'Sequence_1_rfam_q_accession', 'Sequence_1_family', 'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description', 'Sequence_1_rfam_e_value', 'Sequence_1', 'Sequence_1_len', 'Sequence_1_ID', 'Sequence_2_clan', 'Sequence_2_ID', 'Sequence_2', 'Sequence_2_len', 'interaction'] 

RPI sample entries: 
    RNAInterID Interactor1.Symbol Category1      Species1 Interactor2.Symbol Category2      Species2         Raw_ID1     Raw_ID2   score strong       weak                                    predict Sequence_1_rfam_q_accession Sequence_1_family Sequence_1_rfam_t_accession              Sequence_1_rfam_description  Sequence_1_rfam_e_value                                                                                                         

## Step 2: Data cleaning

In [None]:
rpi_df = pd.read_parquet(os.path.join(INTER_DIR, 'raw_interactions.parquet'), engine='pyarrow')

# Remove invalid RNA interactors
# (we remove all mRNAs as the RNA-FM model has been trained only on ncRNAs)
valid_rna = ['lncRNA', 'snRNA', 'snoRNA', 'scaRNA', 'miRNA', 'circRNA', 'rRNA',
       'ribozyme', 'sncRNA', 'misc_RNA', 'ncRNA', 'sRNA', 'Mt_tRNA', 'vtRNAs',
       'pseudo', 'others']
print(f"Invalid RNA interactors removed: \n {rpi_df[~rpi_df['Category1'].isin(valid_rna)]['Category1'].value_counts()} \n")
rpi_df = rpi_df[rpi_df['Category1'].isin(valid_rna)]
print(f"Number of interactions with valid RNA interactors: {rpi_df.shape[0]:,} \n")

# Remove invalid protein interactors
valid_protein = ['TF', 'RBP', 'protein']
print(f"Invalid protein interactors removed: \n {rpi_df[~rpi_df['Category2'].isin(valid_protein)]['Category2'].value_counts()} \n")
rpi_df = rpi_df[rpi_df['Category2'].isin(valid_protein)]
print(f"Number of interactions with valid protein interactors: {rpi_df.shape[0]:,} \n")

# Remove interactions with more than one occurence and with duplicated both interactors
rpi_df = rpi_df.drop_duplicates(subset=['RNAInterID'])
rpi_df = rpi_df.drop_duplicates(subset=['Raw_ID1', 'Raw_ID2'])
print(f"Number of interactions after removing duplicates: {rpi_df.shape[0]:,}")

# Limit number of interactions per protein/RNA
limit_rpi_df = rpi_df.groupby(by=['Raw_ID1']).filter(lambda x: len(x) < RNA_INTER)
limit_rpi_df = limit_rpi_df.groupby(by=['Raw_ID2']).filter(lambda x: len(x) < PROTEIN_INTER)
print(f"Number of interactions after limiting number of interactions per protein/RNA: {limit_rpi_df.shape[0]:,} \n")
limit_rpi_df.to_parquet(os.path.join(INTER_DIR, 'limited_interactions.parquet'), engine='pyarrow')

Invalid RNA interactors removed: 
 Category1
mRNA                       25010
retained_intron              128
nonsense_mediated_decay       41
unknown                       17
Name: count, dtype: int64 

Number of interactions with valid RNA interactors: 462,988 

Invalid protein interactors removed: 
 Series([], Name: count, dtype: int64) 

Number of interactions with valid protein interactors: 462,988 

Number of interactions after removing duplicates: 462,947
Number of interactions after limiting number of interactions per protein/RNA: 40,744 



In [None]:
# Quick data analysis
print(f"Number of unique RNA sequences: {limit_rpi_df['Sequence_1'].nunique():,}")
print(f"Number of RNA families: {limit_rpi_df['Sequence_1_family'].nunique():,} \n")

# for embeddings, we use "Sequence_1_ID" as an identifier of RNA sequences
assert limit_rpi_df['Raw_ID1'].nunique() == limit_rpi_df['Sequence_1_ID'].nunique()

print(f"Number of unique protein sequences: {limit_rpi_df['Sequence_2'].nunique():,}")
print(f"Number of protein clans: {limit_rpi_df['Sequence_2_clan'].nunique():,}")

# for embeddings, we use "Sequence_2_ID" as an identifier of protein sequences
assert limit_rpi_df['Raw_ID2'].nunique() == limit_rpi_df['Sequence_2_ID'].nunique()

Number of unique RNA sequences: 4,169
Number of RNA families: 1,148 

Number of unique protein sequences: 1,308
Number of protein clans: 152


In [None]:
# Create unique embedding IDs for RNA and protein sequences
limit_rpi_df['Sequence_2_emb_ID'] = limit_rpi_df.groupby(['Sequence_2']).ngroup()
unique_proteins = limit_rpi_df.drop_duplicates(subset=['Sequence_2_emb_ID'])
print(f"Number of unique protein sequences: {unique_proteins['Sequence_2'].nunique():,}")
unique_proteins.to_parquet(os.path.join(ANNOT_DIR, 'unique_proteins.parquet'), engine='pyarrow')

limit_rpi_df['Sequence_1_emb_ID'] = limit_rpi_df.groupby(['Sequence_1']).ngroup()
unique_RNA = limit_rpi_df.drop_duplicates(subset=['Sequence_1_emb_ID'])
print(f"Number of unique RNA sequences: {unique_RNA['Sequence_1'].nunique():,}")
unique_RNA.to_parquet(os.path.join(ANNOT_DIR, 'unique_rna.parquet'), engine='pyarrow')

Number of unique protein sequences: 1,308
Number of unique RNA sequences: 4,169


## Step 3: Negative interactions

In [None]:
# Get unique RNA families and protein clans
unique_rna_families = set(limit_rpi_df['Sequence_1_family'])
unique_protein_clans = set(limit_rpi_df['Sequence_2_clan'])
unique_protein_categories = set(limit_rpi_df['Category2'])
unique_rna_categories = set(limit_rpi_df['Category1'])

# Initialize dictionaries
non_interacting_clans_per_rna_family = {family: set(unique_protein_clans) for family in unique_rna_families}
non_interacting_families_per_protein_clan = {clan: set(unique_rna_families) for clan in unique_protein_clans}
interacting_rna_categories_per_clan = {clan: set() for clan in unique_protein_clans} 
interacting_protein_categories_per_family = {family: set() for family in unique_rna_families}

# Precompute clan and family categories
clan_categories = {clan: limit_rpi_df[limit_rpi_df['Sequence_2_clan'] == clan]['Category2'].iloc[0] for clan in unique_protein_clans}
family_categories = {family: limit_rpi_df[limit_rpi_df['Sequence_1_family'] == family]['Category1'].iloc[0] for family in unique_rna_families}

# Update dictionaries by removing interacting pairs
for _, row in limit_rpi_df.iterrows():
    rna_family = row['Sequence_1_family']
    protein_clan = row['Sequence_2_clan']
    rna_category = row['Category1']
    protein_category = row['Category2']
    
    if rna_family in non_interacting_clans_per_rna_family and protein_clan in non_interacting_clans_per_rna_family[rna_family]:
        non_interacting_clans_per_rna_family[rna_family].remove(protein_clan)
        interacting_rna_categories_per_clan[protein_clan].add(rna_category)

    if protein_clan in non_interacting_families_per_protein_clan and rna_family in non_interacting_families_per_protein_clan[protein_clan]:
        non_interacting_families_per_protein_clan[protein_clan].remove(rna_family)
        interacting_protein_categories_per_family[rna_family].add(protein_category)

# Convert sets to lists (if required)
non_interacting_clans_per_rna_family = {k: list(v) for k, v in non_interacting_clans_per_rna_family.items()}
non_interacting_families_per_protein_clan = {k: list(v) for k, v in non_interacting_families_per_protein_clan.items()}

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in non_interacting_clans_per_rna_family.values()])/len(non_interacting_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in non_interacting_families_per_protein_clan.items() if len(families) == 0])}\n")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in non_interacting_families_per_protein_clan.values()])/len(non_interacting_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in non_interacting_clans_per_rna_family.items() if len(clans) == 0])}")

Average number of clans per family: 139.53
Number of clans with no non-interacting families: 0

Average number of families per clan: 1053.83
Number of families with no non-interacting clans: 0


In [None]:
# Create copies of the original dictionaries
filtered_clans_per_rna_family = non_interacting_clans_per_rna_family.copy()
filtered_families_per_protein_clan = non_interacting_families_per_protein_clan.copy()

# Filter non-interacting clans per RNA family using interacting categories
for family, clans in filtered_clans_per_rna_family.items():
    interacting_categories = interacting_protein_categories_per_family[family]
    filtered_clans_per_rna_family[family] = [
        clan for clan in clans if clan_categories[clan] not in interacting_categories
    ]

# Filter non-interacting families per protein clan using interacting categories
for clan, families in filtered_families_per_protein_clan.items():
    interacting_categories = interacting_rna_categories_per_clan[clan]
    filtered_families_per_protein_clan[clan] = [
        family for family in families if family_categories[family] not in interacting_categories
    ]

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in filtered_clans_per_rna_family.values()])/len(filtered_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in filtered_families_per_protein_clan.items() if len(families) == 0]):,}\n")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in filtered_families_per_protein_clan.values()])/len(filtered_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in filtered_clans_per_rna_family.items() if len(clans) == 0]):,}")

Average number of clans per family: 74.45
Number of clans with no non-interacting families: 2

Average number of families per clan: 368.82
Number of families with no non-interacting clans: 214


In [None]:
# Revert clans with no non-interacting families to the original non-interacting set
for clan in filtered_families_per_protein_clan:
    if not filtered_families_per_protein_clan[clan]:
        filtered_families_per_protein_clan[clan] = non_interacting_families_per_protein_clan[clan]

# Revert families with no non-interacting clans to the original non-interacting set
for family in filtered_clans_per_rna_family:
    if not filtered_clans_per_rna_family[family]:
        filtered_clans_per_rna_family[family] = non_interacting_clans_per_rna_family[family]

# Average of clans per family 
avg_clans_per_family = sum([len(clans) for clans in filtered_clans_per_rna_family.values()])/len(filtered_clans_per_rna_family)
print(f"Average number of clans per family: {avg_clans_per_family:.2f}")

# Number of clans with no non-interacting families
print(f"Number of clans with no non-interacting families: {len([clan for clan, families in filtered_families_per_protein_clan.items() if len(families) == 0]):,}")

# Average of families per clan
avg_families_per_clan = sum([len(families) for families in filtered_families_per_protein_clan.values()])/len(filtered_families_per_protein_clan)
print(f"Average number of families per clan: {avg_families_per_clan:.2f}\n")

# Number of families with no non-interacting clans
print(f"Number of families with no non-interacting clans: {len([family for family, clans in filtered_clans_per_rna_family.items() if len(clans) == 0]):,}")

Average number of clans per family: 99.09
Number of clans with no non-interacting families: 0
Average number of families per clan: 374.38

Number of families with no non-interacting clans: 0


In [None]:
def randomly_select_protein_from_clan(clan, df):
    proteins_in_clan = df[df['Sequence_2_clan'] == clan]
    if not proteins_in_clan.empty:
        selected_protein = random.choice(proteins_in_clan.to_dict(orient='records'))
        return {
            'Interactor2.Symbol': selected_protein['Interactor2.Symbol'],
            'Category2': selected_protein['Category2'],
            'Species2': selected_protein['Species2'],
            'Sequence_2': selected_protein['Sequence_2'],
            'Sequence_2_ID': selected_protein['Sequence_2_ID'],
            'Raw_ID2': selected_protein['Raw_ID2'],
            'Sequence_2_clan': selected_protein['Sequence_2_clan'],
            'Sequence_2_len': selected_protein['Sequence_2_len'],
            'Sequence_2_emb_ID': selected_protein['Sequence_2_emb_ID']
        }
    else:
        return {}
    
def randomly_select_rna_from_family(family, df):
    rnas_in_family = df[df['Sequence_1_family'] == family]
    if not rnas_in_family.empty:
        selected_rna = random.choice(rnas_in_family.to_dict(orient='records'))
        return {
            'Interactor1.Symbol': selected_rna['Interactor1.Symbol'],
            'Category1': selected_rna['Category1'],
            'Species1': selected_rna['Species1'],
            'Sequence_1': selected_rna['Sequence_1'],
            'Sequence_1_ID': selected_rna['Sequence_1_ID'],
            'Raw_ID1': selected_rna['Raw_ID1'],
            'Sequence_1_family': selected_rna['Sequence_1_family'],
            'Sequence_1_len': selected_rna['Sequence_1_len'],
            'Sequence_1_emb_ID': selected_rna['Sequence_1_emb_ID']
        }
    else:
        return {}

In [None]:
# Assuming limit_rpi_df is your original DataFrame
negative_interactions = []

for _, row in tqdm(limit_rpi_df.iterrows(), total=limit_rpi_df.shape[0], desc="Generating Negative Interactions"):
    # Negative interaction based on RNA
    rna_family = row['Sequence_1_family']
    non_interacting_clans = filtered_clans_per_rna_family.get(rna_family, [])
    if non_interacting_clans:
        chosen_clan = random.choice(non_interacting_clans)
        # Assuming you have a function to randomly select a protein from a clan
        chosen_protein = randomly_select_protein_from_clan(chosen_clan, limit_rpi_df)
        negative_entry = row.copy()
        negative_entry.update(chosen_protein)  # Update the entry with the chosen protein's information
        negative_entry['interaction'] = False
        negative_entry['RNAInterID'] = f"N_{row['RNAInterID']}"
        negative_interactions.append(negative_entry)

    # Negative interaction based on Protein
    protein_clan = row['Sequence_2_clan']
    non_interacting_families = filtered_families_per_protein_clan.get(protein_clan, [])
    if non_interacting_families:
        chosen_family = random.choice(non_interacting_families)
        # Assuming you have a function to randomly select an RNA from a family
        chosen_rna = randomly_select_rna_from_family(chosen_family, limit_rpi_df)
        negative_entry = row.copy()
        negative_entry.update(chosen_rna)  # Update the entry with the chosen RNA's information
        negative_entry['interaction'] = False
        negative_entry['RNAInterID'] = f"N_{row['RNAInterID']}"
        negative_interactions.append(negative_entry)

# Convert the list of negative interactions to a DataFrame
negative_interactions_df = pd.DataFrame(negative_interactions)

# Concatenate the original DataFrame with the negative interactions DataFrame
all_interactions_df = pd.concat([limit_rpi_df, negative_interactions_df], ignore_index=True)
all_interactions_df.to_parquet(os.path.join(INTER_DIR, 'all_interactions.parquet'), engine='pyarrow')

Generating Negative Interactions: 100%|██████████| 40744/40744 [06:48<00:00, 99.71it/s] 


In [None]:
# Quick analysis of the dataset
print(f"Number of negative interactions: {all_interactions_df[all_interactions_df['interaction'] == False].shape[0]:,}")
print(f"Number of positive interactions: {all_interactions_df[all_interactions_df['interaction'] == True].shape[0]:,}")

assert all_interactions_df.shape[0] == limit_rpi_df.shape[0] + negative_interactions_df.shape[0]

Number of negative interactions: 81,488
Number of positive interactions: 40,744


## Step 4: Train/Test split

In [None]:
def select_protein_clans(all_interactions_df, fraction):
    # Calculate the number of unique RNA families each protein clan interacts with
    clan_variability = all_interactions_df.groupby('Sequence_2_clan')['Sequence_1_family'].nunique()

    # Sort clans by variability (ascending)
    sorted_clans = clan_variability.sort_values().index.tolist()

    # Select the top fraction of clans
    num_clans_to_select = int(len(sorted_clans) * fraction)
    selected_clans = sorted_clans[:num_clans_to_select]
    print(f"Number of protein clans selected: {len(selected_clans)}")
    return selected_clans

def create_train_test_sets(interactions_file, output_dir, test_set_clan_fraction=0.10):
    # Read the dataframe
    all_interactions_df = pd.read_parquet(interactions_file, engine='pyarrow')

    # Select protein clans for test set
    selected_clans = select_protein_clans(all_interactions_df, fraction=test_set_clan_fraction)

    # Split the Data
    test_df = all_interactions_df[all_interactions_df['Sequence_2_clan'].isin(selected_clans)]
    train_df = all_interactions_df[~all_interactions_df['Sequence_2_clan'].isin(selected_clans)]

    # Assertions to verify the splits
    assert len(test_df) + len(train_df) == len(all_interactions_df)

    # Count of unique RNA families in each set
    unique_families_train = train_df['Sequence_1_family'].nunique()
    unique_families_test = test_df['Sequence_1_family'].nunique()
    
    # Count of unique protein clans in each set
    unique_clans_train = train_df['Sequence_2_clan'].nunique()
    unique_clans_test = test_df['Sequence_2_clan'].nunique()
    # assert unique_clans_train + unique_clans_test == len(unique_clans)

    # Verify the splits
    print(f"Total data points: {len(all_interactions_df)}")
    print(f"Training set size: {len(train_df)} -- {round(train_df.shape[0] / all_interactions_df.shape[0] * 100, 2)}% \
          -- {unique_families_train} unique RNA families \
          and {unique_clans_train} unique protein clans")
    print(f"Test set size: {len(test_df)} -- {round(test_df.shape[0] / all_interactions_df.shape[0] * 100, 2)}% \
          -- {unique_families_test} unique RNA families \
          and {unique_clans_test} unique protein clans")

    # Save the Data
    train_df.to_parquet(os.path.join(output_dir, 'train_set.parquet'), engine='pyarrow')
    test_df.to_parquet(os.path.join(output_dir, 'test_set.parquet'), engine='pyarrow')

    return train_df, test_df


In [None]:
all_interactions_path = os.path.join(INTER_DIR, 'all_interactions.parquet')

# Create train and test sets
train_df, test_df = create_train_test_sets(all_interactions_path, INTER_DIR, 0.3)


Number of protein clans selected: 45
Total data points: 122232
Training set size: 109075 -- 89.24%           -- 1148 unique RNA families           and 107 unique protein clans
Test set size: 13157 -- 10.76%           -- 999 unique RNA families           and 45 unique protein clans


In [None]:
train_set = pd.read_parquet(os.path.join(INTER_DIR, 'train_set.parquet'), engine='pyarrow')
test_set = pd.read_parquet(os.path.join(INTER_DIR, 'test_set.parquet'), engine='pyarrow')

# check how many positive and negative samples in each set
print(f"Number of negative interactions in the train set: {train_set[train_set['interaction'] == False].shape[0]:,}")
print(f"Number of positive interactions in the train set: {train_set[train_set['interaction'] == True].shape[0]:,}\n")
print(f"Number of negative interactions in the test set: {test_set[test_set['interaction'] == False].shape[0]:,}")
print(f"Number of positive interactions in the test set: {test_set[test_set['interaction'] == True].shape[0]:,}")

Number of negative interactions in the train set: 68,973
Number of positive interactions in the train set: 40,102

Number of negative interactions in the test set: 12,515
Number of positive interactions in the test set: 642
