In [None]:
import pandas as pd
import utils
import numpy as np
from random import choice
from ushuffle import shuffle
from annotate.get_RNA_family import rfam_scan_single_sequence
from annotate.cluster_sequences import get_new_sequences, get_new_sequences_protein
from tqdm import tqdm
import pathlib
import os

# Dataset Creation
This notebook creates the final dataset with the following splits: Training Set, Test Set, and Random Test Set.

In [None]:
# Directory that holds every intermediate and final parquet file written by this notebook.
# (Plain string: the previous f-string had no placeholders.)
DATASET_PATH = "../results/"
# Create the directory up front so later `to_parquet` calls do not fail on a fresh checkout.
pathlib.Path(DATASET_PATH).mkdir(parents=True, exist_ok=True)

## Step 1: Join Sequences into DB

In [None]:
# Load the raw RNAInter interaction table.
rna_inter_df = utils.load_rna_inter_csv(path="../Download_data_RP.txt")

# Every pre-computed sequence table is read relative to DATASET_PATH only (no extra
# '../results/' prefix), so the notebook keeps working if DATASET_PATH is changed.
rna_sequences_families = pd.read_parquet(os.path.join(DATASET_PATH, 'rna_sequences_short_families.parquet'), engine='pyarrow')
rna_sequences_clusters = pd.read_parquet(os.path.join(DATASET_PATH, 'rna_sequences_clusters.parquet'), engine='pyarrow')
# Both RNA tables must describe the same sequences, one row each.
assert rna_sequences_families.shape[0] == rna_sequences_clusters.shape[0]
rna_sequences = rna_sequences_clusters.merge(
    rna_sequences_families,
    on=['Raw_ID1', 'Sequence_1_ID', 'Sequence_1', 'Sequence_1_len', 'Sequence_1_shuffle'],
    how='inner',
)
# The inner join must neither drop nor duplicate any RNA sequence.
assert rna_sequences.shape[0] == rna_sequences_clusters.shape[0]

protein_sequences = pd.read_parquet(os.path.join(DATASET_PATH, 'protein_sequences_clusters.parquet'), engine='pyarrow')

# Attach RNA (Raw_ID1) and protein (Raw_ID2) sequence information to every interaction.
# Interactions without a known sequence on either side are dropped by the inner joins.
all_interactions = (
    rna_inter_df
    .merge(rna_sequences, on='Raw_ID1', how='inner')
    .merge(protein_sequences, on='Raw_ID2', how='inner')
)

## Step 2: Removing uncertainty

In [None]:
# A single referenced ID in RNAInter can map to several sequences, which multiplies
# the rows of one interaction after the joins above. Only interactions that appear
# exactly once (one RNA sequence and one protein sequence) are kept.
rows_per_interaction = all_interactions.groupby('RNAInterID')['Category1'].count()
very_unique = rows_per_interaction.loc[rows_per_interaction == 1].reset_index()[['RNAInterID']]
# Alternatively, a random combination of the available sequences could be taken, e.g.
# all_interactions.sample(frac=1) #  shuffles the dataframe randomly
# all_interactions = all_interactions.drop_duplicates(subset=['RNAInterID']) #  keeps only the first appearance

all_interactions = all_interactions.merge(very_unique, on='RNAInterID', how='inner')
# Interaction pairs whose RNA family is not known are removed as well.
has_known_family = all_interactions['Sequence_1_family'].ne('unknown')
all_interactions = all_interactions.loc[has_known_family]

In [None]:
# Exclusive upper bound on the number of interactions a single RNA or protein may have.
MAX_INTERACTIONS_PER_PARTNER = 150

limited_interactions = (
    all_interactions
    # Keep only RNAs with fewer protein interactions than the limit ...
    .groupby(by=['Raw_ID1'])
    .filter(lambda rna_group: len(rna_group) < MAX_INTERACTIONS_PER_PARTNER)
    # ... and afterwards only proteins with fewer RNA interactions than the limit.
    .groupby(by=['Raw_ID2'])
    .filter(lambda protein_group: len(protein_group) < MAX_INTERACTIONS_PER_PARTNER)
)
limited_interactions.to_parquet(os.path.join(DATASET_PATH, 'limited_interactions.parquet'), engine='pyarrow')

## Step 3: Creating Test Set based on RNA-family

In [None]:
limited_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'limited_interactions.parquet'), engine='pyarrow')
# The test set should hold at least 5 % of all interactions.
test_set_len = round(limited_interactions.shape[0] * 0.05)

# Whole RNA families are moved from the training set into the test set, so no RNA
# family ends up in both splits.
# NOTE: proteins are NOT separated; the same protein may occur in train and test.
train_set_interactions = limited_interactions
rna_families = list(train_set_interactions['Sequence_1_family'].unique())

test_set_parts = []
test_set_size = 0
while test_set_size < test_set_len:
    if not rna_families:
        # Previously this situation ended in an endless loop (only oversized families
        # left) or an IndexError from `choice([])`.
        raise RuntimeError(
            f"Ran out of RNA families before the test set reached {test_set_len} interactions "
            f"(collected {test_set_size})."
        )
    # Pick a random family and never look at it again, whether it is used or not.
    rna_family = choice(rna_families)
    rna_families.remove(rna_family)
    family_mask = train_set_interactions['Sequence_1_family'] == rna_family
    family_size = int(family_mask.sum())
    # A single family that is larger than the whole test-set budget is skipped.
    if family_size > test_set_len:
        continue
    test_set_parts.append(train_set_interactions[family_mask])
    train_set_interactions = train_set_interactions[~family_mask]
    test_set_size += family_size

# Concatenate once instead of growing a DataFrame inside the loop.
test_set_interactions = pd.concat(test_set_parts) if test_set_parts else limited_interactions.iloc[0:0]

# Train and test must partition the limited interactions exactly.
assert train_set_interactions.shape[0] + test_set_interactions.shape[0] == limited_interactions.shape[0]
print(f"Size of train-set: {train_set_interactions.shape[0]} -- {round(train_set_interactions.shape[0] / limited_interactions.shape[0] * 100, 2)} %")
print(f"Size of test-set: {test_set_interactions.shape[0]} -- {round(test_set_interactions.shape[0] / limited_interactions.shape[0] * 100, 2)} %")
train_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'org_train_set.parquet'), engine='pyarrow')
test_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'org_test_set.parquet'), engine='pyarrow')

In [None]:
# This function SELECTS for each interaction a new RNA-interaction partner from the entire dataset for a given protein-interaction partner
def increase_set_1(limited_interactions: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    """Create one negative pair per interaction by swapping in a random RNA partner.

    For every row of ``dataset`` the protein side of the row is kept and an RNA side
    is drawn at random from ``limited_interactions``. The drawn RNA must come from an
    interaction with a different protein (``Raw_ID2``) and must differ from the row's
    RNA in family, cluster and category.

    Args:
        limited_interactions: Pool of interactions from which the RNA side is drawn.
        dataset: Interactions for which negative pairs are generated.

    Returns:
        A DataFrame with one row per row of ``dataset``. Each row has ``score`` 0,
        ``Sequence_1_shuffle`` True, NaN in ``strong``/``weak``/``predict`` and the
        original ``RNAInterID`` suffixed with ``.RV1``. An empty ``dataset`` yields an
        empty frame with the columns of ``dataset``.

    Raises:
        AssertionError: If no candidate RNA exists for a row, or if a generated pair
            also occurs in ``dataset``.
    """
    if dataset.shape[0] == 0:
        # Nothing to augment; the merge-based sanity check below needs key columns.
        return dataset.iloc[0:0].copy()

    # Columns describing the RNA side; dropped from the original row.
    rna_keys = {
        'Raw_ID1', 'Interactor1.Symbol', 'Category1', 'Species1', 'Sequence_1', 'Sequence_1_len',
        'Sequence_1_ID', 'Sequence_1_shuffle', 'Sequence_1_cluster',
        'Sequence_1_cluster_sim', 'Sequence_1_cluster_reference',
        'Sequence_1_rfam_q_accession', 'Sequence_1_family',
        'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description',
        'Sequence_1_rfam_e_value', 'Id',
    }
    # Columns describing the protein side and the labels; dropped from the drawn row.
    protein_keys = {
        'Raw_ID2', 'Interactor2.Symbol', 'Category2', 'Species2', 'Sequence_2_ID', 'Sequence_2',
        'Sequence_2_len', 'Sequence_2_shuffle', 'Sequence_2_cluster', 'Sequence_2_cluster_sim',
        'Sequence_2_cluster_reference',
        'score', 'strong', 'weak', 'predict', 'RNAInterID', 'Id',
    }

    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # Candidate RNAs: different protein partner, RNA family, RNA cluster and RNA type.
        candidates = limited_interactions[
            (limited_interactions['Raw_ID2'] != row['Raw_ID2'])
            & (limited_interactions['Sequence_1_family'] != row['Sequence_1_family'])
            & (limited_interactions['Sequence_1_cluster'] != row['Sequence_1_cluster'])
            & (limited_interactions['Category1'] != row['Category1'])
        ]
        assert candidates.shape[0] != 0
        # Redraw until the (protein, RNA) combination is not an interaction of `dataset`.
        while True:
            random_row = candidates.sample().to_dict(orient='records')[0]
            already_known = (
                (dataset['Raw_ID2'] == row['Raw_ID2'])
                & (dataset['Sequence_2_ID'] == row['Sequence_2_ID'])
                & (dataset['Raw_ID1'] == random_row['Raw_ID1'])
                & (dataset['Sequence_1_ID'] == random_row['Sequence_1_ID'])
            ).any()
            if not already_known:
                break

        # Protein side (and interaction ID) from the original row, RNA side from the drawn row.
        protein_part = {key: value for key, value in row.to_dict().items() if key not in rna_keys}
        rna_part = {key: value for key, value in random_row.items() if key not in protein_keys}
        new_row = {**protein_part, **rna_part, 'score': 0, 'Sequence_1_shuffle': True,
                   'strong': float("nan"), 'weak': float("nan"), 'predict': float("nan")}
        new_row['RNAInterID'] = new_row['RNAInterID'] + '.RV1'
        new_rows.append(new_row)

    # Build the frame once; concatenating row by row is quadratic in the number of rows.
    increased_set = pd.DataFrame(new_rows)

    assert dataset.shape[0] == increased_set.shape[0]
    assert dataset.merge(increased_set, on=['Raw_ID1', 'Sequence_1_ID', 'Raw_ID2', 'Sequence_2_ID'], how='inner').shape[0] == 0
    return increased_set

# This function SELECTS for each interaction a new protein-interaction partner from the entire dataset for a given rna-interaction partner
def increase_set_2(limited_interactions: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    """Create one negative pair per interaction by swapping in a random protein partner.

    For every row of ``dataset`` the RNA side of the row is kept and a protein side is
    drawn at random from ``limited_interactions``. The drawn protein must come from an
    interaction with a different RNA (``Raw_ID1``) and must differ from the row's
    protein in cluster and category.

    Args:
        limited_interactions: Pool of interactions from which the protein side is drawn.
        dataset: Interactions for which negative pairs are generated.

    Returns:
        A DataFrame with one row per row of ``dataset``. Each row has ``score`` 0,
        ``Sequence_2_shuffle`` True, NaN in ``strong``/``weak``/``predict`` and the
        original ``RNAInterID`` suffixed with ``.PV1``. An empty ``dataset`` yields an
        empty frame with the columns of ``dataset``.

    Raises:
        AssertionError: If no candidate protein exists for a row, or if a generated
            pair also occurs in ``dataset``.
    """
    if dataset.shape[0] == 0:
        # Nothing to augment; the merge-based sanity check below needs key columns.
        return dataset.iloc[0:0].copy()

    # Columns describing the protein side; dropped from the original row.
    protein_keys = {
        'Raw_ID2', 'Interactor2.Symbol', 'Category2', 'Species2', 'Sequence_2', 'Sequence_2_len',
        'Sequence_2_ID', 'Sequence_2_shuffle', 'Sequence_2_cluster',
        'Sequence_2_cluster_sim', 'Sequence_2_cluster_reference',
        'Id',
    }
    # Columns describing the RNA side and the labels; dropped from the drawn row.
    rna_keys = {
        'Raw_ID1', 'Interactor1.Symbol', 'Category1', 'Species1', 'Sequence_1', 'Sequence_1_len',
        'Sequence_1_ID', 'Sequence_1_shuffle', 'Sequence_1_cluster',
        'Sequence_1_cluster_sim', 'Sequence_1_cluster_reference',
        'Sequence_1_rfam_q_accession', 'Sequence_1_family',
        'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description',
        'Sequence_1_rfam_e_value',
        'score', 'strong', 'weak', 'predict', 'RNAInterID', 'Id',
    }

    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        protein_cluster = row['Sequence_2_cluster']
        protein_category = row['Category2']
        # Candidate proteins: different RNA partner, protein cluster and protein type.
        # The type filter previously compared Category2 against the cluster id, which
        # made it a no-op; it now compares against the row's protein category.
        candidates = limited_interactions[
            (limited_interactions['Raw_ID1'] != row['Raw_ID1'])
            & (limited_interactions['Sequence_2_cluster'] != protein_cluster)
            & (limited_interactions['Category2'] != protein_category)
        ]
        assert candidates.shape[0] != 0
        # Redraw until the (RNA, protein) combination is not an interaction of `dataset`.
        while True:
            random_row = candidates.sample().to_dict(orient='records')[0]
            already_known = (
                (dataset['Raw_ID1'] == row['Raw_ID1'])
                & (dataset['Sequence_1_ID'] == row['Sequence_1_ID'])
                & (dataset['Raw_ID2'] == random_row['Raw_ID2'])
                & (dataset['Sequence_2_ID'] == random_row['Sequence_2_ID'])
            ).any()
            if not already_known:
                break

        # RNA side (and interaction ID) from the original row, protein side from the drawn row.
        rna_part = {key: value for key, value in row.to_dict().items() if key not in protein_keys}
        protein_part = {key: value for key, value in random_row.items() if key not in rna_keys}
        new_row = {**rna_part, **protein_part}
        new_row['score'] = 0
        new_row['Sequence_2_shuffle'] = True
        new_row['strong'] = float("nan")
        new_row['weak'] = float("nan")
        new_row['predict'] = float("nan")
        new_row['RNAInterID'] = new_row['RNAInterID'] + '.PV1'
        new_rows.append(new_row)

    # Build the frame once; concatenating row by row is quadratic in the number of rows.
    increased_set = pd.DataFrame(new_rows)

    assert dataset.shape[0] == increased_set.shape[0]
    assert dataset.merge(increased_set, on=['Raw_ID1', 'Sequence_1_ID', 'Raw_ID2', 'Sequence_2_ID'], how='inner').shape[0] == 0
    return increased_set

# Shuffle rna-sequence per protein-interaction
def increase_set_3(limited_interactions: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    """Create one negative pair per interaction by shuffling the RNA sequence.

    For every row of ``dataset`` the RNA sequence (``Sequence_1``) is shuffled with
    ``ushuffle`` (k = 2) until both ``rfam_scan_single_sequence`` and
    ``get_new_sequences`` return a falsy value for the shuffled row.
    NOTE(review): presumably a truthy result means the shuffled sequence still matches
    the original RNA family / cluster -- confirm against the ``annotate`` helpers.

    Args:
        limited_interactions: Not used by this function; kept so that all
            ``increase_set_*`` functions share one signature.
        dataset: Interactions for which negative pairs are generated.

    Returns:
        A DataFrame with one row per row of ``dataset`` holding the shuffled
        ``Sequence_1``, ``score`` 0, ``Sequence_1_shuffle`` True, NaN in
        ``strong``/``weak``/``predict`` and ``RNAInterID`` suffixed with ``.RV2``.
    """
    rna_clusters_all = pd.read_parquet('../results/rna_sequences_clusters.parquet', engine='pyarrow')
    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # Work on an explicit copy so `dataset` can never be modified through `row`.
        new_row = row.copy()
        # Always shuffle the original sequence, not a previously rejected shuffle.
        sequence = row['Sequence_1']
        while True:
            new_row['Sequence_1'] = shuffle(sequence.encode('ASCII'), 2).decode('ASCII')
            # Reject the shuffle if the Rfam scan reports a hit for the original family.
            if rfam_scan_single_sequence(new_row, new_row['Sequence_1_family']):
                continue
            # Reject the shuffle if the cluster check against the original cluster reports a hit.
            same_cluster = rna_clusters_all[rna_clusters_all['Sequence_1_cluster'] == new_row['Sequence_1_cluster']]
            if get_new_sequences(same_cluster, new_row):
                continue
            break
        new_row['RNAInterID'] = new_row['RNAInterID'] + ".RV2"
        new_row['Sequence_1_shuffle'] = True
        new_row['score'] = 0
        new_row['strong'] = float("nan")
        new_row['weak'] = float("nan")
        new_row['predict'] = float("nan")
        new_rows.append(new_row)
    # Build the frame once; concatenating row by row is quadratic in the number of rows.
    return pd.DataFrame(new_rows).reset_index(drop=True)

# Shuffle protein-sequence per rna-interaction
def increase_set_4(limited_interactions: pd.DataFrame, dataset: pd.DataFrame) -> pd.DataFrame:
    """Create one negative pair per interaction by shuffling the protein sequence.

    For every row of ``dataset`` the protein sequence (``Sequence_2``) is shuffled with
    ``ushuffle`` (k = 2) until ``get_new_sequences_protein`` returns a falsy value for
    the shuffled row.
    NOTE(review): presumably a truthy result means the shuffled sequence still falls
    into the original protein cluster -- confirm against the ``annotate`` helper.

    Args:
        limited_interactions: Not used by this function; kept so that all
            ``increase_set_*`` functions share one signature.
        dataset: Interactions for which negative pairs are generated.

    Returns:
        A DataFrame with one row per row of ``dataset`` holding the shuffled
        ``Sequence_2``, ``score`` 0, ``Sequence_2_shuffle`` True, NaN in
        ``strong``/``weak``/``predict`` and ``RNAInterID`` suffixed with ``.PV2``.
    """
    protein_clusters_all = pd.read_parquet('../results/protein_sequences_clusters.parquet', engine='pyarrow')
    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # Work on an explicit copy so `dataset` can never be modified through `row`.
        new_row = row.copy()
        # Always shuffle the original sequence, not a previously rejected shuffle.
        sequence = row['Sequence_2']
        while True:
            new_row['Sequence_2'] = shuffle(sequence.encode('ASCII'), 2).decode('ASCII')
            # Reject the shuffle if the cluster check against the original cluster reports a hit.
            same_cluster = protein_clusters_all[protein_clusters_all['Sequence_2_cluster'] == new_row['Sequence_2_cluster']]
            if get_new_sequences_protein(same_cluster, new_row):
                print(".")
                continue
            break
        new_row['RNAInterID'] = new_row['RNAInterID'] + ".PV2"
        new_row['Sequence_2_shuffle'] = True
        new_row['score'] = 0
        new_row['strong'] = float("nan")
        new_row['weak'] = float("nan")
        new_row['predict'] = float("nan")
        new_rows.append(new_row)
    # Build the frame once; concatenating row by row is quadratic in the number of rows.
    return pd.DataFrame(new_rows).reset_index(drop=True)

## Step 4: Create negative interactions for the test set

In [None]:
# Positive interactions of the family-based test split and the pool used to draw negatives.
test_set_org = pd.read_parquet(os.path.join(DATASET_PATH, 'org_test_set.parquet'), engine='pyarrow')
limited_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'limited_interactions.parquet'), engine='pyarrow')
print("Starting to build test set")
test_org_size = test_set_org.shape[0]

# One negative with a swapped RNA partner per positive.
test_set_1 = increase_set_1(limited_interactions, test_set_org)
assert test_set_1.shape[0] == test_org_size
# One negative with a swapped protein partner per positive.
test_set_2 = increase_set_2(limited_interactions, test_set_org)
assert test_set_2.shape[0] == test_org_size

# Final test set: positives followed by both kinds of negatives.
test_set_parts = [test_set_org, test_set_1, test_set_2]
test_set_interactions = pd.concat(test_set_parts)
assert test_set_interactions.shape[0] == test_org_size * 3
test_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'test_set_interactions.parquet'), engine='pyarrow')
print("test set done...")

## Step 5: Create negative interactions for the training set

In [None]:
# Load the positive training interactions from disk. They were previously read into
# `test_set_interactions`, so this cell only worked with a leftover
# `train_set_interactions` from an earlier cell and clobbered the test set variable.
train_set_org = pd.read_parquet(os.path.join(DATASET_PATH, 'org_train_set.parquet'), engine='pyarrow')
limited_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'limited_interactions.parquet'), engine='pyarrow')
print("Starting to build train set")
train_org_size = train_set_org.shape[0]

# One negative with a swapped RNA partner per positive.
train_set_1 = increase_set_1(limited_interactions, train_set_org)
assert train_set_1.shape[0] == train_set_org.shape[0]
# One negative with a swapped protein partner per positive.
train_set_2 = increase_set_2(limited_interactions, train_set_org)
assert train_set_2.shape[0] == train_set_org.shape[0]

# ignore_index=True gives every row a unique label; the three parts would otherwise
# share index labels, which breaks label-based `drop` calls on the stored frame.
train_set_interactions = pd.concat([train_set_org,
                                    train_set_1,
                                    train_set_2,
                                   ], ignore_index=True)
assert train_set_interactions.shape[0] == train_org_size * 3
# Persist the augmented training set (positives + negatives), not the raw positives.
train_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'train_set_interactions.parquet'), engine='pyarrow')
print("train set done...")

## Step 6: Create Random Test Set based on the Training Set

In [None]:
# Read the augmented training set. The index is reset so that every row has a unique
# label: `drop` below removes rows by label and would otherwise also remove every
# unsampled row that shares a label with a sampled one.
train_set_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'train_set_interactions.parquet')).reset_index(drop=True)
full_train_size = len(train_set_interactions)
print(f"Size of train: {full_train_size}")
# Hold out a random 10 % of the training set as a second ("random") test set.
test_set_random_interactions = train_set_interactions.sample(frac=0.1)
train_set_interactions = train_set_interactions.drop(test_set_random_interactions.index)
# The two parts must partition the training set exactly.
assert len(train_set_interactions) + len(test_set_random_interactions) == full_train_size
print(f"Size of train: {len(train_set_interactions)}")
print(f"Size of test-random: {len(test_set_random_interactions)}")
# NOTE: this overwrites the input file, so re-running this cell alone splits off
# another 10 %. Re-run Step 5 first to start from the full training set again.
train_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'train_set_interactions.parquet'))
test_set_random_interactions.to_parquet(os.path.join(DATASET_PATH, 'test_set_radnom_interactions.parquet'))

## Step 7: Export all unique RNAs and proteins from the final datasets

In [None]:
def _attach_unique_id(frames, sequence_col, id_col):
    """Add ``id_col`` to every frame: one integer per distinct ``sequence_col`` value.

    The IDs are computed over the concatenation of all ``frames`` (via
    ``groupby(...).ngroup()``), so the same sequence receives the same ID in every
    frame. Returns the merged frames in the order they were passed in.

    Raises:
        AssertionError: If attaching the ID changes the number of rows of a frame.
    """
    all_df = pd.concat(frames)
    all_df[id_col] = all_df.groupby([sequence_col]).ngroup()
    # One lookup row per distinct sequence.
    lookup = all_df.drop_duplicates(subset=[id_col])[[id_col, sequence_col]]
    tagged_frames = []
    for frame in frames:
        tagged = frame.merge(lookup, on=sequence_col, how='inner')
        assert tagged.shape[0] == frame.shape[0]
        tagged_frames.append(tagged)
    return tagged_frames


# Adding unique IDs to all proteins and RNAs
train_set_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'train_set_interactions.parquet'), engine='pyarrow')
test_set_random_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'test_set_radnom_interactions.parquet'), engine='pyarrow')
test_set_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'test_set_interactions.parquet'), engine='pyarrow')

# Add a unique ID for each distinct RNA sequence.
train_set_interactions, test_set_random_interactions, test_set_interactions = _attach_unique_id(
    [train_set_interactions, test_set_random_interactions, test_set_interactions],
    sequence_col='Sequence_1',
    id_col='Sequence_1_ID_Unique',
)

# Add a unique ID for each distinct protein sequence.
train_set_interactions, test_set_random_interactions, test_set_interactions = _attach_unique_id(
    [train_set_interactions, test_set_random_interactions, test_set_interactions],
    sequence_col='Sequence_2',
    id_col='Sequence_2_ID_Unique',
)

train_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'final_train_set.parquet'), engine='pyarrow')
test_set_random_interactions.to_parquet(os.path.join(DATASET_PATH, 'final_test_set_random.parquet'), engine='pyarrow')
test_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'final_test_set.parquet'), engine='pyarrow')

In [None]:
# Reload the three final splits from disk.
train_set_interactions, test_set_random_interactions, test_set_interactions = (
    pd.read_parquet(os.path.join(DATASET_PATH, file_name), engine='pyarrow')
    for file_name in ('final_train_set.parquet', 'final_test_set_random.parquet', 'final_test_set.parquet')
)
# Collect every distinct (ID, sequence) pair across all splits.
all_df = pd.concat([train_set_interactions, test_set_random_interactions, test_set_interactions])
protein_columns = ['Sequence_2_ID_Unique', 'Sequence_2']
rna_columns = ['Sequence_1_ID_Unique', 'Sequence_1']
unique_proteins = all_df[protein_columns].drop_duplicates()
unique_RNAs = all_df[rna_columns].drop_duplicates()
# Export one table of unique proteins and one of unique RNAs.
unique_proteins.to_parquet(os.path.join(DATASET_PATH, 'unique_proteins.parquet'), engine='pyarrow')
unique_RNAs.to_parquet(os.path.join(DATASET_PATH, 'unique_RNAs.parquet'), engine='pyarrow')

## Optional: Create a training set with random labels
Can be used for an ablation experiment to see whether the model is still able to learn when it is presented with random training data.

In [None]:
# Build a copy of the final training set whose shuffle flags are assigned at random.
train_set_interactions = pd.read_parquet(os.path.join(DATASET_PATH, 'final_train_set.parquet'), engine='pyarrow')
# Clear both shuffle flags first ...
for flag_column in ('Sequence_1_shuffle', 'Sequence_2_shuffle'):
    train_set_interactions[flag_column] = 0
flagged_rows = train_set_interactions[
    (train_set_interactions['Sequence_1_shuffle'] == 1) | (train_set_interactions['Sequence_2_shuffle'] == 1)
]
assert len(flagged_rows) == 0
# ... then flag each row's RNA as shuffled with probability 2/3, independent of the data.
train_set_interactions['Sequence_1_shuffle'] = np.random.choice(
    [0, 1], size=train_set_interactions.shape[0], p=[1/3, 2/3]
)
# Report the fraction of rows that carry a shuffle flag after the random assignment.
flagged_rows = train_set_interactions[
    (train_set_interactions['Sequence_1_shuffle'] == 1) | (train_set_interactions['Sequence_2_shuffle'] == 1)
]
print(len(flagged_rows) / len(train_set_interactions))
train_set_interactions.to_parquet(os.path.join(DATASET_PATH, 'random_train_set.parquet'))