# RPI2825 dataset preparation
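This notebook prepares the independent RPI2825 dataset: it filters the interactions by sequence length, attaches RNA family and cluster annotations, and extends the set with shuffled interaction partners.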

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
rpi_db = pd.read_csv('RPI2825.csv')
print(f"Original RPI2825 has {len(rpi_db)} entries/interactions")
print(f"Original RPI2825 has {len(list(rpi_db['protein_seq'].unique()))} unique protein sequences")
print(f"Original RPI2825 has {len(list(rpi_db['rna_seq'].unique()))} unique RNA sequences")
# filter protein sequences longer 1024
rpi_db['protein_seq_len'] = rpi_db['protein_seq'].apply(len)
rpi_db = rpi_db[rpi_db['protein_seq_len'] <= 1024]
# filter rna sequences longer 150
rpi_db['rna_seq_len'] = rpi_db['rna_seq'].apply(len)
rpi_db = rpi_db[rpi_db['rna_seq_len'] <= 150]
print(f"Filtered RPI2825 has {len(rpi_db)} entries/interactions")

In [None]:
# Report how many distinct protein / RNA sequences survive the length filter.
n_unique_proteins = rpi_db['protein_seq'].unique().size
n_unique_rnas = rpi_db['rna_seq'].unique().size
print(f"Filtered RPI2825 has {n_unique_proteins} unique protein sequences")
print(f"Filtered RPI2825 has {n_unique_rnas} unique RNA sequences")

In [None]:
# Persist the length-filtered interactions in the notebook's working directory.
filtered_parquet_path = 'rpi2825.parquet'
rpi_db.to_parquet(filtered_parquet_path)

In [None]:
# Load the reduced train and test sets that RPI2825 is compared against in the next cell.
rna_inter_train, rna_inter_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../results/dataset_v4/final_train_set_reduced.parquet',
        '../results/dataset_v4/final_test_set_reduced.parquet',
    )
)

In [None]:
# Count RNA sequences shared between RPI2825 and the rna_inter train/test sets.
# Every side is upper-cased before the comparison: the rna_inter sequences are
# normalised this way, so RPI2825 must be too, otherwise a mere case difference
# would hide a real overlap.
rna_seqs_rpi_2825 = {seq.upper() for seq in rpi_db['rna_seq']}
rna_seqs_rna_inter_train = set(rna_inter_train['Sequence_1'])
# Number of distinct train RNA sequences before case normalisation.
print(len(rna_seqs_rna_inter_train))
rna_seqs_rna_inter_train = {seq.upper() for seq in rna_seqs_rna_inter_train}
rna_seqs_rna_inter_test = set(rna_inter_test['Sequence_1'])
# Number of distinct test RNA sequences before case normalisation.
print(len(rna_seqs_rna_inter_test))
rna_seqs_rna_inter_test = {seq.upper() for seq in rna_seqs_rna_inter_test}
# Printed in order: RPI2825 vs train, RPI2825 vs test, train vs test.
print(f"{len(rna_seqs_rpi_2825.intersection(rna_seqs_rna_inter_train))}")
print(f"{len(rna_seqs_rpi_2825.intersection(rna_seqs_rna_inter_test))}")
print(f"{len(rna_seqs_rna_inter_train.intersection(rna_seqs_rna_inter_test))}")

# Same overlap check for the protein sequences, with the same case normalisation.
protein_seqs_rpi_2825 = {seq.upper() for seq in rpi_db['protein_seq']}
protein_seqs_rna_inter_train = {seq.upper() for seq in set(rna_inter_train['Sequence_2'])}
protein_seqs_rna_inter_test = {seq.upper() for seq in set(rna_inter_test['Sequence_2'])}
# Printed in order: RPI2825 vs train, RPI2825 vs test, train vs test.
print(f"{len(protein_seqs_rpi_2825.intersection(protein_seqs_rna_inter_train))}")
print(f"{len(protein_seqs_rpi_2825.intersection(protein_seqs_rna_inter_test))}")
print(f"{len(protein_seqs_rna_inter_train.intersection(protein_seqs_rna_inter_test))}")

In [None]:
# Export every distinct RNA sequence once, with a running integer ID stored in
# both 'Sequence_1_ID' and 'Raw_ID1'. The family / cluster annotation files read
# in later cells are keyed on 'Sequence_1'.
unique_rna_df = (
    rpi_db[['rna_seq']]
    .drop_duplicates(subset=['rna_seq'])
    .rename(columns={'rna_seq': 'Sequence_1'})
    .assign(Sequence_1_ID=lambda frame: pd.RangeIndex(len(frame)))
    .assign(Raw_ID1=lambda frame: frame['Sequence_1_ID'])
)
unique_rna_df.to_parquet("../results/rpi2825/rna_sequences.parquet")

In [None]:
# Attach the RNA family annotation to every interaction, joined on the RNA sequence.
org_len = len(rpi_db)
rna_families_df = pd.read_parquet('../results/rpi2825/rna_sequences_families.parquet').drop(columns=['Id'])
rpi_db = (
    rpi_db
    .rename(columns={'rna_seq': 'Sequence_1'})
    .merge(rna_families_df, on=['Sequence_1'])
)
# The join must neither drop nor duplicate interactions.
assert len(rpi_db) == org_len

In [None]:
# Attach the RNA cluster annotation, joined on the RNA sequence. The ID columns are
# dropped from the cluster table first (presumably because rpi_db already carries
# them from the family table - verify).
org_len = len(rpi_db)
rna_cluster_df = pd.read_parquet('../results/rpi2825/rna_sequences_clusters.parquet').drop(
    columns=['Sequence_1_ID', 'Raw_ID1']
)
rpi_db = rpi_db.merge(rna_cluster_df, on=['Sequence_1'])
# The join must neither drop nor duplicate interactions.
assert len(rpi_db) == org_len

In [None]:
# Export every distinct protein sequence once, with a running integer ID stored in
# both 'Sequence_2_ID' and 'Raw_ID2'. The protein cluster file read in the next
# cell is keyed on 'Sequence_2'.
unique_protein_df = (
    rpi_db[['protein_seq']]
    .drop_duplicates(subset=['protein_seq'])
    .rename(columns={'protein_seq': 'Sequence_2'})
    .assign(Sequence_2_ID=lambda frame: pd.RangeIndex(len(frame)))
    .assign(Raw_ID2=lambda frame: frame['Sequence_2_ID'])
)
unique_protein_df.to_parquet("../results/rpi2825/protein_sequences.parquet")

In [None]:
# Attach the protein cluster annotation, joined on the protein sequence.
# The ID columns are dropped here; they are merged in from unique_protein_df later.
org_len = len(rpi_db)
protein_cluster_df = pd.read_parquet('../results/rpi2825/protein_sequences_clusters.parquet').drop(
    columns=['Sequence_2_ID', 'Raw_ID2']
)
rpi_db = (
    rpi_db
    .rename(columns={'protein_seq': 'Sequence_2'})
    .merge(protein_cluster_df, on=['Sequence_2'])
)
# The join must neither drop nor duplicate interactions.
assert len(rpi_db) == org_len

In [None]:
# Prepare the extension step: give every interaction a running 'RNAInterID' and
# attach the protein IDs ('Sequence_2_ID', 'Raw_ID2') from unique_protein_df.
org_len = len(rpi_db)
rpi_db = (
    rpi_db
    .assign(RNAInterID=range(org_len))
    .merge(unique_protein_df, on=['Sequence_2'])
)
# The join must neither drop nor duplicate interactions.
assert len(rpi_db) == org_len

In [None]:
# This function SELECTS for each interaction a new RNA-interaction partner from the entire dataset for a given protein-interaction partner
def increase_set_1(limited_interactions: pd.DataFrame, dataset: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Create one shuffled-RNA interaction for every row of ``dataset``.

    For each row the protein side is kept and the RNA side is replaced by the RNA
    of a randomly sampled row of ``limited_interactions``. A sampled row is only
    eligible if it

    * has a different ``Raw_ID2`` (protein) than the current row,
    * has a different ``Sequence_1_family`` and ``Sequence_1_cluster`` than the
      current row's RNA, and
    * does not reproduce a (protein, RNA) pair that already exists in ``dataset``.

    Args:
        limited_interactions: Pool of interactions the replacement RNA is drawn from.
        dataset: Interactions to extend; one new row is produced per input row.
        random_state: Optional seed, ``numpy.random.RandomState`` or
            ``numpy.random.Generator`` for reproducible sampling. ``None`` keeps
            pandas' default behaviour of ``DataFrame.sample``.

    Returns:
        DataFrame with ``dataset.shape[0]`` rows. Each row has ``score`` 0,
        ``Sequence_1_shuffle`` True, NaN for ``strong`` / ``weak`` / ``predict`` and
        the original ``RNAInterID`` suffixed with ``'.RV1'``.

    Raises:
        AssertionError: If no eligible replacement RNA exists for some row, or if a
            generated pair is already present in ``dataset``.
    """
    # RNA-side columns removed from the original row; the sampled row supplies them.
    # 'rna_seq_len' is the length column this notebook adds when filtering.
    rna_columns = ('Raw_ID1', 'Interactor1.Symbol', 'Category1', 'Species1', 'Sequence_1', 'Sequence_1_len',
                   'Sequence_1_ID', 'Sequence_1_shuffle', 'Sequence_1_cluster',
                   'Sequence_1_cluster_sim', 'Sequence_1_cluster_reference',
                   'Sequence_1_rfam_q_accession', 'Sequence_1_family',
                   'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description',
                   'Sequence_1_rfam_e_value', 'Id', 'rna_seq_len')
    # Protein-side and label columns removed from the sampled row; the original row
    # supplies them. 'protein_seq_len' must be removed as well, otherwise the sampled
    # row's protein length would overwrite the length of the protein that is kept.
    protein_columns = ('Raw_ID2', 'Interactor2.Symbol', 'Category2', 'Species2', 'Sequence_2_ID', 'Sequence_2',
                       'Sequence_2_len', 'Sequence_2_shuffle', 'Sequence_2_cluster', 'Sequence_2_cluster_sim',
                       'Sequence_2_cluster_reference',
                       'score', 'strong', 'weak', 'predict', 'RNAInterID', 'Id', 'protein_seq_len')

    # One shared generator so consecutive draws differ but the whole run is repeatable.
    if random_state is None or isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # Exclude rows with the same protein, the same RNA family or the same RNA cluster.
        candidates = limited_interactions[
            (limited_interactions['Raw_ID2'] != row['Raw_ID2'])
            & (limited_interactions['Sequence_1_family'] != row['Sequence_1_family'])
            & (limited_interactions['Sequence_1_cluster'] != row['Sequence_1_cluster'])
        ]
        # Exclude RNAs that already interact with this protein in `dataset`. Filtering
        # up front (instead of re-sampling until a free pair turns up) cannot loop
        # forever when every candidate is already a known partner.
        known = dataset.loc[
            (dataset['Raw_ID2'] == row['Raw_ID2']) & (dataset['Sequence_2_ID'] == row['Sequence_2_ID']),
            ['Raw_ID1', 'Sequence_1_ID'],
        ]
        known_pairs = set(zip(known['Raw_ID1'], known['Sequence_1_ID']))
        is_new_pair = np.array(
            [pair not in known_pairs for pair in zip(candidates['Raw_ID1'], candidates['Sequence_1_ID'])],
            dtype=bool,
        )
        candidates = candidates[is_new_pair]
        assert candidates.shape[0] != 0, "no eligible replacement RNA for this interaction"

        random_row = candidates.sample(random_state=rng).to_dict('records')[0]

        # Protein side (and bookkeeping columns) from the original row, RNA side from the sampled row.
        kept = {key: value for key, value in row.to_dict().items() if key not in rna_columns}
        sampled = {key: value for key, value in random_row.items() if key not in protein_columns}
        new_row = {**kept, **sampled, 'score': 0, 'Sequence_1_shuffle': True, 'strong': float("nan"),
                   'weak': float("nan"), 'predict': float("nan")}
        new_row['RNAInterID'] = str(new_row['RNAInterID']) + '.RV1'
        new_rows.append(new_row)

    # Build the frame once; concatenating inside the loop is quadratic in the number of rows.
    increased_set = pd.DataFrame(new_rows)

    assert dataset.shape[0] == increased_set.shape[0]
    if not increased_set.empty:
        # No generated pair may coincide with an existing interaction.
        assert dataset.merge(increased_set, on=['Raw_ID1', 'Sequence_1_ID', 'Raw_ID2', 'Sequence_2_ID'], how='inner').shape[0] == 0
    return increased_set

In [None]:
# This function SELECTS for each interaction a new protein-interaction partner from the entire dataset for a given rna-interaction partner
def increase_set_2(limited_interactions: pd.DataFrame, dataset: pd.DataFrame, random_state=None) -> pd.DataFrame:
    """Create one shuffled-protein interaction for every row of ``dataset``.

    For each row the RNA side is kept and the protein side is replaced by the
    protein of a randomly sampled row of ``limited_interactions``. A sampled row is
    only eligible if it

    * has a different ``Raw_ID1`` (RNA) than the current row,
    * has a different ``Sequence_2_cluster`` than the current row's protein, and
    * does not reproduce an (RNA, protein) pair that already exists in ``dataset``.

    Args:
        limited_interactions: Pool of interactions the replacement protein is drawn from.
        dataset: Interactions to extend; one new row is produced per input row.
        random_state: Optional seed, ``numpy.random.RandomState`` or
            ``numpy.random.Generator`` for reproducible sampling. ``None`` keeps
            pandas' default behaviour of ``DataFrame.sample``.

    Returns:
        DataFrame with ``dataset.shape[0]`` rows. Each row has ``score`` 0,
        ``Sequence_2_shuffle`` True, NaN for ``strong`` / ``weak`` / ``predict`` and
        the original ``RNAInterID`` suffixed with ``'.PV1'``.

    Raises:
        AssertionError: If no eligible replacement protein exists for some row, or
            if a generated pair is already present in ``dataset``.
    """
    # Protein-side columns removed from the original row; the sampled row supplies them.
    # 'protein_seq_len' is the length column this notebook adds when filtering.
    protein_columns = ('Raw_ID2', 'Interactor2.Symbol', 'Category2', 'Species2', 'Sequence_2', 'Sequence_2_len',
                       'Sequence_2_ID', 'Sequence_2_shuffle', 'Sequence_2_cluster',
                       'Sequence_2_cluster_sim', 'Sequence_2_cluster_reference',
                       'Id', 'protein_seq_len')
    # RNA-side and label columns removed from the sampled row; the original row
    # supplies them. 'rna_seq_len' must be removed as well, otherwise the sampled
    # row's RNA length would overwrite the length of the RNA that is kept.
    rna_columns = ('Raw_ID1', 'Interactor1.Symbol', 'Category1', 'Species1', 'Sequence_1', 'Sequence_1_len',
                   'Sequence_1_ID', 'Sequence_1_shuffle', 'Sequence_1_cluster',
                   'Sequence_1_cluster_sim', 'Sequence_1_cluster_reference',
                   'Sequence_1_rfam_q_accession', 'Sequence_1_family',
                   'Sequence_1_rfam_t_accession', 'Sequence_1_rfam_description',
                   'Sequence_1_rfam_e_value',
                   'score', 'strong', 'weak', 'predict', 'RNAInterID', 'Id', 'rna_seq_len')

    # One shared generator so consecutive draws differ but the whole run is repeatable.
    if random_state is None or isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    new_rows = []
    for _, row in tqdm(dataset.iterrows(), total=dataset.shape[0]):
        # Exclude rows with the same RNA or the same protein cluster.
        candidates = limited_interactions[
            (limited_interactions['Raw_ID1'] != row['Raw_ID1'])
            & (limited_interactions['Sequence_2_cluster'] != row['Sequence_2_cluster'])
        ]
        # Exclude proteins that already interact with this RNA in `dataset`. Filtering
        # up front (instead of re-sampling until a free pair turns up) cannot loop
        # forever when every candidate is already a known partner.
        known = dataset.loc[
            (dataset['Raw_ID1'] == row['Raw_ID1']) & (dataset['Sequence_1_ID'] == row['Sequence_1_ID']),
            ['Raw_ID2', 'Sequence_2_ID'],
        ]
        known_pairs = set(zip(known['Raw_ID2'], known['Sequence_2_ID']))
        is_new_pair = np.array(
            [pair not in known_pairs for pair in zip(candidates['Raw_ID2'], candidates['Sequence_2_ID'])],
            dtype=bool,
        )
        candidates = candidates[is_new_pair]
        assert candidates.shape[0] != 0, "no eligible replacement protein for this interaction"

        random_row = candidates.sample(random_state=rng).to_dict('records')[0]

        # RNA side (and bookkeeping columns) from the original row, protein side from the sampled row.
        kept = {key: value for key, value in row.to_dict().items() if key not in protein_columns}
        sampled = {key: value for key, value in random_row.items() if key not in rna_columns}
        new_row = {**kept, **sampled, 'score': 0, 'Sequence_2_shuffle': True, 'strong': float("nan"),
                   'weak': float("nan"), 'predict': float("nan")}
        new_row['RNAInterID'] = str(new_row['RNAInterID']) + '.PV1'
        new_rows.append(new_row)

    # Build the frame once; concatenating inside the loop is quadratic in the number of rows.
    increased_set = pd.DataFrame(new_rows)

    assert dataset.shape[0] == increased_set.shape[0]
    if not increased_set.empty:
        # No generated pair may coincide with an existing interaction.
        assert dataset.merge(increased_set, on=['Raw_ID1', 'Sequence_1_ID', 'Raw_ID2', 'Sequence_2_ID'], how='inner').shape[0] == 0
    return increased_set

In [None]:
# Seed NumPy's global random state before sampling: DataFrame.sample() without an
# explicit random_state draws from it, so the shuffled partners picked by
# increase_set_1 / increase_set_2 are the same on every run of the notebook.
NEGATIVE_SAMPLING_SEED = 42
np.random.seed(NEGATIVE_SAMPLING_SEED)

# One shuffled-RNA and one shuffled-protein interaction per original interaction.
temp_df_1 = increase_set_1(rpi_db, rpi_db)
temp_df_2 = increase_set_2(rpi_db, rpi_db)
rpi_db_extent = pd.concat([rpi_db, temp_df_1, temp_df_2])
# Check the size before anything is written to disk.
assert len(rpi_db_extent) == len(rpi_db) * 3

# assign unique Sequence IDs for embedding creation
rpi_db['Sequence_1_ID_Unique'] = rpi_db.groupby(['Sequence_1']).ngroup()
rpi_db['Sequence_2_ID_Unique'] = rpi_db.groupby(['Sequence_2']).ngroup()
unique_proteins = rpi_db[['Sequence_2_ID_Unique', 'Sequence_2']].drop_duplicates()
unique_RNAs = rpi_db[['Sequence_1_ID_Unique', 'Sequence_1']].drop_duplicates()
unique_proteins.to_parquet("../results/rpi2825/unique_proteins.parquet")
unique_RNAs.to_parquet("../results/rpi2825/unique_RNAs.parquet")

In [None]:
# Store 'RNAInterID' as strings (the generated rows carry '.RV1' / '.PV1' suffixes),
# attach the unique sequence IDs to every row and write the extended dataset.
rpi_db_extent = (
    rpi_db_extent
    .assign(RNAInterID=lambda frame: frame['RNAInterID'].astype(str))
    .merge(unique_proteins, on=['Sequence_2'])
    .merge(unique_RNAs, on=['Sequence_1'])
)
rpi_db_extent.to_parquet("../results/rpi2825/rpi2825_extented.parquet")