# Code

In [1]:
import pandas as pd
import os
import time
import numpy as np
import argparse
import sys
sys.path.insert(0, '..')
import dataset.train_test_val_utils as utils
import ast
import re
from tqdm.notebook import tqdm

In [2]:
# ROOT_DIR is the parent of the current working directory; every data folder
# used by this notebook lives under <ROOT_DIR>/dataset/.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
original_files_dir, processed_files_dir, rna_rna_pairs_data_dir = (
    os.path.join(ROOT_DIR, 'dataset', subdir)
    for subdir in ('original_files', 'processed_files', 'rna_rna_pairs')
)

### Pre-processing

In [3]:
# Hub table, with a few columns renamed.
df_hub = utils.read_dataframe(
    os.path.join(original_files_dir, 'hub.table.paris.txt'),
    columns_to_drop = ['Unnamed: 0', 'gene_name'],
).rename(
    {
        'cell_line': 'cell_line_set',
        'degree': 'n_interactors',
        'gene_type': 'gene_type_set',
        'species': 'species_set',
    },
    axis = 1,
)

# Transcript-region table; the Ensembl gene id column is renamed to 'gene_id'
# so it shares a key with df_hub for the merge below.
tx = utils.read_dataframe(
    os.path.join(original_files_dir, 'tx_regions.ens99.txt'),
    columns_to_drop = ['Unnamed: 0', 'ensembl_transcript_id'],
).rename({'ensembl_gene_id': 'gene_id'}, axis = 1)

# Controls / controlled table (tab-separated).
cc = pd.read_csv(
    os.path.join(original_files_dir, 'controls_controlled.hub.txt'),
    sep = '\t',
)

# Mapped interactions, without gene names, score, transcript ids and bookkeeping columns.
int_or = utils.read_dataframe(
    os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'),
    columns_to_drop = [
        'Unnamed: 0', 'gene_name1', 'gene_name2', 'score',
        'tx_id_1', 'tx_id_2', 'rise_id', 'type_interaction',
    ],
)

# The merge on gene_id must leave the number of hub rows unchanged.
df_genes = df_hub.merge(tx, on = 'gene_id')
assert df_genes.shape[0] == df_hub.shape[0]

In [4]:
# Same interactions file as int_or, but dropping fewer columns: gene names,
# score and transcript ids are kept here.
int_or_all = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['rise_id','Unnamed: 0', 'type_interaction'])

In [5]:
# Number of distinct rows when the extra columns are kept.
len(int_or_all.drop_duplicates())

90086

In [6]:
# Row count of the reduced interaction table before de-duplication.
len(int_or)

92173

In [7]:
# Remove exact duplicate rows and renumber the index from 0.
int_or = int_or.drop_duplicates().reset_index(drop = True)
len(int_or)

89971

In [8]:
def get_list_couples(d):
    """
    Convert an iterable of 'geneA--geneB' strings into order-independent couple ids.

    Each entry is split around '--', the two parts are sorted and joined
    with '_', so 'b--a' and 'a--b' both become 'a_b'.

    Args:
        d: iterable of strings (iterating a dict yields its keys).

    Returns:
        list[str]: one couple id per entry, in iteration order.

    Raises:
        AttributeError: if an entry does not contain a '--' separator with
            text on both sides (the regex search returns None).
    """
    def to_couple(pair):
        match = re.search(r"(.+)--(.+)", pair)
        return '_'.join(sorted(match.group(1, 2)))

    return [to_couple(pair) for pair in d]

def get_couple_id(g1, g2, needed_to_swap = False):
    """
    Build the order-independent id of a gene couple.

    The two ids are sorted and joined with '_', so (g1, g2) and (g2, g1)
    produce the same string.

    Args:
        g1: first gene id (string).
        g2: second gene id (string).
        needed_to_swap: when True, also report whether sorting put g2 first.

    Returns:
        str, or (str, bool) when needed_to_swap is True; the bool is True
        when the first element after sorting is not g1.
    """
    ordered = sorted([g1, g2])
    couple_id = '_'.join(ordered)
    if not needed_to_swap:
        return couple_id
    return couple_id, ordered[0] != g1

def create_pairs(x):
    """
    Row-wise helper: compute the couple id of a row and its swap flag.

    The couple id is built from the sorted gene ids, so gene1_gene2 and
    gene2_gene1 map to the same string.

    Args:
        x: row-like object exposing `gene_id1` and `gene_id2` attributes.

    Returns:
        pd.Series: [couple id, need_to_swap flag] as produced by get_couple_id.
    """
    couple_id, swap_flag = get_couple_id(x.gene_id1, x.gene_id2, needed_to_swap = True)
    return pd.Series([couple_id, swap_flag])

def obtain_df_pos_controls(cc):
    """
    Pair every positive couple of a controlled gene with its matched negative couple.

    For each row of `cc`, the `couples_negative` and `couples_rr_ctrlled`
    columns are parsed as Python literals and turned into couple ids. For
    each positive couple, the partner of the controlled gene is paired with
    the control gene to form the negative couple id.

    Args:
        cc: DataFrame with columns `controlled`, `controls`,
            `couples_negative` and `couples_rr_ctrlled`.

    Returns:
        pd.DataFrame: columns 'positive' and 'negative', one row per
        positive couple encountered, indexed 0..n-1.

    Raises:
        AssertionError: if a derived negative couple is not listed among the
            row's negative couples.
    """
    records = []
    for _, row in cc.iterrows():
        controlled_gene, control_gene = row.controlled, row.controls
        raw_negatives = ast.literal_eval(row.couples_negative)
        raw_positives = ast.literal_eval(row.couples_rr_ctrlled)
        negatives = get_list_couples(raw_negatives)
        positives = get_list_couples(raw_positives)
        for pair in positives:
            first_gene, second_gene = re.search(r"(.+)_(.+)", pair).groups()
            # The partner is whichever member of the couple is not the controlled gene.
            if first_gene == controlled_gene:
                partner = second_gene
            else:
                partner = first_gene
            negative_pair = get_couple_id(control_gene, partner)
            assert negative_pair in negatives
            records.append({'positive': pair, 'negative':negative_pair})

    return pd.DataFrame.from_dict(dict(enumerate(records)), 'index')

def swap_genes_if_needed(df):
    """
    Exchange the side-1 and side-2 columns on the rows flagged by `need_to_swap`.

    Note that the swaps are written into `df` itself (via `.loc`); only the
    final removal of the `need_to_swap` column produces a new frame.

    Args:
        df: DataFrame holding a boolean `need_to_swap` column plus the
            paired columns listed in `paired_columns` below.

    Returns:
        pd.DataFrame: the swapped data without the `need_to_swap` column.

    Raises:
        AssertionError: if, after swapping, the number of distinct rows
            differs from the input row count.
    """
    paired_columns = [
        ('gene_id1', 'gene_id2'),
        ('length_1', 'length_2'),
        ('start_map1', 'start_map2'),
        ('end_map1', 'end_map2'),
        ('transcript_biotype_1', 'transcript_biotype_2'),
        ('tx_id_1_localization', 'tx_id_2_localization'),
        ('gene_type1', 'gene_type2'),
        ('cdna_1', 'cdna_2'),
    ]
    n_rows_before = df.shape[0]
    flagged = df.need_to_swap
    for left, right in paired_columns:
        # `.values` strips the labels so the assignment is positional, not aligned by column name.
        df.loc[flagged, [left, right]] = df.loc[flagged, [right, left]].values
    swapped = df.drop('need_to_swap', axis = 1)
    assert swapped.drop_duplicates().shape[0] == n_rows_before
    return swapped

def create_features(df):
    """
    Add derived columns to the interaction table and one-hot encode `cell_line`.

    Columns added: `area_of_the_matrix` (length_1 * length_2),
    `protein_coding_1` / `protein_coding_2` (biotype equals 'protein_coding'),
    `area_of_the_interaction` (product of the two mapped-region widths),
    `interacting` (always True), and one indicator column per cell line.
    The new columns are written into `df` itself; the returned frame is a
    new object with the indicator columns appended and `cell_line` removed.

    Args:
        df: DataFrame with `length_1`, `length_2`, `transcript_biotype_1`,
            `transcript_biotype_2`, `start_map1`, `end_map1`, `start_map2`,
            `end_map2` and `cell_line` columns.

    Returns:
        pd.DataFrame: the enriched table.

    Raises:
        AssertionError: if any mapped region has non-positive width, or if
            the set of cell lines is not exactly
            {'HEK293T', 'HeLa', 'HEK293', 'mES', 'Mouse_brain'}.
    """
    width_1 = df['end_map1'] - df['start_map1']
    width_2 = df['end_map2'] - df['start_map2']

    df['area_of_the_matrix'] = df['length_1'] * df['length_2']
    df['protein_coding_1'] = df['transcript_biotype_1'] == 'protein_coding'
    df['protein_coding_2'] = df['transcript_biotype_2'] == 'protein_coding'
    assert width_1.min() > 0
    assert width_2.min() > 0
    df['area_of_the_interaction'] = width_1 * width_2
    df['interacting'] = True

    cell_line_dummies = pd.get_dummies(df.cell_line)
    assert set(cell_line_dummies.columns) == {'HEK293T', 'HeLa', 'HEK293', 'mES', 'Mouse_brain'}
    return pd.concat([df, cell_line_dummies], axis = 1).drop('cell_line', axis = 1)

In [9]:
# One row per positive couple, together with the negative couple derived for it.
df_pairs = obtain_df_pos_controls(cc)

In [10]:
# Count distinct positive and negative couple ids (set() removes repeats).
print(f'We have {len(set(df_pairs.positive))} pairs interacting (they can have multiple interactions) \n')
print(f'We have {len(set(df_pairs.negative))} pairs not interacting \n')

We have 80494 pairs interacting (they can have multiple interactions) 

We have 160511 pairs not interacting 



In [11]:
# TODO: uncomment the checks below

# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) > 0
# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) > 0

# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) == 0
# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) == 0

In [12]:
# Distinct negative couples, with the couple id split back into its two gene ids.
df_neg = (
    df_pairs[['negative']]
    .drop_duplicates()
    .reset_index(drop = True)
    .rename({'negative':'couples'}, axis = 1)
)
df_neg[['gene1', 'gene2']] = df_neg['couples'].str.split('_', expand = True)

# Collapse df_pairs to one row per positive couple, holding the list of its negatives.
# NOTE: this overwrites df_pairs, so the cell cannot be re-run without re-creating it first.
df_pairs = df_pairs.groupby('positive').agg({'negative': list}).reset_index()

In [13]:
# Row by row: the order-independent couple id, and whether gene_id1/gene_id2
# are in the opposite order to the sorted one used in that id.
int_or[['couples', 'need_to_swap']] = int_or[['gene_id1', 'gene_id2']].apply(create_pairs, axis = 1)

In [14]:
# Exchange the side-1/side-2 columns on the flagged rows; the helper
# need_to_swap column is dropped from the result.
int_or = swap_genes_if_needed(int_or)

In [15]:
# Check that swapping worked: after it, no row should still be flagged as needing a swap.
assert not int_or[['gene_id1', 'gene_id2']].apply(create_pairs, axis = 1)[1].any()

In [16]:
# Distinct rows once both gene orders are expressed the same way.
len(int_or.drop_duplicates())

88999

In [17]:
# Add the derived feature columns and replace cell_line with indicator columns.
int_or = create_features(int_or)

In [18]:
# Distinct row count after adding the feature columns.
len(int_or.drop_duplicates())

88999

In [19]:
# Per-gene invariants: within each gene, the protein-coding flag and the
# length must not vary (a maximum standard deviation of 0 across groups).
assert int_or.groupby('gene_id1').std(numeric_only = True).protein_coding_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).protein_coding_2.max() == 0

assert int_or.groupby('gene_id1').std(numeric_only = True).length_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).length_2.max() == 0

# The declared length must match the cDNA sequence length. This is checked on
# every row instead of on one unseeded random row, so the result is
# reproducible and a single bad row cannot slip through.
assert (int_or['length_1'] == int_or['cdna_1'].str.len()).all()
assert (int_or['length_2'] == int_or['cdna_2'].str.len()).all()

In [22]:
# Per-gene attributes from each side of the interaction, renamed to a common
# schema so the two sides can be stacked.
gene_info1 = int_or[['gene_id1', 'length_1', 'cdna_1', 'protein_coding_1']].rename(
    {'gene_id1': 'gene_id', 'length_1': 'length', 'cdna_1': 'cdna', 'protein_coding_1': 'protein_coding'},
    axis = 1,
)
gene_info2 = int_or[['gene_id2', 'length_2', 'cdna_2', 'protein_coding_2']].rename(
    {'gene_id2': 'gene_id', 'length_2': 'length', 'cdna_2': 'cdna', 'protein_coding_2': 'protein_coding'},
    axis = 1,
)

In [26]:
# Stack both sides into a single table and remove repeated rows.
gene_info = pd.concat([gene_info1, gene_info2], axis = 0, ignore_index = True).drop_duplicates()

In [None]:
# The genes seen in the interactions must be exactly the genes in df_genes.
assert set(gene_info.gene_id) == set(df_genes.gene_id)

In [31]:
# NOTE(review): no `on=` is given, so pandas joins on every column name the
# two frames share — confirm that is the intended key.
# This reassigns df_genes, so re-running the cell merges gene_info in again.
df_genes = df_genes.merge(gene_info)

In [33]:
# Save the per-gene table (without the index column).
df_genes.to_csv(os.path.join(processed_files_dir, 'df_genes.csv'), index = False)

In [None]:
#clean int_or
int_full_info = int_or.drop(['cdna1', 'cdna2'], axis = 1).columns

int_full_info.to_csv(os.path.join(processed_files_dir, 'full_paris_info_interactions.csv'), index = False)

In [None]:
# Reduced positive-interaction table, using the same gene1 / gene2 column names as df_neg.
# gene_id2 must map to 'gene2'; mapping both ids to 'gene1' would create a
# duplicated column and leave no 'gene2'.
# NOTE(review): only length_1 / protein_coding_1 are selected — confirm whether
# the side-2 counterparts should be included as well.
df_int = int_full_info[['couples', 'gene_id1', 'gene_id2', 'interacting', 
                        'length_1', 'protein_coding_1',
                        'start_map1', 'end_map1', 'start_map2', 'end_map2']].rename({'gene_id1':'gene1', 'gene_id2':'gene2'}, axis = 1)

In [39]:
# Inspect the negative couples table (couple id plus its two gene ids).
df_neg

Unnamed: 0,couples,gene1,gene2
0,ENSG00000000003_ENSG00000133703,ENSG00000000003,ENSG00000133703
1,ENSG00000000003_ENSG00000272716,ENSG00000000003,ENSG00000272716
2,ENSG00000000003_ENSG00000183779,ENSG00000000003,ENSG00000183779
3,ENSG00000000003_ENSG00000164363,ENSG00000000003,ENSG00000164363
4,ENSG00000000003_ENSG00000129351,ENSG00000000003,ENSG00000129351
...,...,...,...
160506,ENSMUSG00000021302_ENSMUSG00000118552,ENSMUSG00000021302,ENSMUSG00000118552
160507,ENSMUSG00000064373_ENSMUSG00000118559,ENSMUSG00000064373,ENSMUSG00000118559
160508,ENSMUSG00000027630_ENSMUSG00000118559,ENSMUSG00000027630,ENSMUSG00000118559
160509,ENSMUSG00000015090_ENSMUSG00000118559,ENSMUSG00000015090,ENSMUSG00000118559


In [46]:
#TODO:
# Give df_neg the same columns as df_int. For the interaction regions, I could borrow fake interactions from the positives.

In [None]:
# # ONLY FOR CONTROLS 
# df['interacting'] = False
# df[['area_of_the_interaction', 
#     'start_map1', 'end_map1', 
#     'start_map2', 'end_map2', 
#     'tx_id_1_localization', 'tx_id_2_localization',
#     'HEK293T', 'HeLa', 'HEK293', 'mES', 'Mouse_brain',
#    'tx_id_1_localization', 'tx_id_2_localization']] = np.nan

In [10]:
#create protein_coding_columns