# Code

In [1]:
import pandas as pd
import os
import time
import numpy as np
import argparse
import sys
sys.path.insert(0, '..')
import dataset.train_test_val_utils as utils
import ast
import re
from tqdm.notebook import tqdm

In [2]:
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
original_files_dir = os.path.join(ROOT_DIR, 'dataset', 'original_files')
processed_files_dir = os.path.join(ROOT_DIR, 'dataset', 'processed_files')
rna_rna_pairs_data_dir = os.path.join(ROOT_DIR, 'dataset', 'rna_rna_pairs')

### Pre-processing

In [3]:
df_hub = utils.read_dataframe(os.path.join(original_files_dir, 'hub.table.paris.txt'), columns_to_drop = ['Unnamed: 0','gene_name']).rename({'cell_line': 'cell_line_set',
                                                                                                                                                    'degree':'n_interactors',
                                                                                                                                                    'gene_type': 'gene_type_set',
                                                                                                                                                    'species': 'species_set'}, axis = 1)
tx = utils.read_dataframe(os.path.join(original_files_dir,'tx_regions.ens99.txt'), columns_to_drop = ['Unnamed: 0','ensembl_transcript_id']).rename({'ensembl_gene_id': 'gene_id'}, axis = 1)
cc = pd.read_csv(os.path.join(original_files_dir,'controls_controlled.hub.txt'), sep = '\t')
int_or = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['Unnamed: 0', 'gene_name1', 'gene_name2', 'score', 'tx_id_1', 'tx_id_2', 'rise_id', 'type_interaction', 'tx_id_1_localization', 'tx_id_2_localization', 'gene_type1', 'gene_type2'])

df_genes = df_hub.merge(tx, on = 'gene_id')
assert df_genes.shape[0] == df_hub.shape[0]

In [4]:
int_or_all = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['rise_id','Unnamed: 0', 'type_interaction'])

In [5]:
int_or_all.drop_duplicates().shape[0]

90086

In [6]:
int_or.shape[0]

92173

In [7]:
int_or = int_or.drop_duplicates().reset_index(drop = True)
int_or.shape[0]

89971

In [8]:
def get_list_couples(d):
    l = []
    for pair in d:
        rgx = re.search(r"(.+)--(.+)", pair)
        first_gene, second_gene = rgx.group(1), rgx.group(2)
        couple = '_'.join(sorted([first_gene, second_gene]))
        # print(couple, couple in all_couples)
        # assert couple in all_couples
        l.append(couple)
    return l

def get_couple_id(g1, g2, needed_to_swap = False):
    l = sorted([g1, g2])
    if needed_to_swap:
        need_to_swap = False if l[0] == g1 else True
        return '_'.join(l), need_to_swap
    else:
        return '_'.join(l)

def create_pairs(x):
    """
    Pairs will be created in such a way that gene1_gene2 is equal to gene2_gene1. How? I will simply order the strings before create the couple string.
    """
    first_gene = x.gene_id1
    second_gene = x.gene_id2
    couple, need_to_swap = get_couple_id(first_gene, second_gene, needed_to_swap = True)
    return pd.Series([couple, need_to_swap])

def obtain_df_pos_controls(cc):
    diz = {}
    idx = 0
    for _, row in cc.iterrows():
        controlled_gene = row.controlled
        control_gene = row.controls
        d_neg = ast.literal_eval(row.couples_negative)
        d_pos = ast.literal_eval(row.couples_rr_ctrlled)
        negatives = get_list_couples(d_neg)
        positives = get_list_couples(d_pos)
        for pair in positives:
            rgx = re.search(r"(.+)_(.+)", pair)
            first_gene, second_gene = rgx.group(1), rgx.group(2)
            gene_to_search = second_gene if first_gene == controlled_gene else first_gene
            negative_pair =  get_couple_id(control_gene, gene_to_search)
            assert negative_pair in negatives
            diz[idx] = {'positive': pair, 'negative':negative_pair}
            idx+=1
        
    df_pairs = pd.DataFrame.from_dict(diz, 'index')
    return df_pairs

def swap_genes_if_needed(df):
    original_dim = df.shape[0]
    where = df.need_to_swap
    df.loc[where, ['gene_id1', 'gene_id2']] = (df.loc[where, ['gene_id2', 'gene_id1']].values)
    df.loc[where, ['length_1', 'length_2']] = (df.loc[where, ['length_2', 'length_1']].values)
    df.loc[where, ['start_map1', 'start_map2']] = (df.loc[where, ['start_map2', 'start_map1']].values)
    df.loc[where, ['end_map1', 'end_map2']] = (df.loc[where, ['end_map2', 'end_map1']].values)
    df.loc[where, ['transcript_biotype_1', 'transcript_biotype_2']] = (df.loc[where, ['transcript_biotype_2', 'transcript_biotype_1']].values)
    df.loc[where, ['gene_type1', 'gene_type2']] = (df.loc[where, ['gene_type2', 'gene_type1']].values)
    df.loc[where, ['cdna_1', 'cdna_2']] = (df.loc[where, ['cdna_2', 'cdna_1']].values)
    df = df.drop('need_to_swap', axis = 1)
    assert df.drop_duplicates().shape[0] == original_dim 
    return df

def create_features(df):
    df['area_of_the_matrix'] = df['length_1']*df['length_2']
    df['protein_coding_1'] = df['transcript_biotype_1'].apply(lambda x: True if x == 'protein_coding' else False)
    df['protein_coding_2'] = df['transcript_biotype_2'].apply(lambda x: True if x == 'protein_coding' else False)
    assert (df['end_map1'] - df['start_map1']).min() > 0 
    assert (df['end_map2'] - df['start_map2']).min() > 0 
    df['area_of_the_interaction'] = (df['end_map1'] - df['start_map1'])*(df['end_map2'] - df['start_map2'])
    df['interacting'] = True
    assert set(pd.get_dummies(df.cell_line).columns) == set(['HEK293T', 'HeLa', 'HEK293', 'mES', 'Mouse_brain'])
    df = pd.concat([df, pd.get_dummies(df.cell_line)], axis = 1).drop('cell_line', axis = 1)
    return df

def create_boxes_xywh(row):
    """
    start_map1, end_map1, start_map2, end_map2 are in the interval [0, len(rna)-1] for the interactive pairs (indeed, the interactive regions must be sliced like: cdna1[start_map1:(end_map1-1)])
    Args:
        a row (pd.Series) of the dataset.
    Returns:
        boxes (list): A list of bboxes  with the form -> bbox = [x, y, w, h] 
    """
    x = row.start_map1
    y = row.start_map2
    w = row.end_map1 - row.start_map1
    h = row.end_map2 - row.start_map2
    assert row.area_of_the_interaction == h*w
    # if row.area_of_the_interaction != h*w:
    #     print(row.area_of_the_interaction, h*w, h, w)
    #     assert False
    
    return pd.Series([x, y, w, h])

In [9]:
df_pairs = obtain_df_pos_controls(cc)

In [10]:
print(f'We have {len(set(df_pairs.positive))} pairs interacting (they can have multiple interactions) \n')
print(f'We have {len(set(df_pairs.negative))} pairs not interacting \n')

We have 80494 pairs interacting (they can have multiple interactions) 

We have 160511 pairs not interacting 



In [11]:
# LEVA I COMMENTI

# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) > 0
# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) > 0

# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) == 0
# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) == 0

In [12]:
df_neg = df_pairs[['negative']].drop_duplicates().reset_index(drop = True)
df_neg[['gene1', 'gene2']] = df_neg['negative'].str.split('_', expand = True)
df_neg = df_neg.rename({'negative':'couples'}, axis = 1)

df_pairs = df_pairs.groupby('positive').agg({'negative': list}).reset_index()

In [13]:
int_or[['couples', 'need_to_swap']] = int_or[['gene_id1', 'gene_id2']].apply(create_pairs, axis = 1)

In [14]:
int_or = swap_genes_if_needed(int_or)

In [15]:
assert (int_or[['gene_id1', 'gene_id2']].apply(create_pairs, axis = 1)[1] == False).all() #check if swapping works

In [16]:
int_or.drop_duplicates().shape[0]

88999

In [17]:
int_or = create_features(int_or)

In [18]:
int_or.drop_duplicates().shape[0]

88999

In [19]:
assert int_or.groupby('gene_id1').std(numeric_only = True).protein_coding_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).protein_coding_2.max() == 0

assert int_or.groupby('gene_id1').std(numeric_only = True).length_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).length_2.max() == 0

idx = np.random.randint(int_or.shape[0])
assert int_or.loc[idx].length_1 == len(int_or.loc[idx].cdna_1)
assert int_or.loc[idx].length_2 == len(int_or.loc[idx].cdna_2)

### Gene info df

In [21]:
gene_info1 = int_or[['gene_id1', 'length_1', 'cdna_1', 'protein_coding_1']]
gene_info1.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info2 = int_or[['gene_id2', 'length_2', 'cdna_2', 'protein_coding_2']]
gene_info2.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info = pd.concat([gene_info1, gene_info2], axis = 0, ignore_index = True).drop_duplicates()
assert set(gene_info.gene_id) == set(df_genes.gene_id)
df_genes = df_genes.merge(gene_info)
df_genes.to_csv(os.path.join(processed_files_dir, 'df_genes.csv'), index = False)

In [23]:
#clean int_or
int_or = int_or.drop(['cdna_1', 'cdna_2'], axis = 1)

### Clean bounding boxes of df interactions

In [37]:
import itertools
from util import box_ops

In [29]:
def clean_bounding_boxes(list_bboxes):
        list_of_indexes = []
        for idx in range(len(list_bboxes)):
            list_of_indexes.append(IndexNode(idx))
        index_pairs = list(itertools.combinations(list(range(len(list_bboxes))), 2))
        for index_pair in index_pairs:
            if box_ops.bboxes_overlaps(list_bboxes[index_pair[0]], list_bboxes[index_pair[1]]):
                list_of_indexes[index_pair[0]].add_link(list_of_indexes[index_pair[1]])
        nodes = set(list_of_indexes)
        
        # Find all the connected components.
        list_group_indexes = []
        for components in connected_components(nodes):
            group_indexes = sorted(node.name for node in components)
            list_group_indexes.append(group_indexes)
        new_list_of_boxes = []
        for group_idx in list_group_indexes:
            if len(group_idx)>0:

                group = np.array(list_bboxes)[group_idx,] 
                #now it's x1, y1, w, h

                group[:,2] = group[:,0] + group[:,2]
                group[:,3] = group[:,1] + group[:,3]
                #now it's x1, y1, x2, y2

                min_x1 = group.min(axis = 0)[0]
                min_y1 = group.min(axis = 0)[1]
                max_x2 =  group.max(axis = 0)[2]
                max_y2 = group.max(axis = 0)[3]

                w, h = (max_x2-min_x1), (max_y2-min_y1)

                new_list_of_boxes.append([min_x1, min_y1, w, h])
            else:
                new_list_of_boxes.append(list_bboxes[group_idx[0]])
        return new_list_of_boxes
        
class IndexNode(object):
    """
    credits: https://breakingcode.wordpress.com/2013/04/08/finding-connected-components-in-a-graph/
    """
    def __init__(self, name):
        self.__name  = name
        self.__links = set()

    @property
    def name(self):
        return self.__name

    @property
    def links(self):
        return set(self.__links)

    def add_link(self, other):
        self.__links.add(other)
        other.__links.add(self)
   
 # The function to look for connected components.
def connected_components(nodes):
    # List of connected components found. The order is random.
    result = []
    # Make a copy of the set, so we can modify it.
    nodes = set(nodes)
    # Iterate while we still have nodes to process.
    while nodes:
        # Get a random node and remove it from the global set.
        n = nodes.pop()
        # This set will contain the next group of nodes connected to each other.
        group = {n}
        # Build a queue with this node in it.
        queue = [n]
        # Iterate the queue.
        # When it's empty, we finished visiting a group of connected nodes.
        while queue:
            # Consume the next item from the queue.
            n = queue.pop(0)
            # Fetch the neighbors.
            neighbors = n.links
            # Remove the neighbors we already visited.
            neighbors.difference_update(group)
            # Remove the remaining nodes from the global set.
            nodes.difference_update(neighbors)
            # Add them to the group of connected nodes.
            group.update(neighbors)
            # Add them to the queue, so we visit them in the next iterations.
            queue.extend(neighbors)
        # Add the group to the list of groups.
        result.append(group)
    # Return the list of groups.
    return result

In [30]:
df_boxes = int_or.filter(['start_map1', 'end_map1', 'start_map2', 'end_map2','area_of_the_interaction'], axis = 1).apply(create_boxes_xywh, axis = 1).rename({0: 'x1', 1: 'y1', 2:'w', 3:'h'}, axis = 1)
int_or = pd.concat([int_or, df_boxes], axis = 1).drop(['start_map1', 'end_map1', 'start_map2', 'end_map2'], axis = 1)

In [43]:
row

gene_id1                                   ENSG00000198727
gene_id2                                   ENSG00000281026
gene_type1                                  protein_coding
gene_type2                                          lncRNA
method                                              PARIS1
species                                                 hs
length_1                                              1141
transcript_biotype_1                        protein_coding
length_2                                              4890
transcript_biotype_2                                lncRNA
couples                    ENSG00000198727_ENSG00000281026
area_of_the_matrix                                 5579490
protein_coding_1                                      True
protein_coding_2                                     False
area_of_the_interaction                                522
interacting                                           True
HEK293                                                  

In [48]:
#approx 13 min
diz_int = {}
idx = 0
for couple in tqdm(int_or.couples.unique()):
    subset = int_or[int_or.couples == couple]
    list_of_boxes = subset.filter(['x1', 'y1', 'w', 'h']).values.tolist()
    new_list_of_boxes = clean_bounding_boxes(list_of_boxes)
    row = int_or[int_or.couples == couple].iloc[0]
    for box in new_list_of_boxes:
        d = dict(row)
        d['x1'] = box[0]
        d['y1'] = box[1] 
        d['w'] = box[2]
        d['h'] = box[3]
        diz_int[idx] = d
        idx+=1

  0%|          | 0/85127 [00:00<?, ?it/s]

In [76]:
df_int = pd.DataFrame.from_dict(diz_int, 'index').rename({'gene_id1':'gene1', 'gene_id2':'gene2'}, axis = 1)

In [77]:
assert len(int_or.couples.unique()) == len(df_int.couples.unique())

In [78]:
print(f'#interazioni prima {int_or.shape[0]}, #interazioni dopo: {df_int.shape[0]}')

#interazioni prima 89971, #interazioni dopo: 88106


In [79]:
df_int.to_csv(os.path.join(processed_files_dir, 'full_paris_info_interactions.csv'), index = False)

In [80]:
df_int = df_int[['couples', 'gene1', 'gene2', 
                 'interacting', 'length_1', 'length_2',
                 'protein_coding_1', 'protein_coding_2',
                 'x1', 'y1', 'w', 'h']]

In [84]:
df_int[df_int.couples == 'ENSG00000005022_ENSG00000234421']

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42028,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,831,752,21,20
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23


In [83]:
df_int[df_int.couples.duplicated()].sort_values('couples')

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23
3028,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,875,1101,35,39
3029,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,962,206,53,21
16378,ENSG00000006831_ENSG00000229807,ENSG00000006831,ENSG00000229807,True,3971,19245,True,False,3105,6378,20,33
38457,ENSG00000008128_ENSG00000248333,ENSG00000008128,ENSG00000248333,True,2984,2998,True,True,2546,2528,41,32
...,...,...,...,...,...,...,...,...,...,...,...,...
57380,ENSMUSG00000098178_ENSMUSG00000107822,ENSMUSG00000098178,ENSMUSG00000107822,True,1831,1064,False,True,811,311,26,18
60453,ENSMUSG00000098178_ENSMUSG00000109324,ENSMUSG00000098178,ENSMUSG00000109324,True,1831,2147,False,True,513,813,32,33
65742,ENSMUSG00000098178_ENSMUSG00000109508,ENSMUSG00000098178,ENSMUSG00000109508,True,1831,2887,False,False,1332,1703,39,19
60644,ENSMUSG00000098178_ENSMUSG00000115420,ENSMUSG00000098178,ENSMUSG00000115420,True,1831,281,False,False,915,176,21,20


In [88]:
df_neg['interacting'] = False
df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene1', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_1','protein_coding':'protein_coding_1'} , axis = 1)
df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene2', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_2','protein_coding':'protein_coding_2'} , axis = 1)

Unnamed: 0,couples,gene1,gene2,interacting,length_2,protein_coding_2
0,ENSG00000000003_ENSG00000133703,ENSG00000000003,ENSG00000133703,False,5430,True
1,ENSG00000065427_ENSG00000133703,ENSG00000065427,ENSG00000133703,False,5430,True
2,ENSG00000073614_ENSG00000133703,ENSG00000073614,ENSG00000133703,False,5430,True
3,ENSG00000128872_ENSG00000133703,ENSG00000128872,ENSG00000133703,False,5430,True
4,ENSG00000009694_ENSG00000133703,ENSG00000009694,ENSG00000133703,False,5430,True
...,...,...,...,...,...,...
160506,ENSMUSG00000110735_ENSMUSG00000114342,ENSMUSG00000110735,ENSMUSG00000114342,False,3335,False
160507,ENSMUSG00000024500_ENSMUSG00000114342,ENSMUSG00000024500,ENSMUSG00000114342,False,3335,False
160508,ENSMUSG00000111448_ENSMUSG00000117490,ENSMUSG00000111448,ENSMUSG00000117490,False,2128,False
160509,ENSMUSG00000004113_ENSMUSG00000117490,ENSMUSG00000004113,ENSMUSG00000117490,False,2128,False


In [46]:
#TODO:
# Metti a df_neg le stesse colonne di df_int. Per le regioni di interazione potrei prendere delle interazioni finte dai positivi.

In [91]:
df_pairs

Unnamed: 0,positive,negative
0,ENSG00000000003_ENSG00000001630,"[ENSG00000000003_ENSG00000107758, ENSG00000001..."
1,ENSG00000000003_ENSG00000011275,"[ENSG00000000003_ENSG00000115705, ENSG00000011..."
2,ENSG00000000003_ENSG00000048140,"[ENSG00000048140_ENSG00000122042, ENSG00000000..."
3,ENSG00000000003_ENSG00000135441,"[ENSG00000122042_ENSG00000135441, ENSG00000000..."
4,ENSG00000000003_ENSG00000138246,"[ENSG00000122042_ENSG00000138246, ENSG00000000..."
...,...,...
80489,ENSMUSG00000109336_ENSMUSG00000110909,"[ENSMUSG00000037946_ENSMUSG00000110909, ENSMUS..."
80490,ENSMUSG00000109838_ENSMUSG00000116737,"[ENSMUSG00000105135_ENSMUSG00000116737, ENSMUS..."
80491,ENSMUSG00000112035_ENSMUSG00000112761,"[ENSMUSG00000091078_ENSMUSG00000112035, ENSMUS..."
80492,ENSMUSG00000114342_ENSMUSG00000114936,"[ENSMUSG00000102464_ENSMUSG00000114936, ENSMUS..."


In [96]:
df_int

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
0,ENSG00000145391_ENSG00000203801,ENSG00000145391,ENSG00000203801,True,7365,1785,True,False,2552,1175,25,19
1,ENSG00000143195_ENSG00000145391,ENSG00000143195,ENSG00000145391,True,13360,7365,True,True,7785,2119,22,22
2,ENSG00000039139_ENSG00000143195,ENSG00000039139,ENSG00000143195,True,15633,13360,True,True,2933,11748,18,39
3,ENSG00000123728_ENSG00000143195,ENSG00000123728,ENSG00000143195,True,3972,13360,True,True,3388,9487,19,18
4,ENSG00000143195_ENSG00000238013,ENSG00000143195,ENSG00000238013,True,13360,268,True,False,12157,154,19,19
...,...,...,...,...,...,...,...,...,...,...,...,...
88101,ENSG00000103126_ENSG00000253197,ENSG00000103126,ENSG00000253197,True,3707,602,True,False,1995,344,24,23
88102,ENSG00000136099_ENSG00000288586,ENSG00000136099,ENSG00000288586,True,5088,1703,True,False,932,124,16,14
88103,ENSG00000221968_ENSG00000259060,ENSG00000221968,ENSG00000259060,True,1790,578,True,True,56,21,11,12
88104,ENSG00000147119_ENSG00000254369,ENSG00000147119,ENSG00000254369,True,2396,3992,True,False,1173,2673,12,6
