# Code

In [1]:
import pandas as pd
import os
import time
import numpy as np
import sys
from tqdm.notebook import tqdm
sys.path.insert(0, '..')
import dataset.preprocessing as utils

In [2]:
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
original_files_dir = os.path.join(ROOT_DIR, 'dataset', 'original_files')
processed_files_dir = os.path.join(ROOT_DIR, 'dataset', 'processed_files')
rna_rna_pairs_data_dir = os.path.join(ROOT_DIR, 'dataset', 'rna_rna_pairs')

### Pre-processing

In [3]:
df_hub = utils.read_dataframe(os.path.join(original_files_dir, 'hub.table.paris.txt'), columns_to_drop = ['Unnamed: 0','gene_name']).rename({'cell_line': 'cell_line_set',
                                                                                                                                                    'degree':'n_interactors',
                                                                                                                                                    'gene_type': 'gene_type_set',
                                                                                                                                                    'species': 'species_set'}, axis = 1)
tx = utils.read_dataframe(os.path.join(original_files_dir,'tx_regions.ens99.txt'), columns_to_drop = ['Unnamed: 0','ensembl_transcript_id']).rename({'ensembl_gene_id': 'gene_id'}, axis = 1)
cc = pd.read_csv(os.path.join(original_files_dir,'controls_controlled.hub.txt'), sep = '\t')
int_or = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['Unnamed: 0', 'gene_name1', 'gene_name2', 'score', 'tx_id_1', 'tx_id_2', 'rise_id', 'type_interaction', 'tx_id_1_localization', 'tx_id_2_localization'])

df_genes = df_hub.merge(tx, on = 'gene_id')
assert df_genes.shape[0] == df_hub.shape[0]

In [4]:
int_or_all = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['rise_id','Unnamed: 0', 'type_interaction'])

In [5]:
int_or_all.drop_duplicates().shape[0]

90086

In [6]:
int_or.shape[0]

92173

In [7]:
int_or = int_or.drop_duplicates().reset_index(drop = True)
int_or.shape[0]

89971

In [8]:
df_pairs_full = utils.obtain_df_pos_controls(cc)

In [9]:
print(f'We have {len(set(df_pairs_full.positive))} pairs interacting (they can have multiple interactions) \n')
print(f'We have {len(set(df_pairs_full.negative))} pairs not interacting \n')

We have 80494 pairs interacting (they can have multiple interactions) 

We have 160511 pairs not interacting 



In [10]:
df_pairs_full.to_csv(os.path.join(processed_files_dir, 'df_pairs_full.csv'), index = False)

In [11]:
# LEVA I COMMENTI

# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) > 0
# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) > 0

# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) == 0
# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) == 0

In [12]:
df_neg = df_pairs_full[['negative']].drop_duplicates().reset_index(drop = True)
df_neg[['gene1', 'gene2']] = df_neg['negative'].str.split('_', expand = True)
df_neg = df_neg.rename({'negative':'couples'}, axis = 1)

df_pairs = df_pairs_full.groupby('positive').agg({'negative': list}).reset_index()

In [13]:
int_or[['couples', 'need_to_swap']] = int_or[['gene_id1', 'gene_id2']].apply(utils.create_pairs, axis = 1)

In [14]:
int_or = utils.swap_genes_if_needed(int_or)

In [15]:
assert (int_or[['gene_id1', 'gene_id2']].apply(utils.create_pairs, axis = 1)[1] == False).all() #check if swapping works

In [16]:
int_or.drop_duplicates().shape[0]

88999

In [17]:
int_or = utils.create_features(int_or)

In [18]:
int_or.drop_duplicates().shape[0]

88999

In [21]:
assert int_or.groupby('gene_id1').std(numeric_only = True).protein_coding_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).protein_coding_2.max() == 0

assert int_or.groupby('gene_id1').std(numeric_only = True).length_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).length_2.max() == 0

idx = np.random.randint(int_or.shape[0])
assert int_or.loc[idx].length_1 == len(int_or.loc[idx].cdna_1)
assert int_or.loc[idx].length_2 == len(int_or.loc[idx].cdna_2)

### Gene info df

In [22]:
gene_info1 = int_or[['gene_id1', 'length_1', 'cdna_1', 'protein_coding_1']]
gene_info1.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info2 = int_or[['gene_id2', 'length_2', 'cdna_2', 'protein_coding_2']]
gene_info2.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info = pd.concat([gene_info1, gene_info2], axis = 0, ignore_index = True).drop_duplicates()
#assert set(gene_info.gene_id) == set(df_genes.gene_id)
df_genes = df_genes.merge(gene_info)
df_genes.to_csv(os.path.join(processed_files_dir, 'df_genes.csv'), index = False)

In [23]:
#clean int_or
int_or = int_or.drop(['cdna_1', 'cdna_2'], axis = 1)

### Clean bounding boxes of df interactions

In [22]:
df_boxes = int_or.filter(['start_map1', 'end_map1', 'start_map2', 'end_map2','area_of_the_interaction'], axis = 1).apply(utils.create_boxes_xywh, axis = 1).rename({0: 'x1', 1: 'y1', 2:'w', 3:'h'}, axis = 1)
int_or = pd.concat([int_or, df_boxes], axis = 1).drop(['start_map1', 'end_map1', 'start_map2', 'end_map2'], axis = 1)

In [23]:
#approx 13 min
diz_int = {}
idx = 0
for couple in tqdm(int_or.couples.unique()):
    subset = int_or[int_or.couples == couple]
    list_of_boxes = subset.filter(['x1', 'y1', 'w', 'h']).values.tolist()
    new_list_of_boxes = utils.clean_bounding_boxes(list_of_boxes)
    row = int_or[int_or.couples == couple].iloc[0]
    for box in new_list_of_boxes:
        d = dict(row)
        d['x1'] = box[0]
        d['y1'] = box[1] 
        d['w'] = box[2]
        d['h'] = box[3]
        diz_int[idx] = d
        idx+=1

  0%|          | 0/85127 [00:00<?, ?it/s]

In [24]:
df_int = pd.DataFrame.from_dict(diz_int, 'index').rename({'gene_id1':'gene1', 'gene_id2':'gene2'}, axis = 1)

In [25]:
assert len(int_or.couples.unique()) == len(df_int.couples.unique())

In [26]:
print(f'#interazioni prima {int_or.shape[0]}, #interazioni dopo: {df_int.shape[0]}')

#interazioni prima 89971, #interazioni dopo: 88106


In [27]:
df_int.to_csv(os.path.join(processed_files_dir, 'full_paris_info_interactions.csv'), index = False)

In [25]:
df_int = df_int[['couples', 'gene1', 'gene2', 
                 'interacting', 'length_1', 'length_2',
                 'protein_coding_1', 'protein_coding_2',
                 'x1', 'y1', 'w', 'h']]

In [26]:
df_int[df_int.couples == 'ENSG00000005022_ENSG00000234421']

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42028,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,831,752,21,20
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23


In [27]:
df_int[df_int.couples.duplicated()].sort_values('couples')

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23
3028,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,856,461,54,39
3029,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,875,1101,35,39
16378,ENSG00000006831_ENSG00000229807,ENSG00000006831,ENSG00000229807,True,3971,19245,True,False,3105,6378,20,33
38457,ENSG00000008128_ENSG00000248333,ENSG00000008128,ENSG00000248333,True,2984,2998,True,True,2546,2528,41,32
...,...,...,...,...,...,...,...,...,...,...,...,...
57380,ENSMUSG00000098178_ENSMUSG00000107822,ENSMUSG00000098178,ENSMUSG00000107822,True,1831,1064,False,True,868,306,24,22
60453,ENSMUSG00000098178_ENSMUSG00000109324,ENSMUSG00000098178,ENSMUSG00000109324,True,1831,2147,False,True,650,1735,34,22
65742,ENSMUSG00000098178_ENSMUSG00000109508,ENSMUSG00000098178,ENSMUSG00000109508,True,1831,2887,False,False,1243,1702,29,19
60644,ENSMUSG00000098178_ENSMUSG00000115420,ENSMUSG00000098178,ENSMUSG00000115420,True,1831,281,False,False,915,176,21,20


In [28]:
df_neg['interacting'] = False
df_neg = df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene1', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_1','protein_coding':'protein_coding_1'} , axis = 1)
df_neg = df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene2', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_2','protein_coding':'protein_coding_2'} , axis = 1)

In [29]:
assert set(df_pairs_full.negative) ==  set(df_neg.couples)
assert df_pairs_full.shape[0] ==  df_neg.shape[0] # ci sono dei negativi che sono gli stessi per piu 

AssertionError: 

In [30]:
df_int1 = df_int[['gene1', 'x1', 'w']].rename({'gene1':'gene', 'x1':'c1',  'w': 'l'}, axis = 1)
df_int2 = df_int[['gene2', 'y1', 'h']].rename({'gene2':'gene', 'y1':'c1',  'h': 'l'}, axis = 1)
df_coord = pd.concat([df_int1, df_int2], ignore_index = True)#.drop_duplicates().reset_index(drop = True)
#df_coord may have duplicates. but this is something I want. If a gene appears more than once, I want it to be sampled according to its distribution.

In [31]:
assert set(df_neg.gene1).union(set(df_neg.gene2)) - set(df_coord.gene) == set()

In [32]:
# RICONTROLLALA, POI STIMA IL TEMPO

def create_fake_coord_neg(x, df_coord, df_pairs_full, df_int):
    g1 = x.gene1
    g2 = x.gene2
    s1 = df_coord[df_coord.gene == g1].sample(1).iloc[0]
    s2 = df_coord[df_coord.gene == g2].sample(1).iloc[0]
    
    pos = df_pairs_full[df_pairs_full.negative == x.couples].sample(1).iloc[0] #dovrebbe essere 1 ma non sono sicuro (e possibile piu di una? dovrei ragionarci su con l assert di prima), nel dubbio campiono.
    p1, p2 = pos.positive.split('_')
    
    interaction_coords = df_int[df_int.couples == pos.positive].sample(1).iloc[0] #puo essere piu di una, se ho piu di una regione di interzione
    
    if g1 == p1:
        x1, w = interaction_coords.x1, interaction_coords.w
        y1, h = s2.c1, s2.l
        
    elif g1 == p2:
        x1, w = interaction_coords.y1, interaction_coords.h
        y1, h = s2.c1, s2.l
        
    elif g2 == p1:
        x1, w = s1.c1, s1.l
        y1, h = interaction_coords.x1, interaction_coords.w
        
    elif g2 == p2:
        x1, w = s1.c1, s1.l
        y1, h = interaction_coords.y1, interaction_coords.h
        
    else:
        raise NotImplementedError
    
    return x1, y1, w, h

In [33]:
#69 min
start_time = time.time()
new_cols = df_neg[['couples', 'gene1', 'gene2']].apply(create_fake_coord_neg, axis = 1, args = (df_coord,df_pairs_full,df_int,))
print(f"Total time: {(time.time()-start_time)/60} minutes")

Total time: 68.04835674762725 minutes


In [34]:
new_cols = new_cols.apply(pd.Series).rename({0:'x1', 1:'y1', 2:'w', 3:'h'}, axis = 1)

In [35]:
df_neg = pd.concat([df_neg, new_cols], axis = 1)

In [36]:
df_neg

Unnamed: 0,couples,gene1,gene2,interacting,length_1,protein_coding_1,length_2,protein_coding_2,x1,y1,w,h
0,ENSG00000000003_ENSG00000183779,ENSG00000000003,ENSG00000183779,False,3796,True,3316,True,882,2952,29,38
1,ENSG00000041353_ENSG00000183779,ENSG00000041353,ENSG00000183779,False,7003,True,3316,True,33,793,36,29
2,ENSG00000065243_ENSG00000183779,ENSG00000065243,ENSG00000183779,False,6070,True,3316,True,2852,2949,18,31
3,ENSG00000084710_ENSG00000183779,ENSG00000084710,ENSG00000183779,False,7486,True,3316,True,4741,493,71,26
4,ENSG00000105287_ENSG00000183779,ENSG00000105287,ENSG00000183779,False,3321,True,3316,True,3177,2746,23,17
...,...,...,...,...,...,...,...,...,...,...,...,...
160506,ENSMUSG00000063626_ENSMUSG00000116557,ENSMUSG00000063626,ENSMUSG00000116557,False,9220,True,3257,False,4655,1066,18,21
160507,ENSMUSG00000107577_ENSMUSG00000116557,ENSMUSG00000107577,ENSMUSG00000116557,False,3662,False,3257,False,114,1066,13,21
160508,ENSMUSG00000107924_ENSMUSG00000108322,ENSMUSG00000107924,ENSMUSG00000108322,False,656,False,1332,False,232,1279,19,31
160509,ENSMUSG00000109413_ENSMUSG00000110285,ENSMUSG00000109413,ENSMUSG00000110285,False,1631,False,1250,False,418,741,24,15


In [41]:
#check if it worked
assert (df_neg.x1 <= df_neg.length_1).all()
assert ((df_neg.x1 + df_neg.w) <= df_neg.length_1).all()
assert (df_neg.y1 <= df_neg.length_2).all()
assert ((df_neg.y1 + df_neg.h) <= df_neg.length_2).all()

In [42]:
# 37 min
#check if it worked
start_time = time.time()
for _, row in df_neg.iterrows():
    g1 = row.gene1
    g2 = row.gene2
    assert [row.x1, row.w] in df_coord[df_coord.gene == g1][['c1', 'l']].values
    assert [row.y1, row.h] in df_coord[df_coord.gene == g2][['c1', 'l']].values
    if np.random.rand() < 0.0003: #progress
        print(f"{np.round(_/df_neg.shape[0] * 100, 2)}% in {(time.time()-start_time)/60} minutes")
print(f"Total time: {(time.time()-start_time)/60} minutes")

5.61% in 2.1523165424664814 minutes
9.53% in 3.6562916239102683 minutes
10.27% in 3.938682496547699 minutes
11.06% in 4.239685928821563 minutes
11.63% in 4.461082176367442 minutes
11.82% in 4.531164320309957 minutes
12.42% in 4.7625657280286156 minutes
13.5% in 5.176020340124766 minutes
14.09% in 5.400156982739767 minutes
17.5% in 6.706833561261495 minutes
19.37% in 7.421156402428945 minutes
19.44% in 7.446127025286357 minutes
23.37% in 8.95110768477122 minutes
23.38% in 8.955255691210429 minutes
24.49% in 9.379004844029744 minutes
29.34% in 11.233985873063405 minutes
36.14% in 13.838606373469036 minutes
43.92% in 16.81263703505198 minutes
46.63% in 17.852186103661854 minutes
46.93% in 17.96507182518641 minutes
50.66% in 19.39568118651708 minutes
62.3% in 23.84744616349538 minutes
63.25% in 24.212267577648163 minutes
66.15% in 25.32101083199183 minutes
71.67% in 27.36985952059428 minutes
71.97% in 27.482889684041343 minutes
72.33% in 27.615792389710744 minutes
72.41% in 27.647251530488

In [43]:
df = pd.concat([df_int, df_neg], ignore_index = True, axis = 0)
df.to_csv(os.path.join(processed_files_dir, 'final_df.csv'), index = False)