# Code

In [1]:
import pandas as pd
import os
import time
import numpy as np
import sys
from tqdm.notebook import tqdm
sys.path.insert(0, '..')
import dataset.preprocessing as utils

In [2]:
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
original_files_dir = os.path.join(ROOT_DIR, 'dataset', 'original_files')
processed_files_dir = os.path.join(ROOT_DIR, 'dataset', 'processed_files')
rna_rna_pairs_data_dir = os.path.join(ROOT_DIR, 'dataset', 'rna_rna_pairs')

### Pre-processing

In [3]:
df_hub = utils.read_dataframe(os.path.join(original_files_dir, 'hub.table.paris.txt'), columns_to_drop = ['Unnamed: 0','gene_name']).rename({'cell_line': 'cell_line_set',
                                                                                                                                                    'degree':'n_interactors',
                                                                                                                                                    'gene_type': 'gene_type_set',
                                                                                                                                                    'species': 'species_set'}, axis = 1)
tx = utils.read_dataframe(os.path.join(original_files_dir,'tx_regions.ens99.txt'), columns_to_drop = ['Unnamed: 0','ensembl_transcript_id']).rename({'ensembl_gene_id': 'gene_id'}, axis = 1)
cc = pd.read_csv(os.path.join(original_files_dir,'controls_controlled.hub.txt'), sep = '\t')
int_or = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['Unnamed: 0', 'gene_name1', 'gene_name2', 'score', 'tx_id_1', 'tx_id_2', 'rise_id', 'type_interaction', 'tx_id_1_localization', 'tx_id_2_localization'])

df_genes = df_hub.merge(tx, on = 'gene_id')
assert df_genes.shape[0] == df_hub.shape[0]

In [4]:
int_or_all = utils.read_dataframe(os.path.join(original_files_dir, 'rise_paris_tr.new.mapped_interactions.tx_regions.txt'), columns_to_drop = ['rise_id','Unnamed: 0', 'type_interaction'])

In [5]:
int_or_all.drop_duplicates().shape[0]

90086

In [6]:
int_or.shape[0]

92173

In [7]:
int_or = int_or.drop_duplicates().reset_index(drop = True)
int_or.shape[0]

89971

In [8]:
df_pairs_full = utils.obtain_df_pos_controls(cc)

In [9]:
print(f'We have {len(set(df_pairs_full.positive))} pairs interacting (they can have multiple interactions) \n')
print(f'We have {len(set(df_pairs_full.negative))} pairs not interacting \n')

We have 80494 pairs interacting (they can have multiple interactions) 

We have 160511 pairs not interacting 



In [10]:
df_pairs_full.to_csv(os.path.join(processed_files_dir, 'df_pairs_full.csv'), index = False)

In [11]:
# LEVA I COMMENTI

# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) > 0
# assert len(set(df_pairs.positive).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) > 0

# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id1 + '_' + int_or.gene_id2))) == 0
# assert len(set(df_pairs.negative).intersection(set(int_or.gene_id2 + '_' + int_or.gene_id1))) == 0

In [12]:
df_neg = df_pairs_full[['negative']].drop_duplicates().reset_index(drop = True)
df_neg[['gene1', 'gene2']] = df_neg['negative'].str.split('_', expand = True)
df_neg = df_neg.rename({'negative':'couples'}, axis = 1)

df_pairs = df_pairs_full.groupby('positive').agg({'negative': list}).reset_index()

In [13]:
int_or[['couples', 'need_to_swap']] = int_or[['gene_id1', 'gene_id2']].apply(utils.create_pairs, axis = 1)

In [14]:
int_or = utils.swap_genes_if_needed(int_or)

In [15]:
assert (int_or[['gene_id1', 'gene_id2']].apply(utils.create_pairs, axis = 1)[1] == False).all() #check if swapping works

In [16]:
int_or.drop_duplicates().shape[0]

88999

In [17]:
int_or = utils.create_features(int_or)

In [18]:
int_or.drop_duplicates().shape[0]

88999

In [19]:
assert int_or.groupby('gene_id1').std(numeric_only = True).protein_coding_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).protein_coding_2.max() == 0

assert int_or.groupby('gene_id1').std(numeric_only = True).length_1.max() == 0
assert int_or.groupby('gene_id2').std(numeric_only = True).length_2.max() == 0

idx = np.random.randint(int_or.shape[0])
assert int_or.loc[idx].length_1 == len(int_or.loc[idx].cdna_1)
assert int_or.loc[idx].length_2 == len(int_or.loc[idx].cdna_2)

### Gene info df

In [22]:
gene_info1 = int_or[['gene_id1', 'length_1', 'cdna_1', 'protein_coding_1']]
gene_info1.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info2 = int_or[['gene_id2', 'length_2', 'cdna_2', 'protein_coding_2']]
gene_info2.columns = ['gene_id', 'length', 'cdna', 'protein_coding']
gene_info = pd.concat([gene_info1, gene_info2], axis = 0, ignore_index = True).drop_duplicates()
#assert set(gene_info.gene_id) == set(df_genes.gene_id)
df_genes = df_genes.merge(gene_info)
df_genes.to_csv(os.path.join(processed_files_dir, 'df_genes.csv'), index = False)

In [23]:
#clean int_or
int_or = int_or.drop(['cdna_1', 'cdna_2'], axis = 1)

### Clean bounding boxes of df interactions

In [24]:
df_boxes = int_or.filter(['start_map1', 'end_map1', 'start_map2', 'end_map2','area_of_the_interaction'], axis = 1).apply(utils.create_boxes_xywh, axis = 1).rename({0: 'x1', 1: 'y1', 2:'w', 3:'h'}, axis = 1)
int_or = pd.concat([int_or, df_boxes], axis = 1).drop(['start_map1', 'end_map1', 'start_map2', 'end_map2'], axis = 1)

In [25]:
#approx 13 min
diz_int = {}
idx = 0
for couple in tqdm(int_or.couples.unique()):
    subset = int_or[int_or.couples == couple]
    list_of_boxes = subset.filter(['x1', 'y1', 'w', 'h']).values.tolist()
    new_list_of_boxes = utils.clean_bounding_boxes(list_of_boxes)
    row = int_or[int_or.couples == couple].iloc[0]
    for box in new_list_of_boxes:
        d = dict(row)
        d['x1'] = box[0]
        d['y1'] = box[1] 
        d['w'] = box[2]
        d['h'] = box[3]
        diz_int[idx] = d
        idx+=1

  0%|          | 0/85127 [00:00<?, ?it/s]

In [26]:
df_int = pd.DataFrame.from_dict(diz_int, 'index').rename({'gene_id1':'gene1', 'gene_id2':'gene2'}, axis = 1)

In [27]:
assert len(int_or.couples.unique()) == len(df_int.couples.unique())

In [28]:
print(f'#interazioni prima {int_or.shape[0]}, #interazioni dopo: {df_int.shape[0]}')

#interazioni prima 89971, #interazioni dopo: 88106


In [29]:
df_int.to_csv(os.path.join(processed_files_dir, 'full_paris_info_interactions.csv'), index = False)

In [30]:
df_int = df_int[['couples', 'gene1', 'gene2', 
                 'interacting', 'length_1', 'length_2',
                 'protein_coding_1', 'protein_coding_2',
                 'x1', 'y1', 'w', 'h']]

In [31]:
df_int[df_int.couples == 'ENSG00000005022_ENSG00000234421']

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42028,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,831,752,21,20
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23


In [32]:
df_int[df_int.couples.duplicated()].sort_values('couples')

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
42029,ENSG00000005022_ENSG00000234421,ENSG00000005022,ENSG00000234421,True,1307,869,True,False,265,228,23,23
3028,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,875,1101,35,39
3029,ENSG00000006327_ENSG00000198886,ENSG00000006327,ENSG00000198886,True,1133,1378,True,True,856,461,54,39
16378,ENSG00000006831_ENSG00000229807,ENSG00000006831,ENSG00000229807,True,3971,19245,True,False,589,17178,23,22
38457,ENSG00000008128_ENSG00000248333,ENSG00000008128,ENSG00000248333,True,2984,2998,True,True,2546,2528,41,32
...,...,...,...,...,...,...,...,...,...,...,...,...
57380,ENSMUSG00000098178_ENSMUSG00000107822,ENSMUSG00000098178,ENSMUSG00000107822,True,1831,1064,False,True,868,306,24,22
60453,ENSMUSG00000098178_ENSMUSG00000109324,ENSMUSG00000098178,ENSMUSG00000109324,True,1831,2147,False,True,650,1735,34,22
65742,ENSMUSG00000098178_ENSMUSG00000109508,ENSMUSG00000098178,ENSMUSG00000109508,True,1831,2887,False,False,1332,1703,39,19
60644,ENSMUSG00000098178_ENSMUSG00000115420,ENSMUSG00000098178,ENSMUSG00000115420,True,1831,281,False,False,915,176,21,20


In [33]:
df_neg['interacting'] = False
df_neg = df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene1', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_1','protein_coding':'protein_coding_1'} , axis = 1)
df_neg = df_neg.merge(df_genes[['gene_id', 'length', 'protein_coding']], left_on = 'gene2', right_on = 'gene_id').drop('gene_id', axis = 1).rename({'length': 'length_2','protein_coding':'protein_coding_2'} , axis = 1)

In [34]:
#TODO:
# Metti a df_neg le stesse colonne di df_int. Per le regioni di interazione potrei prendere delle interazioni finte dai positivi.

In [38]:
df_pairs_full.negative

0               ENSG00000000003_ENSG00000164363
1               ENSG00000000003_ENSG00000151148
2               ENSG00000000003_ENSG00000114805
3               ENSG00000000003_ENSG00000133703
4               ENSG00000000003_ENSG00000272716
                          ...                  
160546    ENSMUSG00000021302_ENSMUSG00000118552
160547    ENSMUSG00000015090_ENSMUSG00000118559
160548    ENSMUSG00000064373_ENSMUSG00000118559
160549    ENSMUSG00000027630_ENSMUSG00000118559
160550    ENSMUSG00000033209_ENSMUSG00000118604
Name: negative, Length: 160551, dtype: object

In [47]:
df_pairs_full.drop_duplicates()

Unnamed: 0,positive,negative
0,ENSG00000122042_ENSG00000164363,ENSG00000000003_ENSG00000164363
1,ENSG00000122042_ENSG00000151148,ENSG00000000003_ENSG00000151148
2,ENSG00000114805_ENSG00000122042,ENSG00000000003_ENSG00000114805
3,ENSG00000122042_ENSG00000133703,ENSG00000000003_ENSG00000133703
4,ENSG00000122042_ENSG00000272716,ENSG00000000003_ENSG00000272716
...,...,...
160546,ENSMUSG00000021302_ENSMUSG00000089665,ENSMUSG00000021302_ENSMUSG00000118552
160547,ENSMUSG00000015090_ENSMUSG00000035575,ENSMUSG00000015090_ENSMUSG00000118559
160548,ENSMUSG00000035575_ENSMUSG00000064373,ENSMUSG00000064373_ENSMUSG00000118559
160549,ENSMUSG00000027630_ENSMUSG00000035575,ENSMUSG00000027630_ENSMUSG00000118559


In [40]:
df_pairs_full.shape[0]

160551

In [45]:
df_neg.shape[0]

160511

In [42]:
assert set(df_pairs_full.negative) ==  set(df_neg.couples)
assert df_pairs_full.shape[0] ==  df_neg.shape[0] # ci sono dei negativi che sono gli stessi per piu 

AssertionError: 

In [52]:
df_int1 = df_int[['gene1', 'x1', 'w']].rename({'gene1':'gene', 'x1':'c1',  'w': 'l'}, axis = 1)
df_int2 = df_int[['gene2', 'y1', 'h']].rename({'gene2':'gene', 'y1':'c1',  'h': 'l'}, axis = 1)
df_coord = pd.concat([df_int1, df_int2], ignore_index = True)#.drop_duplicates().reset_index(drop = True)
#df_coord may have duplicates. but this is something I want. If a gene appears more than once, I want it to be sampled according to its distribution.

In [None]:
assert set(df_neg.gene1).union(set(df_neg.gene2)) - set(df_coord.gene) == set()

In [74]:
df_int

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
0,ENSG00000145391_ENSG00000203801,ENSG00000145391,ENSG00000203801,True,7365,1785,True,False,2552,1175,25,19
1,ENSG00000143195_ENSG00000145391,ENSG00000143195,ENSG00000145391,True,13360,7365,True,True,7785,2119,22,22
2,ENSG00000039139_ENSG00000143195,ENSG00000039139,ENSG00000143195,True,15633,13360,True,True,2933,11748,18,39
3,ENSG00000123728_ENSG00000143195,ENSG00000123728,ENSG00000143195,True,3972,13360,True,True,3388,9487,19,18
4,ENSG00000143195_ENSG00000238013,ENSG00000143195,ENSG00000238013,True,13360,268,True,False,12157,154,19,19
...,...,...,...,...,...,...,...,...,...,...,...,...
88101,ENSG00000103126_ENSG00000253197,ENSG00000103126,ENSG00000253197,True,3707,602,True,False,1995,344,24,23
88102,ENSG00000136099_ENSG00000288586,ENSG00000136099,ENSG00000288586,True,5088,1703,True,False,932,124,16,14
88103,ENSG00000221968_ENSG00000259060,ENSG00000221968,ENSG00000259060,True,1790,578,True,True,56,21,11,12
88104,ENSG00000147119_ENSG00000254369,ENSG00000147119,ENSG00000254369,True,2396,3992,True,False,1173,2673,12,6


In [64]:
# RICONTROLLALA, POI STIMA IL TEMPO

def create_fake_coord_neg(x, df_coord, df_pairs_full, df_int):
    g1 = x.gene1
    g2 = x.gene2
    s1 = df_coord[df_coord.gene == g1].sample(1)
    s2 = df_coord[df_coord.gene == g2].sample(1)
    
    pos = df_pairs_full[df_pairs_full.negative == x.couples].sample(1) #dovrebbe essere 1 ma non sono sicuro (e possibile piu di una? dovrei ragionarci su con l assert di prima), nel dubbio campiono.
    p1, p2 = pos.positive.split('_')
    
    interaction_coords = df_int[df_int.couples == pos.positive].sample(1) #puo essere piu di una, se ho piu di una regione di interzione
    
    if g1 == p1:
        x1, w = interaction_coords.x1, interaction_coords.w
        y1, h = s2.c1, s2.l
        
    elif g1 == p2:
        x1, w = interaction_coords.y1, interaction_coords.h
        y1, h = s1.c1, s1.l
        
    elif g2 == p1:
        x1, w = s2.c1, s2.l
        y1, h = interaction_coords.x1, interaction_coords.w
        
    elif g2 == p2:
        x1, w = s1.c1, s1.l
        y1, h = interaction_coords.y1, interaction_coords.h
        
    else:
        raise NotImplementedError
    
    return s1.c1, s1.l, s2.c1, s2.l

In [65]:
new_cols = df_neg[['couples', 'gene1', 'gene2']].apply(create_fake_coord_neg, axis = 1, args = (df_coord,df_pairs_full,df_int,))

KeyboardInterrupt: 

In [None]:
new_cols = new_cols.rename({0:'x1', 1:'y1', 2:'w', 3:'h'}, axis = 1)

In [96]:
df_neg = pd.concat([df_neg, new_cols], axis = 1)

Unnamed: 0,couples,gene1,gene2,interacting,length_1,length_2,protein_coding_1,protein_coding_2,x1,y1,w,h
0,ENSG00000145391_ENSG00000203801,ENSG00000145391,ENSG00000203801,True,7365,1785,True,False,2552,1175,25,19
1,ENSG00000143195_ENSG00000145391,ENSG00000143195,ENSG00000145391,True,13360,7365,True,True,7785,2119,22,22
2,ENSG00000039139_ENSG00000143195,ENSG00000039139,ENSG00000143195,True,15633,13360,True,True,2933,11748,18,39
3,ENSG00000123728_ENSG00000143195,ENSG00000123728,ENSG00000143195,True,3972,13360,True,True,3388,9487,19,18
4,ENSG00000143195_ENSG00000238013,ENSG00000143195,ENSG00000238013,True,13360,268,True,False,12157,154,19,19
...,...,...,...,...,...,...,...,...,...,...,...,...
88101,ENSG00000103126_ENSG00000253197,ENSG00000103126,ENSG00000253197,True,3707,602,True,False,1995,344,24,23
88102,ENSG00000136099_ENSG00000288586,ENSG00000136099,ENSG00000288586,True,5088,1703,True,False,932,124,16,14
88103,ENSG00000221968_ENSG00000259060,ENSG00000221968,ENSG00000259060,True,1790,578,True,True,56,21,11,12
88104,ENSG00000147119_ENSG00000254369,ENSG00000147119,ENSG00000254369,True,2396,3992,True,False,1173,2673,12,6


In [None]:
df = pd.concat([df_int, df_neg], ignore_index = True, axis = 0)
df.to_csv(os.path.join(processed_files_dir, 'final_df.csv'), index = False)