In [1]:
# Flag recording that this code is being run as a Jupyter notebook.
# NOTE(review): it is not read by any visible cell — presumably consumed by
# project code elsewhere; confirm before removing.
its_jupyter_notebook = True

In [2]:
import pandas as pd
import os
import time
import numpy as np
import seaborn as sns
import pickle
import torch
from pathlib import Path
from tqdm.notebook import tqdm
import argparse
import math
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, auc
from scipy.stats import chi2_contingency, fisher_exact

sys.path.insert(0, '..')

import dataset.preprocessing as utils
from config import *

In [6]:
# Columns shared by every interaction table: the two genes plus the start
# offsets (x1, y1) and extents (w, h) of the interacting regions.
columns_to_keep = ['gene1', 'gene2', 'x1', 'y1', 'w', 'h']

# Each table is reduced to the shared columns and tagged with its source dataset.
df_paris = (
    pd.read_csv(os.path.join(processed_files_dir, 'paris.csv'))
    .filter(columns_to_keep, axis = 1)
    .assign(dataset = 'paris')
)
df_ricseq = (
    pd.read_csv(os.path.join(processed_files_dir, 'ricseq.csv'))
    .filter(columns_to_keep, axis = 1)
    .assign(dataset = 'ricseq')
)
df_mario = (
    pd.read_csv(os.path.join(processed_files_dir, 'mario.csv'))
    .filter(columns_to_keep, axis = 1)
    .assign(dataset = 'mario')
)

# Gene table (id, sequence, length) that the three datasets above are merged against.
genes_path = os.path.join(processed_files_dir, 'df_genes.csv')
df_genes_paris_ricseq = pd.read_csv(genes_path)[['gene_id', 'cdna', 'length']]

In [7]:
def create_df_coord(df_coord):
    """Flatten pairwise interactions into one row per (gene, region, dataset).

    Args:
        df_coord: DataFrame with columns ``gene1``, ``gene2``, ``x1``, ``y1``,
            ``w``, ``h`` and ``dataset``. It is NOT modified.

    Returns:
        DataFrame with columns ``gene_id``, ``start``, ``end``, ``dataset``
        holding the regions of both interaction partners, de-duplicated and
        with a fresh 0..n-1 index.
    """
    # Derive the end coordinates on a copy: assigning x2/y2 on the argument
    # itself would silently add columns to the caller's frame.
    with_ends = df_coord.assign(
        x2 = df_coord['x1'] + df_coord['w'],
        y2 = df_coord['y1'] + df_coord['h'],
    )

    # Same schema for both partners so they can be stacked.
    first_partner = with_ends[['gene1', 'x1', 'x2', 'dataset']].rename(
        {'gene1':'gene_id', 'x1':'start', 'x2':'end'}, axis=1
    )
    second_partner = with_ends[['gene2', 'y1', 'y2', 'dataset']].rename(
        {'gene2':'gene_id', 'y1':'start', 'y2':'end'}, axis=1
    )

    # All regions of all interactions, each listed once.
    stacked = pd.concat([first_partner, second_partner], axis = 0)
    return stacked.drop_duplicates().reset_index(drop = True)

In [8]:
# Stack the three datasets, flatten them to per-gene regions, then attach each
# gene's length (the merge joins on the shared `gene_id` column).
df_interactions = pd.concat([df_paris, df_ricseq, df_mario], axis = 0).reset_index(drop = True)
df_coord = create_df_coord(df_interactions).merge(df_genes_paris_ricseq[['gene_id', 'length']])

Splash è un discorso a parte perché ha altre annotazioni dei geni.

In [9]:
# SPLASH is read from its own interaction file and merged against its own gene table.
splash_path = os.path.join(processed_files_dir, 'splash.csv')
df_splash = pd.read_csv(splash_path).filter(columns_to_keep, axis = 1).assign(dataset = 'splash')

splash_genes_path = os.path.join(processed_files_dir, 'df_genes_splash.csv')
df_genes_splash = pd.read_csv(splash_genes_path)[['gene_id', 'cdna', 'length']]

# Per-gene regions of the SPLASH interactions, with the gene length attached.
df_coord_splash = create_df_coord(df_splash).merge(df_genes_splash[['gene_id', 'length']])

In [10]:
# Sanity check: no region may end beyond the `length` recorded for its gene.
assert df_coord['end'].le(df_coord['length']).all()
assert df_coord_splash['end'].le(df_coord_splash['length']).all()

In [11]:
def create_negative_sample(len_fake_neg_region, length, df_coord, num_tries = 100):
    """Randomly place a window that overlaps none of the regions in ``df_coord``.

    Args:
        len_fake_neg_region: Width of the window to place.
        length: Length of the sequence the window has to fit in.
        df_coord: DataFrame with ``start`` and ``end`` columns; each row is a
            half-open region ``[start, end)`` the window must not overlap.
        num_tries: Maximum number of random placements to attempt.

    Returns:
        ``(start, end)`` of an accepted window, or ``(np.nan, np.nan)`` when
        the sequence is too short for a window or every attempt overlapped.
    """
    # Exclusive upper bound for the window start (same bound as before: the
    # window always ends at least two positions before `length`).
    try:
        upper_bound = int(length) - (len_fake_neg_region + 1)
    except (TypeError, ValueError, OverflowError):
        # Missing / non-numeric length: nothing can be sampled.
        return np.nan, np.nan
    if upper_bound <= 0:
        # randint would raise on every attempt; fail once instead of num_tries times.
        return np.nan, np.nan

    # Region bounds, truncated like int(); rows with NaN bounds never compare
    # as overlapping.
    starts = np.trunc(df_coord['start'].to_numpy(dtype = float))
    ends = np.trunc(df_coord['end'].to_numpy(dtype = float))

    for _ in range(num_tries):
        start_coord = np.random.randint(0, upper_bound)
        end_coord = start_coord + len_fake_neg_region
        # Two half-open intervals intersect iff max(starts) < min(ends).
        overlaps = np.maximum(starts, start_coord) < np.minimum(ends, end_coord)
        if not overlaps.any():
            return start_coord, end_coord
    return np.nan, np.nan

In [12]:
# Width of the synthetic negative regions: the mean width of the observed
# regions, rounded to the nearest integer.
region_widths = df_coord['end'] - df_coord['start']
len_fake_neg_region = int(np.round(region_widths.mean(), 0))
print('length of fake negative region =', len_fake_neg_region)

length of fake negative region = 29


In [13]:
def obtain_cdna_slice(x):
    """Return the part of ``x['cdna']`` between ``x.start`` and ``x.end``.

    ``x`` is a row-like object (e.g. the Series passed by
    ``DataFrame.apply(..., axis = 1)``) exposing ``cdna`` by key and
    ``start`` / ``end`` as attributes. Standard slice semantics apply, so the
    end position is exclusive.
    """
    region = slice(x.start, x.end)
    return x['cdna'][region]

def create_posneg(df_coord, df_genes, neg_region_len = None):
    """Sample one positive and one synthetic negative region for every gene.

    Args:
        df_coord: DataFrame with columns ``gene_id``, ``start``, ``end``,
            ``length`` and ``dataset``, one row per known region.
        df_genes: DataFrame with at least ``gene_id`` and ``cdna``.
        neg_region_len: Width of the synthetic negative regions. Defaults to
            the notebook-level ``len_fake_neg_region``.

    Returns:
        DataFrame with columns ``gene_id``, ``start``, ``end``, ``length``,
        ``dataset``, ``how`` ('positive' / 'negative'), ``cdna`` (the sequence
        slice of the region) and ``id_query`` (``gene_start_end``). Rows with
        missing values are dropped, which removes negatives that could not be
        placed.
    """
    if neg_region_len is None:
        neg_region_len = len_fake_neg_region

    records = []

    # For every gene, sample 1 positive and 1 negative.
    # groupby hands over each gene's rows once (no full-frame filter per gene)
    # and in sorted key order, so a seeded run visits genes in a fixed order.
    for gene_id, df_coord_gene in tqdm(df_coord.groupby('gene_id')):
        positive_sampled = df_coord_gene.sample(1).iloc[0]
        length = positive_sampled.length
        dataset = positive_sampled.dataset
        # The negative must avoid every known region of this gene.
        negative_start, negative_end = create_negative_sample(neg_region_len, length, df_coord_gene)
        records.append({'gene_id':gene_id, 'start':positive_sampled.start, 'end':positive_sampled.end, 'length':length, 'dataset':dataset, 'how': 'positive'})
        records.append({'gene_id':gene_id, 'start':negative_start, 'end':negative_end, 'length':length, 'dataset':dataset, 'how': 'negative'})

    df_posneg = pd.DataFrame(records)
    df_posneg = df_posneg.merge(df_genes[['gene_id', 'cdna']])
    # Failed negative placements carry NaN coordinates and are removed here.
    df_posneg = df_posneg.dropna().reset_index(drop = True)
    # NaN placeholders force float columns; restore integers before slicing.
    df_posneg['start'] = df_posneg.start.astype(int)
    df_posneg['end'] = df_posneg.end.astype(int)
    # Replace the full gene sequence with the slice covered by the region.
    df_posneg['cdna_slice'] = df_posneg.apply(obtain_cdna_slice, axis = 1)
    df_posneg = df_posneg.drop('cdna', axis = 1)
    df_posneg = df_posneg.rename({'cdna_slice':'cdna'}, axis = 1)
    df_posneg['id_query'] = df_posneg['gene_id'] + '_' + df_posneg['start'].astype(str) + '_' + df_posneg['end'].astype(str)
    return df_posneg

In [14]:
# Positive/negative samples for SPLASH: reuse the cached CSV when present,
# otherwise sample them and write the cache.
filepath = os.path.join(processed_files_dir, 'nt_data', 'mean_embeddings', 'df_posneg_splash.csv')
if os.path.isfile(filepath):
    df_posneg_splash = pd.read_csv(filepath)
else:
    # The result has to be bound to a name before it can be saved.
    df_posneg_splash = create_posneg(df_coord_splash, df_genes_splash)
    df_posneg_splash.to_csv(filepath, index = False)

# Same for the datasets that share the paris/ricseq gene table.
filepath = os.path.join(processed_files_dir, 'nt_data', 'mean_embeddings', 'df_posneg_paris_ricseq.csv')
if os.path.isfile(filepath):
    df_posneg_paris_ricseq = pd.read_csv(filepath)
else:
    df_posneg_paris_ricseq = create_posneg(df_coord, df_genes_paris_ricseq)
    df_posneg_paris_ricseq.to_csv(filepath, index = False)

In [15]:
# Positive/negative samples of all datasets in one table, cached on disk.
filepath = os.path.join(processed_files_dir, 'nt_data', 'mean_embeddings', 'df_posneg.csv')
if not os.path.isfile(filepath):
    df_posneg = pd.concat([df_posneg_paris_ricseq, df_posneg_splash], axis = 0).reset_index(drop = True)
    df_posneg.to_csv(filepath, index = False)
else:
    df_posneg = pd.read_csv(filepath)

### Now the repeats

In [18]:
# Repeat regions with their sequence slice; the result lives in `df`, which the
# query table below is built from. Cached on disk like the posneg tables.
filepath = os.path.join(processed_files_dir, 'nt_data', 'mean_embeddings', 'df_repeats.csv')
if os.path.isfile(filepath):
    # Load into `df` (not `df_posneg_splash`, which holds the SPLASH samples).
    # The cache is written with its index, so the first column is read back as the index.
    df = pd.read_csv(filepath, index_col = 0)
else:
    # Two headerless, tab-separated BED files (presumably mouse and human — TODO confirm).
    mm = pd.read_csv(os.path.join(original_files_dir, 'repeats.mm.bed'), sep = '\t', header = None)
    hs = pd.read_csv(os.path.join(original_files_dir, 'repeats.hs.bed'), sep = '\t', header = None)

    df = pd.concat([mm, hs], axis = 0).reset_index(drop = True)

    # Only columns 0, 1, 2 and 6 are used.
    df = df.rename({0:'chrom', 1:'start', 2:'end', 6:'feature'}, axis = 1)

    df = df[['chrom', 'start', 'end', 'feature']]

    # Mapping from the `chrom` identifier to the gene id, collected from both
    # sides of the interaction index.
    info = utils.read_dataframe(os.path.join(original_files_dir, 'index_bio_regions.Tx.RI_ALL.txt'), columns_to_drop = ['Unnamed: 0'])
    info1 = info[['chrom_1', 'ensembl_gene_id_1']].rename({'chrom_1':'chrom', 'ensembl_gene_id_1':'gene_id'}, axis = 1)
    info2 = info[['chrom_2', 'ensembl_gene_id_2']].rename({'chrom_2':'chrom', 'ensembl_gene_id_2':'gene_id'}, axis = 1)
    info = pd.concat([info1, info2], axis = 0).drop_duplicates().reset_index(drop = True)

    # remove the genes not in our datasets (`chrom` values absent from `info`)
    genes_to_remove = set(df.chrom) - set(info.chrom)

    df = df[~df.chrom.isin(genes_to_remove)].reset_index(drop = True)

    # Attach the gene id (join on `chrom`), then sequence and length (join on `gene_id`).
    df = df.merge(info)

    df = df.merge(df_genes_paris_ricseq).reset_index(drop = True)

    # Every repeat must lie inside its gene.
    assert (df.length >= df.end).all()

    df['start'] = df.start.astype(int)
    df['end'] = df.end.astype(int)
    # Replace the full gene sequence with the slice covered by the repeat.
    df['cdna_slice'] = df.apply(obtain_cdna_slice, axis = 1)
    df = df.drop('cdna', axis = 1)
    df = df.rename({'cdna_slice':'cdna'}, axis = 1)
    df['id_query'] = df['gene_id'] + '_' + df['start'].astype(str) + '_' + df['end'].astype(str)
    # NOTE: unlike the posneg caches, this file is written with its index column.
    df.to_csv(filepath)

# Create df_query

In [35]:
# Directory holding the mean-embedding files; the query table is written into it below.
emb_dir = os.path.join(processed_files_dir, 'nt_data', 'mean_embeddings')

In [43]:
# Query table: id and sequence of every repeat (`df`) followed by every
# positive/negative sample (`df_posneg`). The original row labels are kept.
query_columns = ['id_query', 'cdna']
df_query = pd.concat([df[query_columns], df_posneg[query_columns]], axis = 0)

In [44]:
# all_files = os.listdir(os.path.join(emb_dir, '32'))
# all_files = list(pd.Series(all_files).str.extractall('(.*)\.npy').reset_index()[0])
# df_query = df_query[~df_query.id_query.isin(all_files)]

In [45]:
# Write the query table next to the embeddings (index column included).
query_csv_path = os.path.join(emb_dir, 'embedding_query.csv')
df_query.to_csv(query_csv_path)

In [42]:
def estimate_time_and_space(n_samples):
    """Print rough runtime and storage estimates for ``n_samples`` samples.

    Everything scales linearly with ``n_samples`` from hard-coded reference
    figures (presumably measured on earlier runs — TODO confirm their origin).
    Nothing is returned; four lines are printed.
    """
    #TIME
    # Reference: 3219 minutes per 228278 samples.
    hours = (3219*n_samples/(228278))/60
    days = hours/24
    print('estimated # hours:', np.round(hours, 2))
    print('estimated # days:', np.round(days, 2))

    #SPACE
    # Megabyte estimates: 10.2 per sample, and 1995 per 300 samples.
    space_estimates_mb = (
        ('estimated terabytes (pessimistic):', 10.2*n_samples),
        ('estimated terabytes (realistic):', 1995*n_samples/(300)),
    )
    for message, megabytes in space_estimates_mb:
        gigabytes = megabytes/1000
        terabytes = gigabytes/1000
        print(message, np.round(terabytes, 2))
# Print the estimates for the current number of rows in the query table.
estimate_time_and_space(df_query.shape[0])

estimated # hours: 49.17
estimated # days: 2.05
estimated terabytes (pessimistic): 2.13
estimated terabytes (realistic): 1.39
