In [None]:
import csv
import numpy as np
np.random.seed(1337)  # for reproducibility
import h5py
import pandas as pd


# Generating TSS regions

Generate a 299 bp sequence centred on each TSS (149 bp on each side of it): these are the positively labeled sequences.

In [None]:
# Gene annotation table with one row per transcript. The cells below read its
# `chr` (chromosome name such as 'chr1'), `start`, `stop` and `strand`
# ('+' / '-') columns to locate the TSS positions on the genome.
refGene = pd.read_csv('refGene-new_mm10.csv', sep = ',')

In [None]:
# Build the positive set: one 299 bp window centred on every distinct TSS of
# chr1..chr22, always read in the direction of transcription.
# NOTE(review): the annotation file name says mm10 while the loop runs over
# chr1-chr22 -- confirm that every chrN.hdf5 file is expected to exist.
index = range(1, 23)

n = 149  # bases kept on each side of a TSS -> 2*149 + 1 = 299 bp per window


def _tss_windows(sequence, centres, flank):
    """Return a (k, 2*flank + 1) array of windows of `sequence` around `centres`.

    Centres whose window would run past either end of `sequence` are skipped,
    so every returned row has the full width.
    """
    centres = np.asarray(centres, dtype=np.int64)
    inside = (centres - flank >= 0) & (centres + flank < len(sequence))
    offsets = np.arange(-flank, flank + 1)
    # Broadcasting builds the whole (k, width) index grid in one step.
    return sequence[centres[inside][:, None] + offsets]


chrom_windows = []
for i in index:

    # The context manager closes the file even when reading fails.
    with h5py.File('chr' + str(i) + '.hdf5', 'r') as f:
        # keys() is a non-indexable view on Python 3, so materialise it first
        seq = np.array(f[list(f.keys())[0]])

    # Flatten to a 1-D vector of integer base codes.
    adn_sq = seq.reshape(seq.shape[0],).astype('int')

    refGene_parsed = refGene[(refGene.chr == 'chr' + str(i))]
    refGene_parsed = refGene_parsed.drop_duplicates(subset=['start', 'stop'], keep='last')

    # The TSS is taken from `start` on the '+' strand and from `stop` on the '-' strand.
    start = refGene_parsed[(refGene_parsed.strand == '+')]['start'].values
    stop = refGene_parsed[(refGene_parsed.strand == '-')]['stop'].values

    X_slide_start = _tss_windows(adn_sq, start, n)
    X_slide_stop = _tss_windows(adn_sq, stop, n)

    # Reverse-complement the '-' strand windows. Codes are swapped pairwise
    # (1<->2, 3<->4; presumably the two complementary base pairs -- confirm
    # against the hdf5 encoding); any other code is left unchanged. The masks
    # come from the untouched array, so no temporary sentinel values are needed.
    complemented = X_slide_stop.copy()
    for base, partner in ((1, 2), (2, 1), (3, 4), (4, 3)):
        complemented[X_slide_stop == base] = partner
    reverse = np.flip(complemented, axis=1)

    X1 = np.concatenate((X_slide_start, reverse), axis=0)
    # float64 keeps the dtype of the saved array the same as before.
    chrom_windows.append(X1.astype(np.float64))

# A single concatenation avoids re-copying the growing result on every chromosome.
res = np.concatenate(chrom_windows, axis=0)

In [None]:
# Persist the TSS-centred (positively labeled) windows currently held in `res`.
np.save('X0.npy',res)

# Generating non-TSS regions 
Generate n times as many negatively labeled sequences, taken from non-TSS regions (at least 299 bp away from any TSS).

In [None]:
# Convert a flat array into a (number_of_lines, number_of_column) array; when the
# input is longer than needed, the surplus elements at the START are dropped.

def convert_array_to_multi(myArray, number_of_lines, number_of_column) :
    """Reshape `myArray` to `(number_of_lines, number_of_column)`.

    Parameters
    ----------
    myArray : array-like
        Input data; its length along the first axis is compared with
        `number_of_lines * number_of_column`.
    number_of_lines, number_of_column : int
        Shape of the result.

    Returns
    -------
    numpy.ndarray
        The last `number_of_lines * number_of_column` elements of `myArray`,
        reshaped. The result may share memory with `myArray`.

    Raises
    ------
    ValueError
        Raised by `np.reshape` when the remaining data cannot fill the shape.
    """
    wanted = number_of_lines * number_of_column
    # Never negative: an input that is too short is passed on unchanged and
    # left for np.reshape to reject.
    surplus = max(len(myArray) - wanted, 0)
    # Slicing drops the leading surplus without building an index of every
    # removed element, which matters for chromosome-sized inputs.
    return np.reshape(np.asarray(myArray)[surplus:], (number_of_lines, number_of_column))



# Shuffle the rows of a multi-dimensional array into a random order

def reorganize_random_multi_array(myArray) :
    """Return a copy of `myArray` whose rows (first axis) are randomly reordered.

    The order is drawn from numpy's global random state, so seeding
    `np.random` beforehand makes the result reproducible.
    """
    row_count = len(myArray)
    # Drawing every index exactly once (no replacement) yields a permutation.
    shuffled_idx = np.random.choice(row_count, size=row_count, replace=False)
    return myArray[shuffled_idx]

In [None]:

# n: number of negative (non-TSS) windows requested per TSS
n = 100
index = range(1, 23)
# NOTE(review): the annotation file name says mm10 while the loop runs over
# chr1-chr22 -- confirm that every chrN.hdf5 file is expected to exist.

# Every position within del_range bp of a TSS (the TSS itself included) is
# removed from the pool the negative windows are cut from.
del_range = 149*2
del_offsets = np.arange(-del_range, del_range + 1)

chrom_negatives = []
for i in index:

    # The context manager closes the file even when reading fails.
    with h5py.File('chr' + str(i) + '.hdf5', 'r') as f:
        # keys() is a non-indexable view on Python 3, so materialise it first
        seq = np.array(f[list(f.keys())[0]])

    # `refGene` is the annotation table loaded at the top of the notebook.
    refGene_parsed = refGene[(refGene.chr == 'chr'+str(i))]
    refGene_parsed_ = refGene_parsed.drop_duplicates(subset=['start', 'stop'], keep='last')

    # The TSS is taken from `start` on the '+' strand and from `stop` on the '-' strand.
    start = refGene_parsed_[(refGene_parsed_.strand == '+')]['start'].values
    stop = refGene_parsed_[(refGene_parsed_.strand == '-')]['stop'].values
    positions = np.append(start, stop).astype(np.int64)

    # All excluded indices in one broadcast: integer typed (np.delete rejects
    # float indices), de-duplicated, and clipped to the valid range on both
    # sides (np.delete raises on out-of-bounds indices).
    del_arr = np.unique(positions[:, None] + del_offsets)
    del_arr = del_arr[(del_arr >= 0) & (del_arr < seq.size)]

    # Without an axis np.delete works on the flattened sequence, so C is 1-D.
    # NOTE(review): C is then chopped into consecutive 299 bp rows, so a row
    # can span a removed stretch -- confirm this is intended.
    C = np.delete(seq, del_arr)

    # the maximum negatively labeled sequence we can take is len(C) / 299
    m = len(positions)*n
    if (m*299 > len(C)):
        m = int(len(C) // 299)

    conv_array = convert_array_to_multi(C, m, 299)
    chrom_negatives.append(reorganize_random_multi_array(conv_array))

# A single concatenation avoids re-copying the growing result on every chromosome.
res = np.concatenate(chrom_negatives, axis=0)


In [None]:
# Persist the non-TSS (negatively labeled) windows currently held in `res`.
np.save('X1.npy',res)