In [36]:
import csv
import numpy as np
np.random.seed(1337)  # for reproducibility
import pandas as pd
import h5py
import math

In [37]:
# file which contains the TSS position on the genome with chromosome number
# start, stop and strand as column
refGene = pd.read_csv('refGene-new_mm10.csv', sep = ',')

In [38]:
def get_item_of_index(myArray, indexArray) :
    # get the item of desired index into an Array
    
    res = []
    
    for x in range (0, len(indexArray)):
        res.append([])
        tmp = myArray[indexArray[x]]
        res[x].append(tmp)
        
    return res


def reorganize_random_multi_array(myArray) :
    # shuffle the elements of an array
    
    rand = np.random.choice(myArray.shape[0], len(myArray), replace=False) 
    
    
    res = myArray[rand]
    
    return res


def sliding_window(positions, elm_range) :
    # take an array as input and outputs
    # a serie of array of elm_range long around each position
    # of the input array
    
    myModel_range = []
    
    for x in range (0, len(positions)):
        myModel_range.append([])
        elm = positions[x]
        
        for y in range(-elm_range, elm_range+1):
            myModel_range[x].append(elm + y)

    
    return myModel_range



## Generating TSS class
 Generate 299 sequences of 299 bp long with a step of 1 bp around each TSS : data augmentation of positive label

In [41]:
index = range(1,23)

for i in index:
   
    f = h5py.File('chr' + str(i) + '.hdf5','r')
    sequence_chr = np.array(f[f.keys()[0]])
    f.close()

    refGene_parsed = refGene[(refGene.chr ==  'chr'+ str(i))]
    refGene_parsed = refGene_parsed.drop_duplicates(subset=['start', 'stop'], keep='last')

    refGene_parsed_start = refGene_parsed[(refGene_parsed.strand == '+')]
    refGene_parsed_stop = refGene_parsed[(refGene_parsed.strand == '-')]

    start = refGene_parsed_start['start'].values
    stop = refGene_parsed_stop['stop'].values
    all_strands = np.append(start,stop)

    sequence_chr = sequence_chr.reshape(sequence_chr.shape[0],)
    adn_sequence = sequence_chr.astype('int')


    Seq = adn_sequence
    myRng = 149
    
    # Create a rolling window of sequences of 299 bp long around each TSS 
    # if strand is positive.
    
    tmp1 = []
    tmp2 = []

    for y in range (0, len(start)):
        for x in range (0, myRng):
            tmp1 = np.append(tmp1, start[y]+ x + 1)
            tmp2 = np.append(tmp2, start[y]- x - 1)
        
        tmp = np.append(tmp2, start[y])
        tmp = np.append(tmp, tmp1)
        tmp = tmp.astype(int)


    myPos_start = tmp
    res_slide_start = sliding_window(myPos_start, myRng)
    res_start = np.array(get_item_of_index(Seq, res_slide_start))
    
    # Create sequences and take the complementary sequence of nucleotid
    # if strand is negative.

    tmp1 = []
    tmp2 = []

    for y in range (0, len(stop)):
        for x in range (0, myRng):
            tmp1 = np.append(tmp1, stop[y]+ x + 1)
            tmp2 = np.append(tmp2, stop[y]- x - 1)
        
        tmp = np.append(tmp2, stop[y])
        tmp = np.append(tmp, tmp1)
        tmp = tmp.astype(int)
    

    myPos_stop = tmp
    res_slide_stop = sliding_window(myPos_stop, myRng)
    res_stop = np.array(get_item_of_index(Seq, res_slide_stop))
    
    np.place(res_stop, res_stop==1, [5])
    np.place(res_stop, res_stop==2, [6])
    np.place(res_stop, res_stop==3, [7])
    np.place(res_stop, res_stop==4, [8])

    np.place(res_stop, res_stop==5, [2])
    np.place(res_stop, res_stop==6, [1])
    np.place(res_stop, res_stop==7, [4])
    np.place(res_stop, res_stop==8, [3])
    
    res_stop = np.flip(res_stop, axis=0)
    
    res_fin = np.append(res_stop, res_start, axis = 0)
    
    if (i == 1):
        res = res_fin
    else:
        res = np.append(res, np.array(res_fin), axis=0)

final_array = res.reshape(res.shape[0],res.shape[2])

In [42]:
np.save('X0.npy', final_array)

(1125550, 299)

## Generating non-TSS class
Generate a shuffled array of sequences of 299 bp long in the whole non TSS region

In [27]:
index= range(1,3)

for i in index:
    f = h5py.File('chr' + str(i) + '.hdf5','r')
    seq = np.array(f[f.keys()[0]])
    f.close()
   
    refGene_parsed_ = refGene[(refGene.chr ==  'chr'+ str(i))]
    refGene_parsed_ = refGene_parsed_.drop_duplicates(subset=['start', 'stop'], keep='last')

    refGene_parsed_start_ = refGene_parsed_[(refGene_parsed_.strand == '+')]
    refGene_parsed_stop_ = refGene_parsed_[(refGene_parsed_.strand == '-')]

    start = refGene_parsed_start_['start'].values
    stop = refGene_parsed_stop_['stop'].values
    positions = np.append(start, stop)
    
    del_range = 299*2

    del_arr_inc = np.array([])
    del_arr_dec = np.array([])
    
    for num in range(1,del_range+1):
        del_arr_inc = np.concatenate((del_arr_inc,[x+num for x in positions]),axis=0)
        del_arr_dec = np.concatenate((del_arr_dec,[x-num for x in positions]),axis=0)
    
    del_arr = np.concatenate((del_arr_dec, positions),axis=0)
    del_arr = np.concatenate((del_arr,del_arr_inc),axis=0)
    del_arr = del_arr[del_arr >= 0] # Remove Negatives
    
    final_del_array = np.delete(seq, del_arr)





In [34]:
end = int(math.fmod(final_del_array.shape[0],299))
final_del_array = final_del_array[:-end]

X_slide = final_del_array.reshape(int(final_del_array.shape[0]//299.),299)

In [30]:
X_slide.shape

(600985, 299)

In [32]:
X_0 = reorganize_random_multi_array(X_slide)

In [None]:
np.save('X1.npy', X_slide)