# Creating Training Data: Take 2

# libraries

In [2]:
import numpy as np
import os
import tensorflow as tf

# defining regions

The narrowPeak file contains all open regions (DNase peaks) for the LCL tissue.

In [3]:
def import_regions(file_dir):
    """Read a BED-like file into a per-chromosome list of intervals.

    Parameters
    ----------
    file_dir : str
        Path to a whitespace-delimited text file whose first three columns
        are chromosome name, start and end. Further columns are ignored.

    Returns
    -------
    dict
        Maps each chromosome name to a list of ``[start, end]`` integer
        pairs, in the order the lines appear in the file.

    Raises
    ------
    ValueError
        If the start or end column of a data line is not an integer.
    IndexError
        If a data line has fewer than three columns.
    """
    regions = {}
    with open(file_dir) as f:
        for line in f:
            row = line.split()

            # Blank lines (e.g. a trailing newline) and BED header/comment
            # lines carry no interval, so they are skipped instead of
            # crashing on row[0] / int().
            if not row or row[0] in ("track", "browser") or row[0].startswith("#"):
                continue

            regions.setdefault(row[0], []).append([int(row[1]), int(row[2])])

    return regions

# bringing in LCL and ALL files

In [4]:
# LCL open regions from the narrowPeak file, grouped by chromosome
openLCL = import_regions("data/CENTIPEDEdata/wgEncodeAwgDnaseUwdukeGm12878UniPk.narrowPeak")
openLCL.keys()

dict_keys(['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr20', 'chr21', 'chr22', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrX'])

In [5]:
# "ALL" reference set of open regions (clustered DNase file), grouped by chromosome
openALL = import_regions("data/CENTIPEDEdata/wgEncodeRegDnaseClusteredV3.bed")
openALL.keys()

dict_keys(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22', 'chrX', 'chrY'])

### how to manipulate dict

In [6]:
# Remove the sex chromosomes by name, so the result does not depend on
# chrX/chrY happening to be the last two keys of the dict.
chromosomes = [chrm for chrm in openALL if chrm not in ("chrX", "chrY")]
# dict.fromkeys gives every chromosome a None placeholder until it is filled in
output = dict.fromkeys(chromosomes)
output['chr1'] = [[10,20,0], [30,40,1]]
output['chr2'] = [[50,60,1], [70,80,0]]
output

{'chr1': [[10, 20, 0], [30, 40, 1]],
 'chr2': [[50, 60, 1], [70, 80, 0]],
 'chr3': None,
 'chr4': None,
 'chr5': None,
 'chr6': None,
 'chr7': None,
 'chr8': None,
 'chr9': None,
 'chr10': None,
 'chr11': None,
 'chr12': None,
 'chr13': None,
 'chr14': None,
 'chr15': None,
 'chr16': None,
 'chr17': None,
 'chr18': None,
 'chr19': None,
 'chr20': None,
 'chr21': None,
 'chr22': None}

## label regions: walk the LCL and ALL region lists in step
### I think this is pretty solid now

In [19]:
def create_output(chromosomes, tissue, openALL):
    """Merge tissue regions and reference regions into one labelled list.

    For every chromosome the two region lists are walked in step. Each tissue
    region is emitted with label 1. Each ``openALL`` region that does not
    overlap any tissue region is emitted with label 0. ``openALL`` regions
    that overlap a tissue region are dropped.

    Two regions count as overlapping when they share at least one coordinate
    (the comparison is inclusive on both ends).

    NOTE(review): the single forward pass assumes both lists are sorted by
    start position within each chromosome -- confirm for the input files.

    Parameters
    ----------
    chromosomes : iterable of str
        Chromosome names to process; each must be a key of both ``tissue``
        and ``openALL``.
    tissue : dict
        Chromosome -> list of ``[start, end]`` regions open in the tissue.
    openALL : dict
        Chromosome -> list of ``[start, end]`` reference regions.

    Returns
    -------
    dict
        Chromosome -> list of ``[start, end, label]`` with label 1 (tissue
        region) or 0 (reference region with no tissue overlap).

    Raises
    ------
    KeyError
        If a chromosome is missing from ``tissue`` or ``openALL``.
    """
    output = dict.fromkeys(chromosomes)

    for chrm in chromosomes:
        all_regions = openALL[chrm]
        n_all = len(all_regions)

        # Index of the next unconsumed reference region. Explicit bounds
        # checks replace the former bare ``except:`` clauses, which also hid
        # unrelated errors and could append an open region twice.
        i = 0
        regionlist = []
        for region in tissue[chrm]:
            start, end = region[0], region[1]

            # | ALL1 | ALL2 | LCL1 |   reference regions that end before this
            # tissue region starts are closed regions
            while i < n_all and all_regions[i][1] < start:
                regionlist.append([all_regions[i][0], all_regions[i][1], 0])
                i += 1

            # the tissue region itself is always emitted, exactly once
            regionlist.append([start, end, 1])

            # | ALL1 LCL1 ALL2 ALL3 | ALL4 |   skip every reference region
            # that overlaps the tissue region just emitted
            while i < n_all and all_regions[i][0] <= end:
                i += 1

        # tail end: reference regions after the last tissue region
        # | LCL1 | ALL1 | ALL2 | ALL3 | end
        for region in all_regions[i:]:
            regionlist.append([region[0], region[1], 0])

        output[chrm] = regionlist

    return output

In [20]:
# Autosomes only: drop the sex chromosomes by name instead of slicing off the
# last two dict keys, which silently depends on key order.
chromosomes = [chrm for chrm in openALL if chrm not in ("chrX", "chrY")]
output = create_output(chromosomes, openLCL, openALL)

In [23]:
# Very coarse sanity check on the labelling (mostly here for curiosity):
# print, per chromosome, the fraction of regions labelled open.
for chrm in chromosomes:
    labelled = output[chrm]
    n_open = sum(entry[2] for entry in labelled)
    #print(chrm, n_open == len(openLCL[chrm]))
    print(chrm, n_open/len(labelled))

chr1 0.10079543970612449
chr2 0.08806865522412931
chr3 0.08906982077913568
chr4 0.07654456587401834
chr5 0.08895746171175435
chr6 0.1042573778422835
chr7 0.08746071312452584
chr8 0.08146672237040846
chr9 0.09575519497579223
chr10 0.0883768733023607
chr11 0.09989918819971348
chr12 0.10230939226519337
chr13 0.07104799216454456
chr14 0.10053374883967643
chr15 0.10035502578354673
chr16 0.11478314459594792
chr17 0.13051407877817725
chr18 0.07417302280050578
chr19 0.16821369445116166
chr20 0.09642731432594157
chr21 0.08853446171372772
chr22 0.11846296106031029


In [24]:
# No [start, end, label] row should appear twice within a chromosome
for chrm in chromosomes:
    rows = output[chrm]
    print(len(rows) == len(set(map(tuple, rows))))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


## make Ytrain

In [35]:
# Flatten the per-chromosome labels (third field of every row) into a single
# vector, visiting chromosomes in the order given by `chromosomes`.
Ytrain = np.array([row[2] for chrm in chromosomes for row in output[chrm]])

# The label vector should have exactly one entry per region
full_len = sum(len(output[chrm]) for chrm in chromosomes)
full_len == len(Ytrain)

True

# bring in motif files

In [61]:
def import_motifs(path, regions, chromosomes=None):
    """Build one 0/1 feature column per motif file.

    Every regular file in ``path`` is read as a whitespace-delimited table of
    ``chromosome start end`` motif sites. For each region (in chromosome
    order, then list order) the column holds 1 if some motif site overlaps
    the region and 0 otherwise. Overlap is inclusive on both ends.

    NOTE(review): the forward-only scan assumes regions and motif sites are
    sorted by start position within each chromosome -- confirm for the input
    files.

    Parameters
    ----------
    path : str
        Directory containing the motif files. Sub-directories are skipped.
    regions : dict
        Chromosome -> list of regions whose first two items are start and end.
    chromosomes : iterable of str, optional
        Chromosomes to include and their order. Defaults to the keys of
        ``regions`` so the function no longer reads a notebook global.

    Returns
    -------
    list of list of int
        One inner list per motif file, in sorted file-name order; each has
        one 0/1 entry per region.

    Raises
    ------
    KeyError
        If a requested chromosome is missing from ``regions``.
    """
    if chromosomes is None:
        chromosomes = list(regions.keys())

    training_input = []
    # os.listdir returns names in arbitrary order; sort so the column order
    # is reproducible between runs and machines.
    for motiffile in sorted(os.listdir(path)):
        motif_path = os.path.join(path, motiffile)
        if not os.path.isfile(motif_path):
            continue

        motif_foot = {}
        with open(motif_path) as f:
            for line in f:
                row = line.split()
                if not row:  # blank line
                    continue
                motif_foot.setdefault(row[0], []).append([int(row[1]), int(row[2])])

        motif_col = []
        for chrm in chromosomes:
            # a motif with no site on this chromosome yields all zeros
            motif_sites = motif_foot.get(chrm, [])
            n_sites = len(motif_sites)
            i = 0
            for region in regions[chrm]:
                # Skip EVERY motif site that ends before this region starts.
                # Advancing by only one site labelled the region 0 without
                # comparing it to the following site.
                while i < n_sites and motif_sites[i][1] < region[0]:
                    i += 1

                # The current site (if any) ends at or after the region
                # start; it overlaps when it also starts at or before the
                # region end. The index is not advanced on a hit because the
                # next region might overlap the same site.
                if i < n_sites and motif_sites[i][0] <= region[1]:
                    motif_col.append(1)
                else:
                    motif_col.append(0)

        training_input.append(motif_col)

    return training_input
                

In [62]:
# import_motifs returns one list per motif file; transpose so that each row
# is a region and each column a motif.
path = "data/CENTIPEDEdata/motif.combo"
Xtrain = np.asarray(import_motifs(path, output)).T

In [65]:
# (number of regions, number of motif files)
Xtrain.shape

(1851152, 5)

# A little bit of using tensorflow and keras

In [74]:
# Small fully connected classifier: two hidden ReLU layers followed by a
# 2-way softmax over the integer labels in Ytrain.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(32, activation=tf.nn.relu),
    tf.keras.layers.Dense(2, activation=tf.nn.softmax)
])

# The keyword is `metrics`; it was misspelled `metris`, so accuracy was never
# requested from compile().
model.compile(optimizer='adam',
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=['accuracy'])
# NOTE(review): the saved output of this cell reports "Epoch n/3", which does
# not match epochs=10 -- the output predates this code; re-run to refresh it.
model.fit(Xtrain, Ytrain, batch_size = 32, epochs=10, validation_split = 0.1)

model.summary()

Train on 1666036 samples, validate on 185116 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              multiple                  192       
_________________________________________________________________
dense_4 (Dense)              multiple                  1056      
_________________________________________________________________
dense_5 (Dense)              multiple                  66        
Total params: 1,314
Trainable params: 1,314
Non-trainable params: 0
_________________________________________________________________
