In [3]:
#-----import packages-----#

#common python packages
import numpy as np
import pickle
import argparse
import os

#biological packages
import pybedtools
from pybedtools import featurefuncs
import pyBigWig

#machine learning packages
import sklearn
from sklearn import metrics
from matplotlib import pyplot as plt
import pandas as pd

In [4]:
# -----parsing command line arguments-----#
# Declares the two options this notebook needs, parses a simulated command
# line, and verifies that every expected positive/negative TSV is present.
parser = argparse.ArgumentParser(description='Training CNN model to predict STARR-seq enhancers based on chromatin accessbility and histone marks')
parser.add_argument('-c', '--cell_types', type=str, help='comma separated string of cell_types')
parser.add_argument('-i', '--in_dir', type=str, help='directory containing 01_data_encoding intermediate tsv files')

#simulate command line input (a notebook has no real argv to parse)
cmdline_str='-c ' + " HepG2,K562 " + \
    ' -i ' + "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/dev/encoded_2overlap/ATAC/"

seq_names = ["ATAC", "H3K27ac", "H3K4me3", "H3K9ac", "H3K4me1"]

#check if the files are there
# str.split() on whitespace also discards the padding around "HepG2,K562".
args = parser.parse_args(cmdline_str.split())
args.cell_types = args.cell_types.split(",")
for cell in args.cell_types:
    for seq in seq_names:
        for label in ("pos", "neg"):
            # in_dir is concatenated as-is, so it must end with a path separator.
            tsv_file = args.in_dir + cell + "." + seq + "." + label + ".tsv"
            if not os.path.exists(tsv_file):
                # Raise instead of calling exit(1): exit() is the interactive
                # shell helper and is not a reliable way to abort a notebook
                # cell, so the loop could carry on to "all files found!".
                raise FileNotFoundError(tsv_file + " file does not exist")
print("all files found!")

all files found!


<h6> hg38 Data Preprocessing </h6>

In [5]:
def get_data(cell_types, in_dir, seq_names):
    """Load per-cell-type signal matrices and assemble one labelled dataset.

    For every cell type and every assay in ``seq_names`` this reads the
    tab-delimited numeric files ``<in_dir><cell>.<seq>.pos.tsv`` and
    ``<in_dir><cell>.<seq>.neg.tsv`` (one region per row). ``in_dir`` is
    concatenated as-is, so it must end with a path separator.

    Parameters
    ----------
    cell_types : sequence of str
        Cell-type prefixes of the input files.
    in_dir : str
        Directory prefix holding the TSV files.
    seq_names : sequence of str
        Assay names; their order defines the order along axis 1 of ``X``.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_regions, len(seq_names), n_columns)``. Positive regions of
        all cell types come first (in ``cell_types`` order), followed by the
        negative regions of all cell types.
    y : numpy.ndarray
        Shape ``(n_regions, 1)``; 1 for positive rows of ``X``, 0 for negative.

    Raises
    ------
    ValueError
        If ``cell_types`` or ``seq_names`` is empty, or if the assays of one
        cell type do not all provide the same number of rows/columns.
    """
    if len(cell_types) == 0 or len(seq_names) == 0:
        raise ValueError("cell_types and seq_names must both be non-empty")

    pos_blocks = []
    neg_blocks = []
    for cell in cell_types:
        print(cell)

        pos_tracks = []
        neg_tracks = []
        for seq in seq_names:
            print("-"+seq)

            prefix = in_dir+cell+"."+seq
            # ndmin=2 keeps a single-row file as a (1, n_columns) matrix;
            # without it loadtxt returns a 1-D vector and every column would
            # be mistaken for a separate region.
            pos_tracks.append(np.loadtxt(prefix+".pos.tsv", delimiter='\t', ndmin=2))
            neg_tracks.append(np.loadtxt(prefix+".neg.tsv", delimiter='\t', ndmin=2))

        # Stack the assays along a new axis 1 in one step:
        # (n_regions, n_columns) x n_assays -> (n_regions, n_assays, n_columns).
        # np.stack also rejects assays whose shapes disagree.
        pos_blocks.append(np.stack(pos_tracks, axis=1))
        neg_blocks.append(np.stack(neg_tracks, axis=1))

    X_pos = np.concatenate(pos_blocks, axis=0)
    X_neg = np.concatenate(neg_blocks, axis=0)

    X = np.concatenate((X_pos, X_neg), axis=0)
    y = np.concatenate((np.ones(X_pos.shape[0], dtype=int),
                        np.zeros(X_neg.shape[0], dtype=int))).reshape(-1,1)
    print(X.shape)
    print(y.shape)

    return X, y

# Encode the 2-, 1- and 0-overlap datasets in turn and pickle each (X, y)
# pair into the directory its inputs were read from.
overlap_dirs = [args.in_dir.replace("2overlap", level + "overlap") for level in ("2", "1", "0")]
for in_dir in overlap_dirs:
    X, y = get_data(args.cell_types, in_dir, seq_names)
    pickle_path = in_dir + "hg38_signals.pickle"
    with open(pickle_path, 'wb') as f:
        pickle.dump((X,y), f, protocol=4)

# Ten times the length of X's last axis, taken from the last dataset loaded above.
window_size = int(X.shape[2] * 10)

HepG2
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
K562
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
(158147, 5, 400)
(158147, 1)
HepG2
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
K562
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
(414282, 5, 400)
(414282, 1)
HepG2
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
K562
-ATAC
-H3K27ac
-H3K4me3
-H3K9ac
-H3K4me1
(243388, 5, 400)
(243388, 1)


<h6> mm10 Data Preprocessing </h6>

In [33]:
# --- ENCODE mm10 validation #1 ---#
def rename_tissue(feature):
    """Copy field 9 of ``feature`` into its ``name`` and return the feature.

    The feature is modified in place; the same object is returned so the
    function can be passed to ``BedTool.each``.
    """
    tissue_label = feature[9]
    feature.name = tissue_label
    return feature

val_file1 = "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/datasets/validation/ENCODE/ENCFF095OCG.bed"
# Reduce every record with featurefuncs.midpoint, then copy the tissue
# annotation (field 9) into the name column.
val_region1 = pybedtools.BedTool(val_file1).each(pybedtools.featurefuncs.midpoint).each(rename_tissue)
# Extend by half a window on both sides, using the mm10 chromosome sizes.
val_region1 = val_region1.slop(b=window_size/2, genome="mm10")
# Keep only regions that pass greater_than(window_size-1); presumably this
# drops windows that could not be extended to full width -- TODO confirm.
val_region1 = val_region1.filter(pybedtools.featurefuncs.greater_than, window_size-1)
val_region1 = val_region1.sort()
# head() prints the first rows itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
val_region1.head()
print(val_region1.count())

chr1	13003151	13007152	negative	0	.	13003747	13006556	0,0,255	negative
 chr1	31101021	31105022	negative	0	.	31101599	31104444	0,0,255	negative
 chr1	38197302	38201303	negative	0	.	38196744	38201861	0,0,255	negative
 chr1	39946111	39950112	forebrain [7/9], cranial nerve [7/9], dorsal root ganglion [7/9]	1	.	39945533	39950689	255,0,0	forebrain [7/9], cranial nerve [7/9], dorsal root ganglion [7/9]
 chr1	68778680	68782681	negative	0	.	68779329	68782031	0,0,255	negative
 chr1	75287729	75291730	forebrain [5/12]	1	.	75288287	75291172	255,0,0	forebrain [5/12]
 chr1	75405463	75409464	negative	0	.	75405116	75409810	0,0,255	negative
 chr1	97538619	97542620	forebrain [3/4], midbrain [3/4], hindbrain [3/4], neural tube [3/4]	1	.	97538497	97542741	255,0,0	forebrain [3/4], midbrain [3/4], hindbrain [3/4], neural tube [3/4]
 chr1	127754934	127758935	negative	0	.	127754802	127759066	0,0,255	negative
 chr1	158264756	158268757	midbrain [3/4], hindbrain [3/4], neck [3/4]	1	.	158265467	158268046	255,0,0	m

In [34]:
# --- ENCODE hg19 homologs -> mm10 validation #2 --- #
val_file2 = "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/datasets/validation/ENCODE/ENCFF915YMM.mm10.lifted.bed"
# Reduce every record with featurefuncs.midpoint, then copy the tissue
# annotation (field 9) into the name column.
val_region2 = pybedtools.BedTool(val_file2).each(pybedtools.featurefuncs.midpoint).each(rename_tissue)
# Extend by half a window on both sides, using the mm10 chromosome sizes.
val_region2 = val_region2.slop(b=window_size/2, genome="mm10")
# Keep only regions that pass greater_than(window_size-1); presumably this
# drops windows that could not be extended to full width -- TODO confirm.
val_region2 = val_region2.filter(pybedtools.featurefuncs.greater_than, window_size-1)
val_region2 = val_region2.sort()
# head() prints the first rows itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
val_region2.head()
print(val_region2.count())

chr1	133742910	133746911	negative	0	.	133744226	133745595	255,0,0	negative
 chr1	182109007	182113008	branchial_arch[4/5]	1	.	182109962	182112053	0,0,255	branchial_arch[4/5]
 chr1	190059833	190063834	heart[8/11]	1	.	190061318	190062349	0,0,255	heart[8/11]
 chr10	42249592	42253593	negative	0	.	42250903	42252281	255,0,0	negative
 chr10	58381197	58385198	negative	0	.	58382774	58383620	255,0,0	negative
 chr11	88466068	88470069	negative	0	.	88467416	88468720	255,0,0	negative
 chr11	113731693	113735694	negative	0	.	113733041	113734346	255,0,0	negative
 chr12	8384538	8388539	heart[10/10]<br>ear[7/10]<br>other[5/10]	1	.	8385674	8387403	0,0,255	heart[10/10]<br>ear[7/10]<br>other[5/10]
 chr13	3724709	3728710	negative	0	.	3726203	3727215	255,0,0	negative
 chr13	46883320	46887321	negative	0	.	46884783	46885858	255,0,0	negative
 None
31


In [35]:
# --- VISTA mm9 -> mm10 validation ---#
val_file3 = "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/datasets/validation/VISTA/VISTA_mm9.mm10.lifted.bed"
# Reduce every record with featurefuncs.midpoint. rename_tissue is not
# applied to this file, so the name column is used as provided.
val_region3 = pybedtools.BedTool(val_file3).each(pybedtools.featurefuncs.midpoint)
# Extend by half a window on both sides, using the mm10 chromosome sizes.
val_region3 = val_region3.slop(b=window_size/2, genome="mm10")
# Keep only regions that pass greater_than(window_size-1); presumably this
# drops windows that could not be extended to full width -- TODO confirm.
val_region3 = val_region3.filter(pybedtools.featurefuncs.greater_than, window_size-1)
val_region3 = val_region3.sort()
# head() prints the first rows itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
val_region3.head()
print(val_region3.count())

chr1	5020950	5024951	negative
 chr1	9647594	9651595	positive;neural_tube[5/8];hindbrain_(rhombencephalon)[7/8];midbrain_(mesencephalon)[6/8];dorsal_root_ganglion[4/8];trigeminal_V_(ganglion,_cranial)[6/8];cranial_nerve[5/8]
 chr1	11025292	11029293	negative
 chr1	12498493	12502494	negative
 chr1	12508534	12512535	positive;midbrain_(mesencephalon)[4/8];forebrain[5/8];nose[6/8]
 chr1	12614067	12618068	negative
 chr1	13003151	13007152	negative
 chr1	19765625	19769626	negative
 chr1	31101021	31105022	negative
 chr1	33988988	33992989	negative
 None
1208


In [37]:
# --- VISTA hg19 homologs -> mm10 validation ---#
val_file4 = "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/datasets/validation/VISTA/VISTA_hg19.mm10.lifted.bed"
# Reduce every record with featurefuncs.midpoint. rename_tissue is not
# applied to this file, so the name column is used as provided.
val_region4 = pybedtools.BedTool(val_file4).each(pybedtools.featurefuncs.midpoint)
# Extend by half a window on both sides, using the mm10 chromosome sizes.
val_region4 = val_region4.slop(b=window_size/2, genome="mm10")
# Keep only regions that pass greater_than(window_size-1); presumably this
# drops windows that could not be extended to full width -- TODO confirm.
val_region4 = val_region4.filter(pybedtools.featurefuncs.greater_than, window_size-1)
val_region4 = val_region4.sort()
# head() prints the first rows itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
val_region4.head()
print(val_region4.count())

chr1	6727774	6731775	positive;neural_tube[7/9];hindbrain_(rhombencephalon)[7/9];midbrain_(mesencephalon)[7/9];dorsal_root_ganglion[7/9];forebrain[7/9];trigeminal_V_(ganglion,_cranial)[6/9]
 chr1	18387523	18391524	negative
 chr1	18954393	18958394	negative
 chr1	19105223	19109224	positive;hindbrain_(rhombencephalon)[3/6];midbrain_(mesencephalon)[6/6]
 chr1	19556394	19560395	negative
 chr1	19697994	19701995	positive;hindbrain_(rhombencephalon)[14/20];midbrain_(mesencephalon)[11/20]
 chr1	20104953	20108954	negative
 chr1	20919296	20923297	positive;hindbrain_(rhombencephalon)[5/8];midbrain_(mesencephalon)[5/8]
 chr1	39441235	39445236	positive;heart[5/9]
 chr1	40941537	40945538	positive;heart[4/6]
 None
1903


In [38]:
# --- concatenate the four validation region sets --- #
# postmerge=False keeps every record as-is instead of merging overlapping ones.
validation_regions = val_region1
for extra_regions in (val_region2, val_region3, val_region4):
    validation_regions = validation_regions.cat(extra_regions, postmerge=False)
validation_regions.saveas("./mm10/mm10.validation_regions.bed")
validation_regions.count()

3244

In [39]:
# ---- constant & function declarations ---- #
def tissue_pos(feature, tissue):
    """Return True when ``feature.name`` mentions ``tissue``.

    The tissue is matched as a substring, either verbatim or with its spaces
    replaced by underscores.
    """
    underscored = tissue.replace(" ", "_")
    return (tissue in feature.name) or (underscored in feature.name)

def tissue_neg1(feature, tissue):
    """Return True when ``feature.name`` does NOT mention ``tissue``.

    A name counts as mentioning the tissue if it contains it as a substring,
    either verbatim or with spaces replaced by underscores.
    """
    candidates = (tissue, tissue.replace(" ", "_"))
    return not any(candidate in feature.name for candidate in candidates)
    
def tissue_neg2(feature, tissue):
    """Return True when ``feature.name`` contains the substring "negative".

    ``tissue`` is accepted only so the signature matches the other tissue
    filters; it is not used.
    """
    return "negative" in feature.name
    
def bigWigAverageOverBed(x, bigwig):
    """Return ``bigwig.stats`` over the interval ``x`` split into bins.

    The number of bins is ``int(window_size/10)``, read from the global
    ``window_size``; ``x`` must expose ``chrom``, ``start`` and ``stop``.
    """
    n_bins = int(window_size/10)
    return bigwig.stats(x.chrom, x.start, x.stop, nBins=n_bins)

def get_signal(region, bigwig):
    """Return a list with the binned ``bigwig`` signal of each interval in ``region``."""
    signals = []
    for interval in region:
        signals.append(bigWigAverageOverBed(interval, bigwig))
    return signals

# Tissue names as matched against the region annotations.
samples = [
    "forebrain",
    "heart",
    "hindbrain",
    "limb",
    "midbrain",
    "neural tube",
]
# Assay prefixes used in the bigWig / narrowPeak file names.
seq_names = [
    "ATAC-seq",
    "ChIP-seq.H3K27ac",
    "ChIP-seq.H3K4me3",
    "ChIP-seq.H3K9ac",
    "ChIP-seq.H3K4me1",
]
sample_dir = "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/datasets/validation/bigWig/"
# File-name form of each sample: same order, spaces replaced by underscores.
file_samples = [sample_name.replace(" ", "_") for sample_name in samples]

In [40]:
# ---- file organizations ---- #
# One inner list per sample (file_samples order), one path per assay
# (seq_names order): <sample_dir><sample>.<assay>.narrowPeak / .bigWig
peak_set = [
    [sample_dir + sam + "."+ seq + ".narrowPeak" for seq in seq_names]
    for sam in file_samples
]
track_set = [
    [sample_dir + sam + "."+ seq + ".bigWig" for seq in seq_names]
    for sam in file_samples
]

In [41]:
# ---- all cell-specific positive & negative signal extraction ---- #
# For every sample, X receives one array of binned signal with shape
# (n_regions, n_tracks, n_bins, 1) and Y the matching labels
# (1 for each positive region first, then 0 for each negative region).
# Negatives here are all regions whose name does not mention the sample.

X = []
Y = []
for i in range(len(samples)):

    # The region sets do not depend on the track, so filter once per sample.
    # saveas() writes the filtered stream to a temporary file so that it can
    # be iterated again for every track.
    pos_bed = pybedtools.BedTool("./mm10/mm10.validation_regions.bed").filter(tissue_pos, samples[i]).saveas()
    neg_bed = pybedtools.BedTool("./mm10/mm10.validation_regions.bed").filter(tissue_neg1, samples[i]).saveas()

    #organized per track for all samples
    signal_X = []
    for track in track_set[i]:
        # Open each bigWig once and always close it again.
        bigwig = pyBigWig.open(track)
        try:
            pos = get_signal(pos_bed, bigwig)
            neg = get_signal(neg_bed, bigwig)
        finally:
            bigwig.close()
        x_combined = pos + neg
        signal_X.append(x_combined)
    print(samples[i], len(pos), len(neg))

    #reorganize per sample for all tracks:
    # (n_tracks, n_regions, n_bins) -> (n_regions, n_tracks, n_bins)
    signal_reform_X = np.swapaxes(np.array(signal_X), 0, 1)
    # Append a trailing singleton axis. The previous axis=4 is out of range
    # for a 3-D array and is rejected by current NumPy releases.
    signal_reform_X = np.expand_dims(signal_reform_X, axis=-1)
    X.append(signal_reform_X)

    y_combined = [1] * len(pos) + [0] * len(neg)
    Y.append(np.array(y_combined))

with open("./mm10/mm10_all_signals.pickle", 'wb') as f:
    pickle.dump((X,Y), f)

forebrain 524 2720




heart 315 2929
hindbrain 412 2832
limb 348 2896
midbrain 454 2790
neural tube 284 2960


In [43]:
# ---- all cell-specific positive & true negative signal extraction ---- #
# For every sample, X receives one array of binned signal with shape
# (n_regions, n_tracks, n_bins, 1) and Y the matching labels
# (1 for each positive region first, then 0 for each negative region).
# Negatives here are only the regions whose name contains "negative".

X = []
Y = []
for i in range(len(samples)):

    # The region sets do not depend on the track, so filter once per sample.
    # saveas() writes the filtered stream to a temporary file so that it can
    # be iterated again for every track.
    pos_bed = pybedtools.BedTool("./mm10/mm10.validation_regions.bed").filter(tissue_pos, samples[i]).saveas()
    neg_bed = pybedtools.BedTool("./mm10/mm10.validation_regions.bed").filter(tissue_neg2, samples[i]).saveas()

    #organized per track for all samples
    signal_X = []
    for track in track_set[i]:
        # Open each bigWig once and always close it again.
        bigwig = pyBigWig.open(track)
        try:
            pos = get_signal(pos_bed, bigwig)
            neg = get_signal(neg_bed, bigwig)
        finally:
            bigwig.close()
        x_combined = pos + neg
        signal_X.append(x_combined)
    print(samples[i], len(pos), len(neg))

    #reorganize per sample for all tracks:
    # (n_tracks, n_regions, n_bins) -> (n_regions, n_tracks, n_bins)
    signal_reform_X = np.swapaxes(np.array(signal_X), 0, 1)
    # Append a trailing singleton axis. The previous axis=4 is out of range
    # for a 3-D array and is rejected by current NumPy releases.
    signal_reform_X = np.expand_dims(signal_reform_X, axis=-1)
    X.append(signal_reform_X)

    y_combined = [1] * len(pos) + [0] * len(neg)
    Y.append(np.array(y_combined))

with open("./mm10/mm10_all_signals.true_neg.pickle", 'wb') as f:
    pickle.dump((X,Y), f)

forebrain 524 1563




heart 315 1563
hindbrain 412 1563
limb 348 1563
midbrain 454 1563
neural tube 284 1563
