In [1]:
#-----import packages-----#

#common python packages
import numpy as np
import string
import random
import os
import pickle
import argparse
import wget
import math
import tempfile
import matplotlib.pyplot as plt
from datetime import datetime

#machine learning packages
import sklearn
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

import tensorflow as tf

from keras.regularizers import l2
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.utils import shuffle
%matplotlib inline

# GPU / backend selection through environment variables: order CUDA devices
# by PCI bus id, expose only device "0", and request the TensorFlow backend.
# NOTE(review): this runs after tensorflow and keras were imported above;
# KERAS_BACKEND is presumably only consulted at import time — confirm whether
# this block needs to move ahead of those imports.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    'KERAS_BACKEND': 'tensorflow',
})

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
# -----parsing command line arguments-----#
# Two options are defined: a comma separated list of cell types and the
# directory that holds the encoded <cell>.<seq>.{pos,neg}.tsv files.
parser = argparse.ArgumentParser(description='Training CNN model to predict STARR-seq enhancers based on chromatin accessbility and histone marks')
parser.add_argument('-c', '--cell_types', type=str, help='comma separated string of cell_types')
parser.add_argument('-i', '--in_dir', type=str, help='directory containing 01_data_encoding intermediate tsv files')

#simulate command line input (a notebook has no real argv to hand to the parser)
cmdline_str='-c ' + " HepG2,K562,A549,HCT116,MCF-7 " + \
    ' -i ' + "/gpfs/ysm/scratch60/gerstein/zc264/ChromVar/enhancer-prediction/encode/dev/encoded_2overlap/DNase/"

# signal tracks that must be present for every cell type
seq_names = ["DNase", "H3K27ac", "H3K4me3", "H3K9ac", "H3K4me1"]

args = parser.parse_args(cmdline_str.split())
args.cell_types = args.cell_types.split(",")

#check if the files are there
# Every missing file is reported (not just the first one), and the cell fails
# with an exception instead of exit(): exit() is an interactive-shell helper
# and does not reliably abort a notebook cell with an error.
missing_files = []
for cell in args.cell_types:
    for seq in seq_names:
        # os.path.join keeps the paths valid whether or not in_dir ends in "/"
        pos_file = os.path.join(args.in_dir, cell + "." + seq + ".pos.tsv")
        neg_file = os.path.join(args.in_dir, cell + "." + seq + ".neg.tsv")
        for path in (pos_file, neg_file):
            if not os.path.exists(path):
                print(path + " file does not exist")
                missing_files.append(path)

if missing_files:
    raise FileNotFoundError(
        str(len(missing_files)) + " required input file(s) missing, first: " + missing_files[0]
    )
print("all files found!")

all files found!


In [3]:
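# NOTE(review): disabled loader kept for reference. It builds X, y from the
# per-cell <cell>.<seq>.pos/.neg TSV files; the next cell reads X, y from
# hg38_signals.pickle instead (presumably a cached copy of this output — confirm).
# def get_data(cell_types, in_dir, seq_names):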

#     first_cell = True
#     for cell in cell_types:
#         print(cell)

#         pos = []
#         neg = []
#         first_seq = True
#         for seq in seq_names:
#             print("-"+seq)

#             pos_name = in_dir+cell+"."+seq+".pos.tsv"
#             pos_mat = np.loadtxt(pos_name, delimiter='\t')

#             neg_name = in_dir+cell+"."+seq+".neg.tsv"
#             neg_mat = np.loadtxt(neg_name, delimiter='\t')

#             if first_seq:
#                 for i in pos_mat:
#                     pos.append(np.array([i]))
#                 for i in neg_mat:
#                     neg.append(np.array([i]))
#                 first_seq = False
#             else:
#                 for i in range(len(pos)):
#                     pos[i] = np.vstack((pos[i], pos_mat[i,]))
#                 for i in range(len(neg)):
#                     neg[i] = np.vstack((neg[i], neg_mat[i,]))

#         if first_cell == True:
#             X_pos = np.array(pos)
#             X_neg = np.array(neg)
#             first_cell = False
#         else:
#             X_pos = np.vstack((X_pos, pos))
#             X_neg = np.vstack((X_neg, neg))

#     X = np.vstack((X_pos, X_neg))
#     y = np.array([1 for i in range(X_pos.shape[0])] + [0 for i in range(X_neg.shape[0])]).reshape(-1,1)
#     print(X.shape)
#     print(y.shape)
    
#     return X, y

# X, y = get_data(args.cell_types, args.in_dir, seq_names)

In [4]:
# Read the (X, y) pair stored in hg38_signals.pickle under the input directory.
# NOTE(review): pickle.load can execute code embedded in the file — only point
# this at files you trust.
signal_pickle = args.in_dir + "hg38_signals.pickle"
with open(signal_pickle, 'rb') as handle:
    X, y = pickle.load(handle)

# Keep indices 125..274 of the last axis, then flatten every sample to one row.
X = X[:, :, 125:275]
X = X.reshape(len(X), -1)
#X = X.sum(axis=1)

# Labels as a flat 1-D vector.
y = y.ravel()

for arr in (X, y):
    print(arr.shape)

(146542, 750)
(146542,)


In [5]:
#kfold division of the data
kf = sklearn.model_selection.KFold(n_splits=5, shuffle=True)

#collect the output of the kfolds
history_list = []
y_pred_list = []
y_test_list = []
accuracy_list = []
auROC_list = []
auPRC_list = []

kskip = 0

In [9]:
#iterate over each fold of data
# For every train/test split produced by `kf`: fit a fresh single-unit,
# L2-regularised model with a sigmoid output, predict on the held-out fold and
# append the fold's history, predictions, labels and metrics to the
# accumulator lists.
for train_index, test_index in kf.split(y):

    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # a new model per fold, so no weights carry over between folds
    model = Sequential()
    model.add(Dense(1, kernel_regularizer=l2(0.01), activation="linear"))
    model.add(Activation("sigmoid"))
    model.compile(loss='binary_crossentropy',
          optimizer='adadelta',
          metrics=['accuracy'])

    #train the model
    history_list.append(model.fit(x_train, y_train, validation_split=0.0, epochs=10, batch_size=128))

    # predict the results (flattened to a 1-D vector of sigmoid outputs)
    y_pred = model.predict(x_test).ravel()
    y_pred_list.append(y_pred)
    y_test_list.append(y_test.ravel())

    # accuracy is defined on hard class calls, so round the scores to 0/1
    accuracy_s = sklearn.metrics.accuracy_score(y_test, np.rint(y_pred))
    accuracy_list.append(accuracy_s)

    # auROC / auPRC are ranking metrics: they must be given the continuous
    # scores. Rounding first collapses each curve to a single operating point
    # and misreports the area. The mouse-validation cell already passes raw
    # scores, so this also keeps both evaluations comparable.
    auroc_s = sklearn.metrics.roc_auc_score(y_test, y_pred)
    auROC_list.append(auroc_s)

    auprc_s = sklearn.metrics.average_precision_score(y_test, y_pred)
    auPRC_list.append(auprc_s)

    #iterate k fold counter
    kskip = kskip + 1

    #delete the model so the variable is cleared
    del model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Write the per-fold auROC and auPRC values to CSV files in the working directory.
fold_metric_files = (
    ("svm.auROC.csv", auROC_list),
    ("svm.auPRC.csv", auPRC_list),
)
for out_path, fold_scores in fold_metric_files:
    np.savetxt(out_path, np.array(fold_scores), delimiter=",")

## Mouse Validation

In [5]:
# Fit one model on all of X, y (no held-out split), using the same
# architecture and training settings as the cross-validation loop.
model = Sequential([
    Dense(1, kernel_regularizer=l2(0.01), activation="linear"),
    Activation("sigmoid"),
])
model.compile(
    optimizer='adadelta',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# left as the last expression so the cell still displays the History object
model.fit(X, y, batch_size=128, epochs=10, validation_split=0.0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b7af6853be0>

In [11]:
# Load the mm10 (X, Y) pair and give every tissue's array the same crop and
# flattening that was applied to the training data.
# NOTE(review): this rebinds X, which held the training matrix until now;
# re-running the fitting cell after this one would see the mm10 data instead.
mm10_pickle = "./mm10/mm10_all_signals.pickle"
with open(mm10_pickle, 'rb') as handle:
    X,Y = pickle.load(handle)

samples = ["forebrain", "heart", "hindbrain", "limb", "midbrain", "neural tube"]

for tissue_idx in range(len(samples)):
    tissue = X[tissue_idx]
    # swap entries 0 and 2 along axis 1, in place
    # (presumably the mm10 track order differs from seq_names — confirm)
    tissue[:, [0, 2], :] = tissue[:, [2, 0], :]
    # keep indices 125..274 of the last axis, then one flat row per sample
    X[tissue_idx] = tissue[:, :, 125:275].reshape(tissue.shape[0], -1)

In [12]:
# Score every mm10 tissue with the fitted model and collect one accuracy,
# auROC and auPRC value per tissue (in the order of `samples`).
# These names are reset here, replacing the cross-validation lists.
y_pred_list, accuracy_list, auroc_list, auprc_list = [], [], [], []

for tissue_idx in range(len(samples)):
    labels = Y[tissue_idx]
    y_pred = model.predict(X[tissue_idx]).ravel()
    y_pred_list.append(y_pred)

    # accuracy on rounded (0/1) calls; the two area metrics on the raw scores
    hard_calls = np.rint(y_pred)
    accuracy_list.append(sklearn.metrics.accuracy_score(labels, hard_calls))
    auprc_list.append(sklearn.metrics.average_precision_score(labels, y_pred))
    auroc_list.append(sklearn.metrics.roc_auc_score(labels, y_pred))

In [14]:
# Report the per-tissue metrics: accuracies first (unlabelled), then the two
# area-under-curve lists with their labels.
print(accuracy_list)
labelled_metrics = (
    ("auroc for all mm10 tissues:", auroc_list),
    ("auprc for all mm10 tissues:", auprc_list),
)
for label, values in labelled_metrics:
    print(label, values)

[0.36189889025893957, 0.3788532675709001, 0.3104192355117139, 0.30456226880394577, 0.3557336621454994, 0.2697287299630086]
auroc for all mm10 tissues: [0.7923864082846879, 0.7985649796506744, 0.7700795519993417, 0.7601021226265321, 0.7943201016847456, 0.7707324181575942]
auprc for all mm10 tissues: [0.3865779292114275, 0.2686884213177673, 0.31919411401143205, 0.2695298671174265, 0.3413075079236296, 0.2333350582516644]
