In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the promoter-DHS / distal-DHS pair table into a DataFrame.
# NOTE(review): the path is an absolute, user-specific location; it must exist
# on the machine running the notebook.
df = pd.read_csv(
    filepath_or_buffer="/home/joshscurll/data/3-Simon/promoterDHS_distalDHS_pairs.csv"
)

In [None]:
from Bio import SeqIO

# Define a function to convert a chromosome string to a genomic sequence
def chrom2seq(chrom):
    """Load the genomic sequence of one chromosome from its FASTA file.

    Parameters
    ----------
    chrom : str
        Chromosome name (e.g. "chr1"); it is substituted into the FASTA
        file name "<chrom>.fa".

    Returns
    -------
    The upper-cased sequence of the first record found in the file.

    Raises
    ------
    IndexError
        If the FASTA file contains no records.
    """
    fasta_path = "/home/joshscurll/data/3-Simon/hg38.analysisSet.chroms/%s.fa" % chrom
    # All records are parsed, but only the first one is used.
    records = list(SeqIO.parse(fasta_path, "fasta"))
    first_record = records[0]
    return first_record.seq.upper()

In [None]:
# Smoke test: load chromosome 1 and display the returned sequence object.
chrom2seq('chr1')

In [None]:
# Chromosome names whose FASTA files are loaded below.
CHROMS = [
    "chr1", "chr10", "chr11", "chr12", "chr13",
    "chr14", "chr15", "chr16", "chr17", "chr18",
    "chr19", "chr2", "chr20", "chr21", "chr22",
    "chr3", "chr4", "chr5", "chr6", "chr7",
    "chr8", "chr9", "chrM", "chrX", "chrY",
]

# Map each chromosome name to its upper-cased sequence (one FASTA read each).
CHROM2SEQ = {chrom: chrom2seq(chrom) for chrom in CHROMS}

In [None]:
# Example slice of chr1 used by the demo cells below: 925790 - 925589 = 201 bases.
seq = CHROM2SEQ['chr1'][925589:925790]
print(seq)

In [None]:
# Column index of each base in the one-hot encoding (column order: A, T, C, G)
base2row = {'A':0,'T':1,'C':2,'G':3}
def seq2onehot(seq):
    """One-hot encode a nucleotide sequence.

    Parameters
    ----------
    seq : iterable of single-character strings
        Upper-case bases. 'N' is allowed and encodes as an all-False row.

    Returns
    -------
    numpy.ndarray of bool, shape (len(seq), 4)
        Row i has True in the column given by ``base2row`` for base i.

    Raises
    ------
    KeyError
        If the sequence contains a symbol other than A, T, C, G or N.
    """
    onehot = np.zeros((len(seq), 4), dtype=bool)

    # Look up every non-N base first; an unknown symbol raises KeyError here.
    hot_cells = [(pos, base2row[letter])
                 for pos, letter in enumerate(seq)
                 if letter != 'N']

    if hot_cells:
        positions, columns = zip(*hot_cells)
        onehot[list(positions), list(columns)] = True
    return onehot

In [None]:
# Encode the example sequence and inspect the result.
seq1h = seq2onehot(seq)
# A bare expression in the middle of a cell is never displayed, so print the shape.
print(np.shape(seq1h))
# First nine encoded rows, followed by the matching bases for comparison.
print(seq1h[:9,:])
seq[:9]

In [None]:
# Define a function to find the reverse complement of a one-hot encoding
def onehot2rc(onehot):
    """Return the one-hot encoding of the reverse complement.

    Parameters
    ----------
    onehot : array-like, shape (length, 4)
        One-hot encoding with columns ordered A, T, C, G.

    Returns
    -------
    numpy.ndarray of bool, same shape as the input
        Rows in reversed order with columns 0<->1 (A<->T) and
        2<->3 (C<->G) exchanged. The input is not modified.
    """
    reversed_rows = np.flipud(onehot)  # reverse the sequence direction
    complement = np.zeros(reversed_rows.shape, dtype=bool)
    # Complement: destination columns (A, T, C, G) take source columns (T, A, G, C).
    complement[:, [0, 1, 2, 3]] = reversed_rows[:, [1, 0, 3, 2]]
    return complement

In [None]:
# Compare the first nine encoded bases with their reverse complement.
A = seq1h[:9]
B = onehot2rc(A)
print(A, B, sep="\n")

In [None]:
# (rows, columns) of the pair table
df.shape

In [None]:
# Take a reproducible sample. The size is clamped to the number of available
# rows so the cell cannot raise when fewer than SAMPLE_SIZE rows remain
# (e.g. when it is re-run after the length filter further down).
SAMPLE_SIZE = 10000
SAMPLE_SEED = 0  # fixed seed so "Restart & Run All" selects the same rows
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

# Create new columns for promoter length and distal length (end - start)
df["promoter_len"] = df["promoterDHSend"]-df["promoterDHSstart"]
df["distal_len"] = df["distalDHSend"]-df["distalDHSstart"]

In [None]:
# Preview the sampled rows, including the two new length columns.
df.head()

In [None]:
# Plot overlaid histograms of promoter and distal sequence lengths
plt.figure()  # was `plt.figure` without parentheses, which created nothing
plt.hist(df["promoter_len"],bins=30,color="r",alpha=0.4,label="promoter")
plt.hist(df["distal_len"],bins=30,color="b",alpha=0.4,label="distal")
plt.legend()
plt.xlabel("base pair")
plt.ylabel("count")
plt.show()

In [None]:
# Maximum sequence length; keep only pairs where BOTH the promoter and the
# distal element are strictly shorter than this.
seqlength = 2000
df = df[df["promoter_len"].lt(seqlength) & df["distal_len"].lt(seqlength)]

In [None]:
# Allocate the one-hot encoding arrays (X), zero-filled, with axes
# (pair, position, strand, base): strand 0 is filled with the forward
# encoding and strand 1 with the reverse complement in the next cell.
N = len(df)
encoding_promoter = np.zeros((N, seqlength, 2, 4), dtype=bool)
encoding_distal = np.zeros_like(encoding_promoter)

# Allocate the label column Y (one float per pair)
Y = np.zeros(shape=(N, 1), dtype=float)

In [None]:
# Define X (one-hot encoding) and Y (linked) for Keras
#
# For every pair, the promoter and distal sequences are cut out of the
# chromosome, one-hot encoded, and written left-aligned into the pre-allocated
# arrays: strand 0 holds the forward encoding, strand 1 the reverse complement.
# Positions beyond the sequence length stay all-False (zero padding).

# Pre-bind the temporaries so the clean-up `del` below cannot raise NameError
# when no row is ever encoded (empty df, or a KeyError on the very first row).
onehot_promoter = None
onehot_distal = None

row_no = 0
for _, row in df.iterrows():
    # An unknown chromosome name raises KeyError here, outside the try block.
    chrom_seq = CHROM2SEQ[row.chr]
    seq_promoter = chrom_seq[row.promoterDHSstart:row.promoterDHSend]
    seq_distal = chrom_seq[row.distalDHSstart:row.distalDHSend]
    try:
        onehot_promoter = seq2onehot(seq_promoter)
        onehot_distal = seq2onehot(seq_distal)
        encoding_promoter[row_no,:row.promoter_len,0,:] = onehot_promoter
        encoding_distal[row_no,:row.distal_len,0,:] = onehot_distal
        encoding_promoter[row_no,:row.promoter_len,1,:] = onehot2rc(onehot_promoter)
        encoding_distal[row_no,:row.distal_len,1,:] = onehot2rc(onehot_distal)
        Y[row_no] = row.linked
        row_no += 1
    except KeyError:
        # seq2onehot met a symbol it cannot encode: show the offending
        # sequences and stop so the problem can be inspected.
        print(seq_promoter)
        print(seq_distal)
        # Make the consequence of stopping early visible instead of silent.
        print("Encoding stopped after %d of %d rows; the remaining rows are left zero-filled"
              % (row_no, len(df)))
        break

# Release the per-row temporaries
del onehot_promoter
del onehot_distal

In [None]:
# Shapes of the two encoding arrays and of the label array, one per line
print(encoding_promoter.shape, encoding_distal.shape, Y.shape, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

"""Test whether we can predict promoter and distal sequences using model"""

# Stack the promoter encodings (label True) on top of the distal encodings
# (label False). Note that this replaces the `Y` built from `linked` above.
X = np.concatenate((encoding_promoter, encoding_distal), axis=0)
Y = np.concatenate((np.ones((N, 1), dtype=bool),
                    np.zeros((N, 1), dtype=bool)), axis=0)

# Create training, development and test data sets:
# hold out 25% for test, then 33% of the remainder for development.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)

X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.33)


"""
# Create training, development and test data sets

train_idx, test_idx, y_train, y_test = train_test_split(
    np.arange(N), Y, test_size=0.25)

train_idx, dev_idx, y_train, y_dev = train_test_split(
    train_idx, y_train, test_size=0.33)

xp_train = encoding_promoter[train_idx,:,:]
xp_dev = encoding_promoter[dev_idx,:,:]
xp_test = encoding_promoter[test_idx,:,:]

xd_train = encoding_distal[train_idx,:,:]
xd_dev = encoding_distal[dev_idx,:,:]
xd_test = encoding_distal[test_idx,:,:]
"""

In [None]:
import keras 
from keras import backend as K

import numpy as np

from keras.layers import Dense, Dropout, Input
from keras.layers import Conv1D, Conv2D, GlobalMaxPooling1D, GlobalMaxPooling2D, MaxPooling2D
from keras.models import Model

from keras.datasets import mnist
from keras.utils import to_categorical

from keras.callbacks import EarlyStopping

In [None]:
# Define some functions for the model

def reluConv2D(x, filters, kernel_size, name):
    """Apply a ReLU-activated, 'same'-padded Conv2D layer to ``x``.

    The kernel spans ``kernel_size`` steps along the first spatial axis and
    1 step along the second. For the (seqlength, 2, 4) input used in this
    notebook, that means ``kernel_size`` sequence positions on one strand.

    Parameters
    ----------
    x : tensor to convolve
    filters : int, number of convolution filters
    kernel_size : int, kernel extent along the first spatial axis
    name : str, Keras layer name

    Returns
    -------
    The output tensor of the convolution layer.
    """
    conv_layer = Conv2D(
        filters=filters,
        kernel_size=(kernel_size, 1),
        padding='same',
        activation='relu',
        name=name,
    )
    return conv_layer(x)

def gmp2d(x):
    """Apply a fresh GlobalMaxPooling2D layer to ``x`` and return its output."""
    pooling_layer = GlobalMaxPooling2D()
    return pooling_layer(x)

"""
def columnwise_mp(x):
    return MaxPooling2D(pool_size=(int(np.shape(x)[1]),1))(x)
    #return MaxPooling2D(pool_size=(2000,1))(x)

def reluConv1D(x, filters, kernel_size, name):
    return Conv1D(filters=filters, kernel_size=kernel_size, 
                  activation='relu', padding='same', name=name)(x)

def gmp1d(x):
    return GlobalMaxPooling1D()(x)
"""

def convMP(x, filters, kernel_size, name):
    """Convolve ``x`` with ``reluConv2D``, then global-max-pool the result.

    ``filters``, ``kernel_size`` and ``name`` are passed straight through to
    ``reluConv2D``; the pooled output of ``gmp2d`` is returned.
    """
    feature_maps = reluConv2D(x, filters, kernel_size, name)
    return gmp2d(feature_maps)

In [None]:
# (pairs, seqlength, strands, bases)
encoding_promoter.shape

In [None]:
"""
import tensorflow as tf
X = tf.stack(encoding_promoter[:10,:,:,:].astype(float))
Xconv = reluConv2D(X,3,4,'TestConv')
"""

In [None]:
"""
Xconvgmp = gmp2d(Xconv)
np.shape(Xconvgmp)
"""

In [None]:
"""
Xconvmp = columnwise_mp(Xconv)
np.shape(Xconvmp)
"""

In [None]:
# Specify the model: one conv + global-max-pool "fingerprint" (16 filters,
# kernel length 6) followed by a single sigmoid unit (logistic regression).
onehot = Input(shape=(seqlength, 2, 4), name='promoterDHSoh')
fingerprint = convMP(x=onehot, filters=16, kernel_size=6, name='ConvPromoterDHS')
prob_association = Dense(
    units=1,
    activation='sigmoid',
    name='logistic_regr',
)(fingerprint)

In [None]:
# Build the Keras model from the input tensor and the sigmoid output tensor
# defined in the previous cell, then print its layer summary.
model = Model(inputs=onehot,outputs=prob_association)
model.summary()

In [None]:
# Fraction (between 0 and 1, not a percentage) of nonzero entries in Y
pc = np.count_nonzero(Y) / len(Y)
print(pc)

In [None]:
from keras.optimizers import Adam
from keras import metrics

# Compile the model: binary cross-entropy loss, Adam with default settings,
# and MSE plus accuracy reported as extra metrics.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['mse', 'accuracy'],
)

In [None]:
"""
Fit the model using a dictionary to weight class 1 more highly than class 0
so that both classes are represented equally in the model fitting
"""
# Weight of class 1 relative to class 0: (1 - pc) / pc, capped at 50.
# pc is the fraction of positives; when it is 0 the ratio is undefined, so
# fall back to the cap instead of raising ZeroDivisionError.
positive_weight = min(1. / pc - 1., 50) if pc > 0 else 50
class_weight = {0: 1, 1: positive_weight}

# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras. Training stops once the monitored validation metric has not improved
# for 5 consecutive epochs.
model.fit(X_train, y_train, batch_size=256, epochs=1000,
          validation_data=(X_dev, y_dev), class_weight=class_weight,
          callbacks=[EarlyStopping(patience=5)])

In [None]:
"""
Visualizing the filters
"""
# Kernel weights of the convolution layer. Per Keras convention these are
# presumably shaped (kernel_size, 1, bases, n_filters) — the shape is displayed
# in the next cell.
filters = model.get_layer('ConvPromoterDHS').get_weights()[0]
n_filters = np.shape(filters)[3]
n_rows = int(np.ceil(n_filters / 4))  # grid with 4 panels per row

fig, axs = plt.subplots(n_rows, 4, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = axs.ravel()
for i in range(n_filters):
    # One panel per filter, transposed to (bases, kernel positions)
    im = axs[i].imshow(filters[:,0,:,i].T)
    plt.colorbar(im, ax=axs[i])
# Hide the panels left over when n_filters is not a multiple of 4
for unused_ax in axs[n_filters:]:
    unused_ax.axis('off')
# Title the whole figure; plt.title would only title the current (last) panel
fig.suptitle('Filters')
plt.show()

In [None]:
# Shape of the convolution kernel array
filters.shape

In [None]:
"""
Get the beta values
"""
# First weight array of the 'logistic_regr' Dense layer — presumably its
# kernel (one coefficient per convolution filter), with the bias stored at
# index 1 per Keras convention; TODO confirm.
beta=model.get_layer('logistic_regr').get_weights()[0]
print(beta)

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

In [None]:
"""
ROC score
"""
# Score the development set and report the area under the ROC curve
print('Predicting on development data...')
y_score = model.predict(x=X_dev)
score = roc_auc_score(
    y_true=y_dev,
    y_score=y_score,
    average='macro',
    sample_weight=None,
)
print(score)

In [None]:
from sklearn.metrics import roc_curve, auc

"""
ROC score
"""
# ROC curve on the development set. The thresholds are not needed; they get an
# explicit name so that IPython's special `_` (last output) is not overwritten.
fpr, tpr, _thresholds = roc_curve(y_dev, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
# The curve's label is only shown if a legend is drawn
plt.legend(loc='lower right')
plt.show()
print('AUC: %f' % roc_auc)