In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Load the promoter-DHS / distal-DHS pair table and draw a random subsample of rows.
df = pd.read_csv("~/data/3-Simon/promoterDHS_distalDHS_pairs.csv")
# A fixed seed makes the subsample identical on every Restart & Run All; capping n
# keeps sample() from raising when the table holds fewer than 50000 rows.
df = df.sample(n=min(50000, len(df)), random_state=42)
# Interval lengths (end - start) of each DHS, used below for filtering and encoding.
df['promoter_len'] = df['promoterDHSend']-df['promoterDHSstart']
df['distal_len'] = df['distalDHSend'] - df['distalDHSstart']

In [None]:
# Preview the first rows of the table.
df.head()

In [None]:
# Overlaid histograms of promoter and distal DHS lengths.
plt.figure()  # the original `plt.figure` lacked parentheses and never created a figure
plt.hist(df['promoter_len'],bins=30,color='g',alpha=0.4,label='promoter')
plt.hist(df['distal_len'],bins=30,color='r',alpha=0.4, label='distal')
plt.xlabel('DHS length (end - start)')
plt.ylabel('count')
plt.legend()
plt.show()

In [None]:
# Keep only the pairs whose promoter and distal DHS are both strictly shorter than `seqlength`.
seqlength = 2000
promoter_is_short = df['promoter_len'] < seqlength
distal_is_short = df['distal_len'] < seqlength
df = df[promoter_is_short & distal_is_short]

In [None]:
# Integer label vector: one entry per row of df (its `linked` column), in row order.
# Vectorized replacement for the row-by-row iterrows() loop; np.array copies, so Y
# does not share memory with the DataFrame.
Y = np.array(df['linked'], dtype=int)

In [None]:
# Import the function explicitly: a bare `import sklearn` is not guaranteed to load the
# `sklearn.utils.class_weight` submodule.
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights for the classes 0 and 1 (index 0 -> class 0, index 1 -> class 1).
# Keyword arguments are required by current scikit-learn and also accepted by older releases.
class_weight_vec = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=Y)

In [None]:
# Show the two class weights (class 0 first, then class 1).
print(class_weight_vec)

In [None]:
# Pre-allocate all-False boolean tensors that will receive the one-hot encodings:
# one row per pair, seqlength*2 positions, 4 channels.
N = len(df)
encoding_shape = (N, seqlength*2, 4)
encoding_promoter = np.zeros(encoding_shape, dtype=bool)
encoding_distal = np.zeros(encoding_shape, dtype=bool)

In [None]:
from Bio import SeqIO

def chrom2seq(chrom, genome_dir="/home/hildurk/data/3-Simon/hg38.analysisSet.chroms"):
    """
    Convert chromosome string "chrom" to a genomic sequence

    Reads the first record of "<genome_dir>/<chrom>.fa" and returns its sequence
    upper-cased.

    Parameters
    ----------
    chrom : str
        Chromosome name; also the base name of the FASTA file (e.g. "chr1").
    genome_dir : str, optional
        Directory holding the per-chromosome FASTA files. Defaults to the
        location originally hard-coded here; "~" is expanded.

    Returns
    -------
    The upper-cased `.seq` of the first FASTA record in the file.

    Raises
    ------
    IndexError
        If the file contains no FASTA record.
    """
    import os

    fasta_path = os.path.join(os.path.expanduser(genome_dir), "%s.fa" % chrom)
    # Pull only the first record instead of building a list of every record,
    # and close the file deterministically.
    with open(fasta_path) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        raise IndexError("no FASTA record found in %s" % fasta_path)
    return first_record.seq.upper()

In [None]:
# Try the loader on a single chromosome and display the result.
chrom2seq('chr1')

In [None]:
# Chromosome names for which a sequence is loaded
CHROMS = [
    "chr1", "chr10", "chr11", "chr12", "chr13",
    "chr14", "chr15", "chr16", "chr17", "chr18",
    "chr19", "chr2", "chr20", "chr21", "chr22",
    "chr3", "chr4", "chr5", "chr6", "chr7",
    "chr8", "chr9", "chrM", "chrX", "chrY",
]

# Map every chromosome name to its sequence (each FASTA file is read once, here)
CHROM2SEQ = {chrom: chrom2seq(chrom) for chrom in CHROMS}

In [None]:
# Take a 201-base example slice (925589:925790) of chr1.
seq = CHROM2SEQ['chr1'][925589:925790]

In [None]:
# Show the example slice.
print(seq)

In [None]:
import numpy as np

# Channel (column) index of each nucleotide in the one-hot encoding.
base2row = {'A':0,'T':1,'C':2,'G':3}
def double_seq2onehot(seq):
    """
    Create a one-hot encoding of a nucleotide sequence followed by its reverse

    Rows 0..len(seq)-1 encode `seq` in the given order; rows len(seq)..2*len(seq)-1
    encode the same bases in reversed order. Columns follow `base2row` (A, T, C, G).

    Bases are matched case-insensitively. Any symbol that is not A/T/C/G ('N' and
    other ambiguity codes) is left as an all-False row; only 'N' used to be tolerated,
    every other symbol raised KeyError.

    NOTE(review): the second half is the plain reverse of `seq`, not its reverse
    complement. Confirm that this is the intended strand handling.

    Parameters
    ----------
    seq : sequence of single-character bases (e.g. str)

    Returns
    -------
    numpy.ndarray of bool with shape (2*len(seq), 4)
    """

    n_bases = len(seq)
    A = np.zeros((n_bases*2,4),dtype=bool)

    for i, base in enumerate(seq):
        channel = base2row.get(str(base).upper())
        if channel is None:
            continue
        A[i,channel] = True
        # Base i of the forward copy sits at index n_bases-1-i of the reversed
        # copy, which starts at row n_bases.
        A[2*n_bases-1-i,channel] = True

    return A

In [None]:
# Fill the pre-allocated tensors: row c receives the doubled one-hot encoding of the
# c-th pair of df, left-aligned; positions past the encoding keep their initial False.
pair_columns = zip(df['chr'], df['promoterDHSstart'], df['promoterDHSend'],
                   df['distalDHSstart'], df['distalDHSend'])
for c, (chrom, prom_start, prom_end, dist_start, dist_end) in enumerate(pair_columns):
    chrom_seq = CHROM2SEQ[chrom]
    promoter_onehot = double_seq2onehot(chrom_seq[prom_start:prom_end])
    distal_onehot = double_seq2onehot(chrom_seq[dist_start:dist_end])
    # Size the target slice from the encoding itself, not from the *_len columns,
    # so an interval clipped at the chromosome end cannot cause a shape mismatch.
    encoding_promoter[c,:len(promoter_onehot),:] = promoter_onehot
    encoding_distal[c,:len(distal_onehot),:] = distal_onehot
    

In [None]:
import keras

We'll start from a logistic regression of whether a distal region is an enhancer.

In [None]:
from keras.layers import Input, Conv1D, GlobalMaxPooling1D, Dense

In [None]:
from keras.models import Model

In [None]:
# Symbolic model inputs for the two one-hot tensors ("oh" in the layer names = one hot).
onehot_input_shape = (seqlength*2, 4)
one_hot_encoding_promoter = Input(shape=onehot_input_shape, name='promDHSoh')
one_hot_encoding_distal = Input(shape=onehot_input_shape, name='distalDHSoh')

In [None]:
nb_filters = 20  # temp, we'll change it later
filter_len = 6   # temp, we'll change it later


def _conv_and_pool(one_hot_input, conv_name):
    """Run a same-padded ReLU Conv1D named `conv_name`, then global max pooling.

    Returns the pair (convolution output, pooled output).
    """
    conv_output = Conv1D(nb_filters, filter_len, padding='same',
                         name=conv_name, activation='relu')(one_hot_input)
    return conv_output, GlobalMaxPooling1D()(conv_output)


# Distal branch first, then promoter branch (same layer creation order as before).
conv_layer_distal, fingerprint_distal = _conv_and_pool(one_hot_encoding_distal, 'distalDHSconv')
conv_layer_promoter, fingerprint_promoter = _conv_and_pool(one_hot_encoding_promoter, 'promDHSconv')

# Concatenate both pooled vectors and feed them to a single sigmoid unit
# (one neuron logistic regression).
merged_vector = keras.layers.concatenate([fingerprint_distal, fingerprint_promoter], axis=-1)
prob_association = Dense(1, activation='sigmoid', name='logistic_regression')(merged_vector)

In [None]:
# Two-input model. The input order is [distal, promoter]; every list of arrays passed
# to fit/predict has to follow this same order.
model = Model(inputs=[one_hot_encoding_distal,one_hot_encoding_promoter],outputs=prob_association)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
from keras.optimizers import Adam

In [None]:
# Adam with its default settings; binary cross-entropy pairs with the single sigmoid output.
model.compile(optimizer=Adam(),loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split row positions (not the arrays themselves) 80/20 with a fixed seed, so the same
# index lists can be used to slice both encoding tensors.
train_ind, test_ind, y_train, y_test = train_test_split(range(len(Y)), Y, test_size=0.20, random_state=42)

In [None]:
# Slice both encoding tensors with the train/test row positions.
X_promoter_train, X_promoter_test = encoding_promoter[train_ind], encoding_promoter[test_ind]
X_distal_train, X_distal_test = encoding_distal[train_ind], encoding_distal[test_ind]

In [None]:
from keras.callbacks import EarlyStopping

# Per-class loss weights and an early-stopping callback with a patience of 5 epochs.
fit_class_weights = {0: class_weight_vec[0], 1: class_weight_vec[1]}
early_stopping = EarlyStopping(patience=5)

# Inputs are given in the model's [distal, promoter] order; 20% of the training
# data is held out for validation.
model.fit(
    [X_distal_train, X_promoter_train],
    y_train,
    batch_size=256,
    epochs=50,
    validation_split=0.2,
    class_weight=fit_class_weights,
    callbacks=[early_stopping],
)

In [None]:
"""
Visualizing the distal filters
"""
# Kernel of the distal convolution, indexed as [position, input channel, filter].
filters=model.get_layer('distalDHSconv').get_weights()[0]
fig, axs = plt.subplots(3,4, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = axs.ravel()
# Show one filter per panel; never index past the number of filters actually present.
for i in range(min(len(axs), filters.shape[-1])):
    # Transposed so that rows are the 4 input channels and columns the filter positions.
    im=axs[i].imshow(filters[:,:,i].T)
    axs[i].set_yticks(range(4))
    axs[i].set_yticklabels(['A','T','C','G'])
    plt.colorbar(im,ax=axs[i])
# suptitle titles the whole figure; plt.title only titled the last panel.
fig.suptitle('Filters')
plt.show()

In [None]:
"""
Visualizing the promoter filters
"""
# Kernel of the promoter convolution, indexed as [position, input channel, filter].
filters=model.get_layer('promDHSconv').get_weights()[0]
fig, axs = plt.subplots(3,4, figsize=(15, 6), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = axs.ravel()
# Show one filter per panel; never index past the number of filters actually present.
for i in range(min(len(axs), filters.shape[-1])):
    # Transposed so that rows are the 4 input channels and columns the filter positions.
    im=axs[i].imshow(filters[:,:,i].T)
    # The former `labels[1] = 'Testing'` / set_xticklabels(labels) lines used an
    # undefined name and raised NameError; they are removed.
    axs[i].set_yticks(range(4))
    axs[i].set_yticklabels(['A','T','C','G'])
    plt.colorbar(im,ax=axs[i])
# suptitle titles the whole figure; plt.title only titled the last panel.
fig.suptitle('Filters')
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

In [None]:
# First weight array of the final layer (for a Keras Dense layer this is the kernel;
# the bias follows at index 1).
beta = model.get_layer('logistic_regression').get_weights()[0]

In [None]:
# Display the logistic-regression weights.
beta

In [None]:
# ROC AUC on the held-out test split.
# Inputs are listed in the [distal, promoter] order the model was built with.
test_inputs = [X_distal_test, X_promoter_test]
print('Predicting on test data')
y_score = model.predict(test_inputs)
score = roc_auc_score(y_test, y_score, average='macro', sample_weight=None)
print(score)

In [None]:
import scipy.stats as ss

In [None]:
# Display the predicted scores, transposed.
y_score.T

In [None]:
# Pearson correlation between the test labels and the predicted scores (first column of y_score).
predicted_scores = np.array(y_score[:, 0])
ss.pearsonr(y_test, predicted_scores)

In [None]:
# Scatter of predicted score against test label; points are nearly transparent
# so that dense regions remain visible.
fig, ax = plt.subplots()
ax.scatter(y_test, np.array(y_score[:, 0]), alpha=0.01, marker='.')
plt.show()

In [None]:
# look at Keras Merge for enhancers and promoters

In [None]:
# The model was built with inputs ordered [distal, promoter]. Both tensors have the same
# shape, so passing them swapped runs without error but feeds each branch the wrong data.
y_predict = model.predict([X_distal_test,X_promoter_test])

In [None]:
# Threshold the predicted probabilities at 0.5 into hard 0/1 labels. The original
# assignment only raised values above 0.5 to 1 and left the rest as probabilities.
y_predict = (y_predict > 0.5).astype(y_predict.dtype)

In [None]:
# Label of the first test example.
y_test[0]

In [None]:
# Display the predictions.
y_predict

In [None]:
# Histogram of the label vector Y.
fig, ax = plt.subplots()
ax.hist(Y)
plt.show()


In [None]:

from sklearn.metrics import roc_curve, auc
"""
ROC score
"""
# ROC curve of the test-set scores and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
# The curve carries a label, but without legend() it was never displayed.
plt.legend(loc='lower right')
plt.show()
print('AUC: %f' % roc_auc)

In [None]:
# Per-epoch metrics recorded by the last fit() call.
history = model.history.history
# Older Keras logs the accuracy metric as 'acc', newer releases as 'accuracy'.
acc_key = 'acc' if 'acc' in history else 'accuracy'

# summarize history for accuracy
plt.figure()
plt.plot(history[acc_key])
plt.plot(history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from the validation split of fit(), not from the test set.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.figure()
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()