In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the promoter/distal DHS pair table.
df = pd.read_csv("/home/hildurk/data/3-Simon/promoterDHS_distalDHS_pairs.csv")
# Subsample to keep the run time manageable. The sample size is capped at the
# table size so the call cannot fail on a small input, and the seed is fixed
# so that a kernel restart reproduces the same subset.
df = df.sample(n=min(50000, len(df)), random_state=1337)
# Region lengths in base pairs (end - start).
df["promoter_len"] = df["promoterDHSend"] - df["promoterDHSstart"]
df["distal_len"] = df["distalDHSend"] - df["distalDHSstart"]

In [None]:
# Preview the first rows of the pair table, including the derived length columns.
df.head()

In [None]:
from Bio import SeqIO

def chrom2seq(chrom):
    """
    Load the genomic sequence for one chromosome.

    Parses the FASTA file "<chrom>.fa" from the hg38 analysis-set directory
    and returns the sequence of its first record, converted to upper case.
    """
    fasta_path = "/home/hildurk/data/3-Simon/hg38.analysisSet.chroms/%s.fa" % chrom
    records = list(SeqIO.parse(fasta_path, "fasta"))
    first_record = records[0]
    return first_record.seq.upper()

In [None]:
# Compare the length distributions of promoter and distal DHS regions.
# `plt.figure` must be called: the bare attribute reference was a no-op, so the
# histograms were drawn into whatever figure happened to be current.
plt.figure()
plt.hist(df["promoter_len"],bins=30,color="g",alpha=0.4,label="promoter")
plt.hist(df["distal_len"],bins=30,color="b",alpha=0.4,label="distal")
plt.legend()
plt.xlabel("Base Pair")
plt.ylabel("Count")
plt.show()

In [None]:
# Fixed window length (bp) of the one-hot encoding; pairs in which either
# region is not strictly shorter than this are dropped.
seqlength = 2000
promoter_fits = df["promoter_len"] < seqlength
distal_fits = df["distal_len"] < seqlength
df = df[promoter_fits & distal_fits]

In [None]:
# One (seqlength x 4) boolean one-hot matrix per remaining pair, for each region type.
N = len(df)
encoding_promoter = np.zeros((N, seqlength, 4), dtype=bool)
encoding_distal = np.zeros_like(encoding_promoter)

In [None]:
# Smoke-test the loader: read chr1 and display the returned sequence object.
chrom2seq('chr1')

In [None]:
# Chromosomes whose sequences are loaded
CHROMS = [
    "chr1", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17",
    "chr18", "chr19", "chr2", "chr20", "chr21", "chr22", "chr3", "chr4", "chr5", "chr6",
    "chr7", "chr8", "chr9", "chrM", "chrX", "chrY",
]

# Map each chromosome name to its upper-cased sequence (reads one FASTA file per entry)
CHROM2SEQ = {chrom: chrom2seq(chrom) for chrom in CHROMS}

In [None]:
# Example 201-base slice of chr1, used below to check the one-hot encoder.
seq = CHROM2SEQ['chr1'][925589:925790]

In [None]:
# Show the example sequence as text.
print(seq)

In [None]:
import numpy as np

# Column index of each nucleotide in the one-hot matrix
base2row = {'A': 0, 'T': 1, 'C': 2, 'G': 3}


def seq2onehot(seq):
    """
    One-hot encode a nucleotide sequence.

    Returns a boolean array of shape (len(seq), 4) whose columns follow
    base2row (A, T, C, G). An 'N' leaves its row all False; any other
    symbol that is not a key of base2row raises KeyError.
    """
    onehot = np.zeros((len(seq), 4), dtype=bool)
    for position, nucleotide in enumerate(seq):
        if nucleotide != 'N':
            onehot[position, base2row[nucleotide]] = True
    return onehot

In [None]:
# Encode the example sequence; one row per base, one column per nucleotide.
seq1h = seq2onehot(seq)

In [None]:
# Display the shape of the encoding, (len(seq), 4).
np.shape(seq1h)

In [None]:
# Show the one-hot rows for the first 9 bases so they can be compared with
# seq[:9]. seq1h has one row per base and only 4 columns, so the slice must be
# taken along the first axis; `[:, :9]` printed every row instead.
print(seq1h[:9,:])

In [None]:
# First 9 bases of the example sequence, for comparison with the encoding.
seq[:9]

In [None]:

# Allocate the model inputs (X): one (seqlength x 4) boolean one-hot matrix per
# pair, for the promoter and for the distal region.
N = df.shape[0]
onehot_shape = (N, seqlength, 4)
encoding_promoter = np.zeros(onehot_shape, dtype=bool)
encoding_distal = np.zeros(onehot_shape, dtype=bool)

# Allocate the labels (Y): one integer per pair.
Y = np.zeros(N, dtype=int)

In [None]:
# Define X (one-hot encoding) and Y (linked) for Keras
# Fills encoding_promoter, encoding_distal and Y, one output row per DataFrame row.
row_no = 0  # next output row; advanced only after a pair has been fully encoded
for i,row in df.iterrows():
    # Slice both regions out of the sequence of the chromosome named in row.chr.
    seq_promoter = CHROM2SEQ[row.chr][row.promoterDHSstart:row.promoterDHSend]
    seq_distal = CHROM2SEQ[row.chr][row.distalDHSstart:row.distalDHSend]
    try:
        # Only the first `*_len` positions are written; positions beyond the
        # region length keep the all-False values the arrays were created with.
        encoding_promoter[row_no,:row.promoter_len,:] = seq2onehot(seq_promoter)
        encoding_distal[row_no,:row.distal_len,:] = seq2onehot(seq_distal)
        Y[row_no] = row.linked
        row_no += 1
    except KeyError:
        # seq2onehot raises KeyError for a base that is not in base2row.
        # Print the offending sequences and stop encoding.
        # NOTE(review): after this break the remaining rows of the encodings and
        # of Y stay zero-filled, and the promoter row may already be written for
        # the failing pair — confirm downstream cells should not run in that case.
        print(seq_promoter)
        print(seq_distal)
        break

Now the data are in the correct format and ready to be fed into Keras (running on TensorFlow).

In [None]:
from sklearn.model_selection import train_test_split

# Split sample indices rather than the arrays themselves, so the promoter and
# distal encodings are partitioned identically. The fixed random_state makes
# the split (and every metric computed on it) reproducible across kernel
# restarts.
# No separate dev set is carved out here: model.fit holds out part of the
# training data through its validation_split argument.
idx_train, idx_test, y_train, y_test = train_test_split(
    np.arange(N), Y, test_size=0.33, random_state=1337)

# Promoter inputs
xp_train = encoding_promoter[idx_train,:,:]
xp_test = encoding_promoter[idx_test,:,:]

# Distal inputs
xd_train = encoding_distal[idx_train,:,:]
xd_test = encoding_distal[idx_test,:,:]

In [None]:
import keras

In [None]:
from keras.layers import Input,Conv1D,Dense,GlobalMaxPooling1D
# Convolution hyper-parameters: number of filters and filter width (bp).
nb_filter = 12
filter_len = 6

# One input tensor per region type, each a (seqlength x 4) one-hot matrix.
distal_one_hot_encoding = Input(shape=(seqlength, 4), name='distalDHSoh')
promo_one_hot_encoding = Input(shape=(seqlength, 4), name='promoterDHSoh')

# Separate convolution layers, so promoter and distal filters are learned independently.
distal_conv_layer = Conv1D(
    nb_filter, filter_len,
    padding='same', activation='relu', name='convDHSdistal'
)(distal_one_hot_encoding)
promo_conv_layer = Conv1D(
    nb_filter, filter_len,
    padding='same', activation='relu', name='convDHSpromo'
)(promo_one_hot_encoding)

In [None]:
# Reduce each convolution output to a fixed-length "fingerprint" by global max pooling.
fingerprint_distal = GlobalMaxPooling1D()(distal_conv_layer)
fingerprint_promo = GlobalMaxPooling1D()(promo_conv_layer)

# Join the two fingerprints (promoter first) and feed them to a single sigmoid
# unit. The layer name is kept as-is because a later cell looks it up by name.
merged_vector = keras.layers.concatenate(
    [fingerprint_promo, fingerprint_distal], axis=-1)
logistic_layer = Dense(1, activation='sigmoid', name='logistic_regression_distal')
prob_association = logistic_layer(merged_vector)

In [None]:
from keras.models import Model
# Two-input model (promoter encoding, distal encoding) with one sigmoid output.
model = Model(
    inputs=[promo_one_hot_encoding, distal_one_hot_encoding],
    outputs=prob_association)

In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [None]:
# Display the shape of the distal training inputs.
np.shape(xd_train)

In [None]:
# Fraction (not percentage) of entries in Y that are nonzero
pc = np.count_nonzero(Y) / len(Y)
print(pc)

In [None]:
import sklearn
# Import the submodule explicitly: `import sklearn` alone does not guarantee
# that `sklearn.utils.class_weight` has been loaded.
import sklearn.utils.class_weight

# Balanced class weights, ordered like np.unique(Y). `classes` and `y` are
# passed by keyword because current scikit-learn releases reject them as
# positional arguments; the keyword form also works on older releases.
class_weight_vec = sklearn.utils.class_weight.compute_class_weight(
    class_weight='balanced', classes=np.unique(Y), y=Y)

In [None]:
# Show the balanced class weights (same order as np.unique(Y)).
print(class_weight_vec)

# Spot-check one training label; index 24 looks arbitrary — presumably a leftover debugging check.
print(y_train[24])

In [None]:
from keras.optimizers import Adam
from keras import metrics
from keras.utils import np_utils
from keras.callbacks import EarlyStopping

"""
Fit the model using a dictionary to weight class 1 more highly than class 0
so that both classes are represented equally in the model fitting
"""

# Label -> balanced weight computed from Y
class_weight = {
    0: class_weight_vec[0],
    1: class_weight_vec[1],
}

model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['mse', 'accuracy'])

# Part of the training data (validation_split=0.2) is held out for validation,
# and EarlyStopping ends training after 5 epochs without improvement.
model.fit(
    [xp_train, xd_train], y_train,
    batch_size=400,
    epochs=40,
    class_weight=class_weight,
    validation_split=0.2,
    callbacks=[EarlyStopping(patience=5)])

In [None]:
from keras.callbacks import EarlyStopping
"""
Fit the model using a dictionary to weight class 1 more highly than class 0
so that both classes are represented equally in the model fitting
"""
# Alternative weighting: class 1 is weighted by the rounded inverse of the
# positive fraction `pc`, capped at 50. Only the dictionary is defined here;
# no fit call in this cell uses it.
positive_weight = min(round(1/pc), 50)
class_weight = {0: 1, 1: positive_weight}

In [None]:
"""
Visualizing the promoter filters
"""
# Kernel weights of the promoter convolution layer; the last axis indexes the filters.
filters = model.get_layer('convDHSpromo').get_weights()[0]

# Size the grid from the actual number of filters instead of hard-coding 12,
# so the plot keeps working if nb_filter is changed.
n_filters = filters.shape[-1]
n_cols = 4
n_rows = (n_filters + n_cols - 1) // n_cols  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = np.atleast_1d(axs).ravel()
for i in range(n_filters):
    # Transpose so the 4 nucleotide channels are rows and filter positions are columns.
    im = axs[i].imshow(filters[:,:,i].T)
    plt.colorbar(im, ax=axs[i])
# Hide grid cells that have no filter to show.
for unused_ax in axs[n_filters:]:
    unused_ax.set_visible(False)
# Figure-level title; plt.title would only label the last subplot.
fig.suptitle('Filters')
plt.show()

In [None]:
"""
Visualizing the distal enhancer filters
"""
# Kernel weights of the distal convolution layer; the last axis indexes the filters.
filters = model.get_layer('convDHSdistal').get_weights()[0]

# Size the grid from the actual number of filters instead of hard-coding 12,
# so the plot keeps working if nb_filter is changed.
n_filters = filters.shape[-1]
n_cols = 4
n_rows = (n_filters + n_cols - 1) // n_cols  # ceiling division
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 2 * n_rows), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = .5, wspace=.001)
axs = np.atleast_1d(axs).ravel()
for i in range(n_filters):
    # Transpose so the 4 nucleotide channels are rows and filter positions are columns.
    im = axs[i].imshow(filters[:,:,i].T)
    plt.colorbar(im, ax=axs[i])
# Hide grid cells that have no filter to show.
for unused_ax in axs[n_filters:]:
    unused_ax.set_visible(False)
# Figure-level title; plt.title would only label the last subplot.
fig.suptitle('Filters')
plt.show()

In [None]:
# Display the shape of the most recently extracted filter weights.
np.shape(filters)

In [None]:
"""
Get the beta values
"""
# First weight array of the final sigmoid layer: the coefficients applied to
# the concatenated promoter/distal fingerprints.
logistic_layer = model.get_layer('logistic_regression_distal')
beta = logistic_layer.get_weights()[0]
print(beta)

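How should we validate the model? Accuracy is not informative here because the labels are mostly zeros, so we need to look at false positives and false negatives.
We will use the ROC curve, which is built from the false positive rate (FPR) and the true positive rate (TPR).
`y` needs to have shape `(n_samples, 1)`, e.g. `y = np.zeros((n_samples, 1))`.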

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

In [None]:
"""
ROC score
"""
# Score the held-out test pairs and report the area under the ROC curve.
print('Predicting on test data')
y_score = model.predict([xp_test, xd_test])
# average='macro' and sample_weight=None are the defaults of roc_auc_score.
score = roc_auc_score(y_test, y_score)
print(score)

In [None]:
from sklearn.metrics import roc_curve, auc
"""
ROC score
"""
# Larger font for this and all later figures (rcParams is global state).
plt.rcParams.update({'font.size': 18})

# ROC curve of the test-set scores and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line for a random classifier.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.05])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic curve')
# Without a legend the curve label (which carries the AUC) is never displayed.
plt.legend(loc='lower right')
plt.show()
print('AUC: %f' % roc_auc)

In [None]:
# summarize history for accuracy
history = model.history.history
# Keras releases before 2.3 record the metric as 'acc'; later releases record
# it as 'accuracy'. Use whichever key is present instead of raising KeyError.
acc_key = 'acc' if 'acc' in history else 'accuracy'
plt.plot(history[acc_key])
plt.plot(history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# The second curve comes from the validation split used during fitting, not
# from the held-out test set.
plt.legend(['train', 'validation'], loc='lower right')
plt.show()

In [None]:
# summarize history for loss
history = model.history.history
plt.plot(history['loss'])
plt.plot(history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# 'val_loss' comes from the validation split used during fitting, not from the
# held-out test set, so label it accordingly.
plt.legend(['train', 'validation'], loc='upper right')
plt.show()

In [None]:
from sklearn.metrics import classification_report
# `np.float` was removed from NumPy; the builtin float is the same dtype.
y_new = y_test.astype(float)
print(y_score)
# classification_report needs discrete class labels, not probabilities:
# flatten the (n, 1) score array and threshold it at 0.5.
y_pred_labels = (np.ravel(y_score) > 0.5).astype(int)
# Print the report so its line breaks are rendered instead of shown as a repr.
print(sklearn.metrics.classification_report(y_new, y_pred_labels))

In [None]:
# Hard 0/1 predictions from the scores. `np.array(len(y_score))` built a 0-d
# array holding the length, so the boolean assignment failed; start from a
# zero vector with one entry per sample instead. The scores are flattened so
# the mask matches that vector.
y_predict = np.zeros(len(y_score), dtype=int)
y_predict[np.ravel(y_score) > 0.5] = 1
print(y_predict)