In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the promoter-DHS / distal-DHS pair table into a dataframe
df = pd.read_csv("/home/joshscurll/data/3-Simon/promoterDHS_distalDHS_pairs.csv")

# Take a reproducible random sample: the fixed seed makes Restart & Run All
# select the same rows every time, and min() keeps the cell from raising when
# the table holds fewer than 10000 rows
df = df.sample(n=min(10000, len(df)), random_state=0)

# Create new columns for promoter length and distal length (end - start)
df["promoter_len"] = df["promoterDHSend"]-df["promoterDHSstart"]
df["distal_len"] = df["distalDHSend"]-df["distalDHSstart"]

In [None]:
# Preview the first rows of the sampled table, including the two derived length columns
df.head()

In [None]:
# Plot overlaid histograms of promoter and distal sequence lengths.
# An explicit figure/axes pair is created here: the original `plt.figure`
# lacked call parentheses and therefore never created a figure itself.
fig, ax = plt.subplots()
ax.hist(df["promoter_len"],bins=30,color="r",alpha=0.4,label="promoter")
ax.hist(df["distal_len"],bins=30,color="b",alpha=0.4,label="distal")
ax.legend()
ax.set_xlabel("base pair")
ax.set_ylabel("count")
plt.show()

In [None]:
# Cap the sequence length: keep only the pairs whose promoter AND distal
# regions are both strictly shorter than the cap
seqlength = 2000
df = df.loc[df[["promoter_len", "distal_len"]].lt(seqlength).all(axis=1)]

In [None]:
from Bio import SeqIO

# Define a function to convert a chromosome string to a genomic sequence
def chrom2seq(chrom, chrom_dir="/home/joshscurll/data/3-Simon/hg38.analysisSet.chroms"):
    """
    Load the genomic sequence of chromosome "chrom" from its FASTA file.

    Parameters
    ----------
    chrom : str
        Chromosome name such as "chr1"; the file "<chrom_dir>/<chrom>.fa" is read.
    chrom_dir : str, optional
        Directory holding one FASTA file per chromosome. Defaults to the
        location this notebook was written against.

    Returns
    -------
    The upper-cased sequence (``.seq.upper()``) of the first record in the file.

    Raises
    ------
    ValueError
        If the FASTA file contains no records.
    """

    fasta_path = "%s/%s.fa" % (chrom_dir, chrom)
    # Pull only the first record from the parser instead of materialising every
    # record in a list; the with-block closes the file deterministically.
    with open(fasta_path) as handle:
        first_record = next(SeqIO.parse(handle, "fasta"), None)
    if first_record is None:
        raise ValueError("no FASTA records found in %s" % fasta_path)
    return first_record.seq.upper()

In [None]:
# Smoke-test the loader on chromosome 1 (the cell displays the returned sequence object)
chrom2seq('chr1')

In [None]:
# Chromosome names: autosomes chr1-chr22 plus chrM, chrX and chrY, kept in
# plain string sort order (chr1, chr10, ..., chr19, chr2, chr20, ...)
CHROMS = sorted(["chr%d" % number for number in range(1, 23)] + ["chrM", "chrX", "chrY"])

# Map every chromosome name to its loaded sequence (one chrom2seq call per name)
CHROM2SEQ = {chrom: chrom2seq(chrom) for chrom in CHROMS}

In [None]:
# Extract a 201-base example window (925790 - 925589) from chromosome 1 and print it
seq = CHROM2SEQ['chr1'][925589:925790]
print(seq)

In [None]:
# Define a function to create a one-hot encoding for a genomic sequence
# Column index of each base in the encoding
base2row = {'A':0,'T':1,'C':2,'G':3}
def seq2onehot(seq):
    """
    One-hot encode a nucleotide sequence.

    Returns a boolean array of shape (len(seq), 4) whose columns follow
    base2row (A, T, C, G). An 'N' position is left as an all-False row;
    any other base missing from base2row raises KeyError.
    """

    # Collect the (position, column) pair of every non-N base first
    hits = [(pos, base2row[base]) for pos, base in enumerate(seq) if base != 'N']
    onehot = np.zeros((len(seq), 4), dtype=bool)
    if hits:
        positions, columns = zip(*hits)
        onehot[list(positions), list(columns)] = True
    return onehot

In [None]:
# Sanity-check the encoding on the example sequence: shape first, then the
# first nine encoded rows next to the first nine bases they came from
seq1h = seq2onehot(seq)
# Printed explicitly: a bare expression in the middle of a cell is never displayed
print(np.shape(seq1h))
print(seq1h[:9,:])
seq[:9]

In [None]:
# Allocate the one-hot input arrays (X): one (seqlength, 4) boolean slab per
# remaining row of df, for the promoter and for the distal sequence
N = len(df)
encoding_promoter = np.zeros(shape=(N, seqlength, 4), dtype=bool)
encoding_distal = np.zeros_like(encoding_promoter)

# Allocate the label column (Y), one value per row
Y = np.zeros(shape=(N, 1))

In [None]:
# Define X (one-hot encoding) and Y (linked) for Keras
# Only the columns the loop reads are iterated; itertuples avoids building a
# pandas Series for every row (as iterrows does).
pair_columns = ["chr", "promoterDHSstart", "promoterDHSend", "promoter_len",
                "distalDHSstart", "distalDHSend", "distal_len", "linked"]
row_no = 0
for row in df[pair_columns].itertuples(index=False):
    chrom_seq = CHROM2SEQ[row.chr]
    seq_promoter = chrom_seq[row.promoterDHSstart:row.promoterDHSend]
    seq_distal = chrom_seq[row.distalDHSstart:row.distalDHSend]
    try:
        # Encode both sequences before touching the output arrays, so a failure
        # on the distal sequence cannot leave a half-written promoter row behind
        onehot_promoter = seq2onehot(seq_promoter)
        onehot_distal = seq2onehot(seq_distal)
    except KeyError:
        # A base that seq2onehot cannot map: show the offending pair and stop
        print(seq_promoter)
        print(seq_distal)
        print("Stopped after encoding %d of %d rows; the remaining rows of X and Y are still all zero"
              % (row_no, len(df)))
        break
    # Sequences shorter than seqlength leave the tail of their slab zero-padded
    encoding_promoter[row_no,:row.promoter_len,:] = onehot_promoter
    encoding_distal[row_no,:row.distal_len,:] = onehot_distal
    Y[row_no] = row.linked
    row_no += 1

In [None]:
# Show the shapes of both inputs and the labels together; as three separate
# bare expressions only the last one would be displayed by the cell
np.shape(encoding_promoter), np.shape(encoding_distal), np.shape(Y)

In [None]:
import keras 
from keras import backend as K

import numpy as np

from keras.layers import Dense, Dropout, Input, Conv1D, GlobalMaxPooling1D
from keras.models import Model

from keras.datasets import mnist
from keras.utils import to_categorical

from keras.callbacks import EarlyStopping

In [None]:
def reluConv1d(x, filters, kernel_size, name):
    """
    Apply a named Conv1D layer with ReLU activation and 'same' padding to x.

    filters, kernel_size and name are passed straight through to Conv1D;
    the layer's output tensor is returned.
    """
    conv_layer = Conv1D(
        filters=filters,
        kernel_size=kernel_size,
        activation='relu',
        padding='same',
        name=name,
    )
    return conv_layer(x)

def gmp1d(x):
    """
    Apply a fresh GlobalMaxPooling1D layer to x and return its output tensor.
    """
    pooling_layer = GlobalMaxPooling1D()
    return pooling_layer(x)

def convMP(x, filters, kernel_size, name):
    """
    Run x through reluConv1d (using filters, kernel_size and name) and then
    through gmp1d, returning the pooled tensor.
    """
    feature_maps = reluConv1d(x, filters, kernel_size, name)
    return gmp1d(feature_maps)

In [None]:
# Single-input graph on the distal DHS: one-hot sequence -> 12 ReLU conv
# filters of width 3 -> global max pool -> one sigmoid output unit
onehot = Input(shape=(seqlength,4),name='distalDHSoh')
fingerprint = convMP(onehot,12,3,'Conv1DdistalDHS')
# The output layer is named so it can be retrieved with model.get_layer('logistic_regr')
prob_association = Dense(1,activation='sigmoid',name='logistic_regr')(fingerprint)

In [None]:
# Build the model from the input and output tensors. The keyword is `inputs`
# (plural), matching `outputs`; the singular `input` is the legacy Keras 1
# spelling.
model = Model(inputs=onehot,outputs=prob_association)
model.summary()

In [None]:
from keras.optimizers import Adam
# Compile with binary cross-entropy loss and an Adam optimizer constructed with no arguments
model.compile(optimizer=Adam(),loss='binary_crossentropy')

In [None]:
"""
xp_train = 
xp_dev =
xp_test =

xd_train =
xd_dev =
xd_test =

y_train =
y_dev =
y_test =
"""

In [None]:
"""
model.fit(x_train,y_train,batch_size=256,epochs=30,validation_data=[x_test,y_test])
filters = model.get_layer('Conv1DdistalDHS').get_weights()[0](filter_len,4,num_filters)

plt.imshow(filters[:,:,i])

beta = model.get_layer('logistic_regr').get_weights()[0]
"""