In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the promoterDHS/distalDHS pairs CSV into a DataFrame
df = pd.read_csv(
    "/home/donajialej/data/3-Simon/promoterDHS_distalDHS_pairs.csv"
)

In [None]:
from Bio import SeqIO

# Define a function to convert a chromosome string to a genomic sequence
def chrom2seq(chrom, fasta_dir="/home/donajialej/data/3-Simon/hg38.analysisSet.chroms"):
    """Return the upper-cased sequence of the first FASTA record for a chromosome.

    Parameters
    ----------
    chrom : str
        Chromosome name; the file that is read is ``<fasta_dir>/<chrom>.fa``.
    fasta_dir : str, optional
        Directory holding one FASTA file per chromosome.  The default is the
        directory this notebook was originally written against, so existing
        single-argument calls behave exactly as before.

    Returns
    -------
    The ``.seq`` of the first record in the file, upper-cased.

    Raises
    ------
    IndexError
        If the FASTA file contains no records.
    """
    fasta_path = "%s/%s.fa" % (fasta_dir, chrom)
    records = list(SeqIO.parse(fasta_path, "fasta"))
    # Only the first record of the file is used.
    return records[0].seq.upper()

In [None]:
# Chromosome names to load; their order is the insertion order of CHROM2SEQ
CHROMS = [
    "chr1", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16",
    "chr17", "chr18", "chr19", "chr2", "chr20", "chr21", "chr22", "chr3",
    "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chrM", "chrX", "chrY",
]

# Map every chromosome name to the sequence returned by chrom2seq
CHROM2SEQ = {chrom: chrom2seq(chrom) for chrom in CHROMS}

In [None]:
# Cut a short example region out of chr1 and show it
example_region = slice(925589, 925790)
seq = CHROM2SEQ['chr1'][example_region]
print(seq)

In [None]:
# Define a function to create a one-hot encoding for a genomic sequence
base2row = {'A':0,'C':1,'G':2,'T':3}
def seq2onehot(seq):
    """Return a boolean one-hot matrix for a nucleotide sequence.

    The result has one row per position of ``seq`` and four columns, indexed
    through ``base2row`` (A, C, G, T).  An 'N' leaves its row all False; any
    other symbol that is missing from ``base2row`` raises KeyError.
    """
    onehot = np.zeros((len(seq), 4), dtype=bool)
    for position, nucleotide in enumerate(seq):
        # 'N' has no column of its own: its row simply stays all False
        if nucleotide != 'N':
            onehot[position, base2row[nucleotide]] = True
    return onehot

In [None]:
# One-hot encode the example sequence and inspect the result
seq1h = seq2onehot(seq)
# Only the last expression of a cell is echoed, so the shape has to be
# printed explicitly to be visible.
print(np.shape(seq1h))
print(seq1h[:5,:])
seq[:5]

In [None]:
# Define a function to find the reverse complement of a one-hot encoding
def onehot2rc(onehot):
    """Return the one-hot encoding of the reverse complement.

    ``onehot`` is a 2-D one-hot array whose columns are ordered A, C, G, T.
    Flipping the rows reverses the sequence; flipping the columns swaps
    A with T and C with G, which is the complement for that column order.
    """
    return np.fliplr(np.flipud(onehot))

In [None]:
# Show the first five encoded positions next to their reverse complement
A = seq1h[:5]
B = onehot2rc(A)
print(A, B, sep="\n")

In [None]:

# Draw a balanced sample: the same number of rows from the linked==1 class
# as from the linked==0 class.

# Row positions of each class, each put into random order
idx1 = np.where(df["linked"])[0]
idx0 = np.where(df["linked"]==0)[0]
np.random.shuffle(idx1)
np.random.shuffle(idx0)

nsamples_per_class = 20000
print(np.shape(idx1))
print(np.shape(idx0))
# Cap at the size of the smaller class so the two classes stay the same size
# even when one of them has fewer than nsamples_per_class rows.
nsamples_kept = min(nsamples_per_class, len(idx1), len(idx0))
idx1 = idx1[:nsamples_kept]
idx0 = idx0[:nsamples_kept]
print(np.shape(idx1))
print(np.shape(idx0))

# Save these indices
sample_idx_path = "/home/donajialej/BCdata_project3/SampleIdx.npz"
np.savez(sample_idx_path, idx1=idx1, idx0=idx0)

# To reuse previously saved indices instead of drawing new ones, load them as
# shown below.  This is kept as comments rather than a bare string literal,
# because a string as the last expression would be echoed as the cell output.
# NOTE(review): this path differs from the save path above -- confirm which
# location is current before using it.
# sample_idx = np.load("/home/joshscurll/bcdata_Altius_Project/BCdata_project3/SampleIdx.npz")
# idx1, idx0 = sample_idx['idx1'], sample_idx['idx0']

In [None]:
# Preview the combined row positions of both classes (linked==1 first)
np.concatenate((idx1, idx0))

In [None]:
# Take a sample: combine the row positions of both classes, shuffle them, and
# keep only those rows of df.
idx10 = np.concatenate([idx1,idx0])
np.random.shuffle(idx10)
# .copy() makes the sample an independent DataFrame, so that adding columns to
# it afterwards does not trigger pandas' SettingWithCopyWarning.  Rebinding df
# directly drops the reference to the full table, which is what the earlier
# del / rename sequence achieved.
df = df.iloc[idx10].copy()

#df = df.sample(n=10000) # Comment this out when loading indices!

In [None]:
# Peek at the first five rows of the sampled table
df.head(n=5)

In [None]:
# Create new columns for promoter length and distal length
df["promoter_len"] = df["promoterDHSend"]-df["promoterDHSstart"]
df["distal_len"] = df["distalDHSend"]-df["distalDHSstart"]

# Plot overlaid histograms of the two length distributions
# plt.figure must be called; the bare attribute access `plt.figure` does
# nothing, so no new figure would be started.
plt.figure()
plt.hist(df["promoter_len"],bins=30,color="r",alpha=0.4,label="promoter")
plt.hist(df["distal_len"],bins=30,color="b",alpha=0.4,label="distal")
plt.legend()
plt.xlabel("base pair")
plt.ylabel("count")
plt.show()

In [None]:
# Length cap: keep only the rows whose promoter and distal regions are both
# strictly shorter than seqlength
seqlength = 2000
within_cap = df["promoter_len"].lt(seqlength) & df["distal_len"].lt(seqlength)
df = df.loc[within_cap]

In [None]:
# Allocate the one-hot encoding arrays (X): one entry per row of df, with
# seqlength positions, 2 slots per position and 4 base columns
N = len(df)
encoding_shape = (N, seqlength, 2, 4)
encoding_promoter = np.zeros(encoding_shape, dtype=bool)
encoding_distal = np.zeros(encoding_shape, dtype=bool)

# Allocate the label column (Y), one value per row of df
Y = np.zeros((N, 1))

In [None]:
# Define X (one-hot encoding) and Y (linked) for Keras
row_no = 0
# Pre-bind these so the clean-up `del` after the loop cannot raise NameError
# when the loop never assigns them (empty df, or a KeyError on the first row).
onehot_promoter = onehot_distal = None
# itertuples is considerably faster than iterrows, and the index label of
# each row is not needed here.
for row in df.itertuples(index=False):
    chrom_seq = CHROM2SEQ[row.chr]
    seq_promoter = chrom_seq[row.promoterDHSstart:row.promoterDHSend]
    seq_distal = chrom_seq[row.distalDHSstart:row.distalDHSend]
    try:
        onehot_promoter = seq2onehot(seq_promoter)
        onehot_distal = seq2onehot(seq_distal)
        # Slot 0 holds the sequence as read, slot 1 its reverse complement;
        # positions beyond the sequence length keep their initial zeros.
        encoding_promoter[row_no,:row.promoter_len,0,:] = onehot_promoter
        encoding_distal[row_no,:row.distal_len,0,:] = onehot_distal
        encoding_promoter[row_no,:row.promoter_len,1,:] = onehot2rc(onehot_promoter)
        encoding_distal[row_no,:row.distal_len,1,:] = onehot2rc(onehot_distal)
        Y[row_no] = row.linked
        row_no += 1
    except KeyError:
        # seq2onehot met a base that is neither in base2row nor 'N'.  Show the
        # offending sequences and stop; rows from row_no onwards are left as
        # they were initialised.
        print("Unsupported base in row %d; stopping encoding early" % row_no)
        print(seq_promoter)
        print(seq_distal)
        break

# Release the per-row temporaries
del onehot_promoter
del onehot_distal

In [None]:
# Report the shapes of the encoded inputs and of the label array
print(encoding_promoter.shape)
print(encoding_distal.shape)
print(Y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Test whether we can predict promoter and distal sequences using model.
# Y is rebuilt here as promoter-vs-distal labels, replacing its earlier values.

# Promoter encodings fill the first N rows (label True), distal encodings the
# last N rows (label False).
X = np.zeros((2*N, seqlength, 2, 4), dtype=bool)
X[:N] = encoding_promoter
X[N:] = encoding_distal
Y = np.zeros((2*N, 1), dtype=bool)
Y[:N] = True

# Create training, development and test data sets: 25% of all rows are held
# out for testing, then 33% of the remainder becomes the development set.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.33)