In [None]:
import keras
from keras.models import Sequential, Model, load_model

from keras.layers import Dense, Dropout, Activation, Flatten, Input, Lambda
from keras.layers import Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization, LocallyConnected2D, Permute
from keras.layers import concatenate, Reshape, Softmax, Conv2DTranspose, Embedding, Multiply
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
from keras import regularizers
from keras import backend as K
import keras.losses

import tensorflow as tf
from tensorflow.python.framework import ops

import isolearn.keras as iso

import numpy as np

import tensorflow as tf
import logging
logging.getLogger('tensorflow').setLevel(logging.ERROR)

import pandas as pd

import os
import pickle
import numpy as np

import scipy.sparse as sp
import scipy.io as spio

import matplotlib.pyplot as plt

import isolearn.io as isoio
import isolearn.keras as isol

import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from scipy.stats import pearsonr

# import seaborn as sns

from matplotlib import colors

from scipy.optimize import basinhopping, OptimizeResult

class IdentityEncoder(iso.SequenceEncoder) :
    """One-hot sequence encoder driven by an explicit character -> channel map.

    Parameters
    ----------
    seq_len : int
        Number of rows in the encoding produced by ``encode``.
    channel_map : dict
        Maps each supported character to its channel (column) index,
        e.g. ``{'A': 0, 'C': 1, 'G': 2, 'T': 3}``.

    The base class is initialised with the encoder name ``'identity'`` and
    the encoding shape ``(seq_len, len(channel_map))``.
    """
    
    def __init__(self, seq_len, channel_map) :
        super(IdentityEncoder, self).__init__('identity', (seq_len, len(channel_map)))
        
        self.seq_len = seq_len
        self.n_channels = len(channel_map)
        self.encode_map = channel_map
        # Inverse lookup (channel index -> character) used by decode().
        self.decode_map = {
            channel_ix: nt for nt, channel_ix in self.encode_map.items()
        }
    
    def encode(self, seq) :
        """Return a new ``(seq_len, n_channels)`` float array one-hot encoding ``seq``.

        Characters that are not in ``encode_map`` leave their row all-zero.
        """
        encoding = np.zeros((self.seq_len, self.n_channels))
        self.encode_inplace(seq, encoding)

        return encoding
    
    def encode_inplace(self, seq, encoding) :
        """Write the one-hot encoding of ``seq`` into the caller-supplied ``encoding`` array.

        Only the entries for recognised characters are set to 1; nothing is
        cleared, so the caller is responsible for passing a zeroed array.
        """
        for i, nt in enumerate(seq) :
            if nt in self.encode_map :
                encoding[i, self.encode_map[nt]] = 1.
    
    def encode_inplace_sparse(self, seq, encoding_mat, row_index) :
        """Sparse encoding is not supported by this encoder."""
        raise NotImplementedError()
    
    def decode(self, encoding) :
        """Return the string whose i-th character is the arg-max channel of row i.

        An all-zero row resolves to channel index 0, because ``np.argmax``
        returns the first index of a constant row.
        """
        return ''.join(
            self.decode_map[np.argmax(encoding[pos, :])]
            for pos in range(0, encoding.shape[0])
        )

    def decode_sparse(self, encoding_mat, row_index) :
        """Sparse decoding is not supported by this encoder."""
        raise NotImplementedError()

# Module-level encoder instance: 145 positions, channels ordered A=0, C=1, G=2, T=3.
acgt_encoder = IdentityEncoder(145, {'A':0, 'C':1, 'G':2, 'T':3})

In [2]:
def load_predictor_model(model_path) :
    """Rebuild the predictor architecture with frozen layers and load weights from ``model_path``.

    Architecture, as built below: a ``(145, 4)`` input feeds three parallel
    Conv1D branches (kernel sizes 25, 11 and 7, 600 filters each, 'same'
    padding). Each branch applies batch normalisation, ReLU, global max
    pooling and dropout (rate 0.1). The branches are concatenated and passed
    through a 64-unit ReLU dense layer and a 2-unit linear output layer
    (index 0 - HepG2, index 1 - K562).

    Conv, batch-norm and dense layers are created with ``trainable=False``.

    Returns
    -------
    The compiled keras ``Model`` after ``load_weights(model_path)``.
    """
    
    kernel_widths = [25,11,7]
    filters_per_branch = 600
    hidden_units = 64
    drop_prob = 0.1
    
    sequence_input = Input(shape=(145, 4),name="pat_input")
    
    # One conv -> batchnorm -> relu -> global-max-pool -> dropout branch per kernel width.
    branch_outputs = []
    for branch_ix, kernel_width in enumerate(kernel_widths):
        tag = str(branch_ix) + "_copy"
        
        conv_out = Conv1D(filters_per_branch, kernel_width, padding='same', activation='linear', name = "pat_conv_" + tag, trainable=False)(sequence_input)
        norm_out = BatchNormalization(axis=-1,name = "pat_batchnorm_" + tag, trainable=False)(conv_out)
        relu_out = Activation('relu',name = "pat_relu_" + tag)(norm_out)
        pooled = GlobalMaxPooling1D(name = "pat_pool_" + tag)(relu_out)
        branch_outputs.append(Dropout(drop_prob,name = "pat_dropout_" + tag)(pooled))
    
    merged = concatenate(branch_outputs,name="pat_concat_layer_copy")
    
    hidden = Dense(hidden_units,activation='relu',name="pat_dense_copy", trainable=False)(merged)
    # Output columns: 0 - HepG2, 1 - K562
    prediction = Dense(2,activation='linear',name="pat_output_copy", trainable=False)(hidden)
    
    predictor = Model(inputs=sequence_input,outputs=prediction)
    predictor.compile(optimizer=keras.optimizers.Adam(lr=0.0002, beta_1=0.9, beta_2=0.999), loss="mse")
    predictor.load_weights(model_path)
    
    return predictor

In [4]:
def _store_sequence(run_dir, run_prefix, seq, curr_iter) :
    #Save sequence to file
    with open(run_dir + run_prefix + "_iter_" + str(int(curr_iter)) + ".txt", "a+") as f :
        f.write(seq + "\n")

def get_step_func(predictor, sequence_template, acgt_encoder) :
    """Build the proposal ("take_step") function used by the annealing search.

    Parameters
    ----------
    predictor, acgt_encoder
        Unused here; kept so the signature stays compatible with callers.
    sequence_template : str
        Template whose ``'N'`` characters mark the positions that may be mutated.

    Returns
    -------
    callable
        ``_step_func(x)`` takes a flattened ``(len(sequence_template), 4)``
        one-hot array and returns a new flattened array in which one randomly
        chosen mutable position has been switched to a different, randomly
        chosen channel. The input array is left unmodified.

    Raises
    ------
    ValueError
        Raised by the returned function when the template has no ``'N'`` position.
    """
    
    available_positions = [
        j for j in range(len(sequence_template)) if sequence_template[j] == 'N'
    ]
    
    # For each current channel, the channels it may be switched to.
    available_nt_dict = {
        0 : [1, 2, 3],
        1 : [0, 2, 3],
        2 : [1, 0, 3],
        3 : [1, 2, 0]
    }
    
    def _step_func(x, sequence_template=sequence_template, available_positions=available_positions, available_nt_dict=available_nt_dict) :
        
        if len(available_positions) == 0 :
            raise ValueError("sequence_template contains no mutable 'N' positions")
        
        # np.array copies, so the caller's array is never modified in place
        # (reshape/ravel on the original would only produce views of it).
        onehot = np.array(x).reshape((len(sequence_template), 4))
        
        #Choose random position and nucleotide identity
        rand_pos = np.random.choice(available_positions)
        
        curr_nt = int(np.argmax(onehot[rand_pos, :]))
        rand_nt = np.random.choice(available_nt_dict[curr_nt])
        
        #Swap nucleotides
        onehot[rand_pos, :] = 0.
        onehot[rand_pos, rand_nt] = 1.
        
        return np.ravel(onehot)
    
    return _step_func

### MODIFY THIS FUNCTION TO SELECT H2K VS K2H OPTIMIZATION ###
def get_predict_func(predictor, seq_len) :
    """Build the scalar objective that the annealing search minimises.

    The returned function reshapes a flattened one-hot vector to
    ``(1, seq_len, 4)``, calls ``predictor.predict`` with ``batch_size=1``
    and returns the negated difference ``output[0, 0] - output[0, 1]``
    (hepg2 - k562). Minimising the return value therefore maximises that
    difference. To optimise k562 - hepg2 instead, swap the two operands of
    the subtraction below.
    """
    
    def _predict_func(x, predictor=predictor, seq_len=seq_len) :
        # Add the leading batch axis expected by the predictor.
        batch = x.reshape((seq_len, 4))[np.newaxis, ...]
        
        preds = predictor.predict(x=[batch], batch_size=1)
        
        hepg2_pred = preds[0, 0]
        k562_pred = preds[0, 1]
        
        # for hepg2 - k562 (use k562_pred - hepg2_pred for k562 - hepg2)
        score = hepg2_pred - k562_pred
        
        return -score
    
    return _predict_func

def run_simulated_annealing(run_prefix, predictor, sequence_template, acgt_encoder, n_iters=1000, n_iters_per_temperate=100, temperature_init=1.0, temperature_func=None, verbose=False) :
    """Optimise one sequence by repeated basin-hopping runs under a temperature schedule.

    Parameters
    ----------
    run_prefix : str
        Name of the sub-directory of ``./samples/`` that receives the
        intermediate sequence files (created if missing).
    predictor
        Model passed to ``get_predict_func`` / ``get_step_func``.
    sequence_template : str
        Template string; every ``'N'`` is randomly initialised to A/C/G/T,
        other characters are copied as-is.
    acgt_encoder
        Encoder providing ``encode(seq)`` and ``decode(onehot)``.
    n_iters : int
        Total number of basin-hopping iterations.
    n_iters_per_temperate : int
        Iterations run at each temperature; ``n_iters // n_iters_per_temperate``
        epochs are executed.
    temperature_init : float
        Temperature used for the first epoch.
    temperature_func : callable or None
        ``temperature_func(current_temperature, curr_iter)`` returns the
        temperature for the next epoch. ``None`` keeps the temperature constant.
    verbose : bool
        Print the score after every epoch.

    Returns
    -------
    (str, np.ndarray)
        The sequence decoded after the last epoch (``""`` when zero epochs
        run) and the score trajectory: the score of the random start followed
        by one score per epoch.
    """
    
    run_dir = "./samples/" + run_prefix + "/"
    # Files inside run_dir always use this fixed prefix; run_prefix only names the directory.
    file_prefix = "intermediate"
    
    os.makedirs(run_dir, exist_ok=True)
    
    if temperature_func is None :
        # Default schedule: constant temperature.
        temperature_func = lambda t, curr_iter, t_init=temperature_init, total_iters=n_iters: t
    
    n_epochs = n_iters // n_iters_per_temperate
    seq_len = len(sequence_template)
    
    predict_func = get_predict_func(predictor, seq_len)
    step_func = get_step_func(predictor, sequence_template, acgt_encoder)
    
    #Random initialization
    random_sequence = ''.join([
        nt if nt != 'N' else np.random.choice(['A', 'C', 'G', 'T'])
        for nt in sequence_template
    ])

    x = np.ravel(acgt_encoder.encode(random_sequence))
    temperature = temperature_init
    
    seq_opt = ""
    # predict_func returns the NEGATED score (it is the quantity being minimised),
    # so flip the sign to keep the first entry consistent with the per-epoch
    # entries below, which are stored as -f_opt.
    tracked_scores = [-predict_func(x)]
    for epoch_ix in range(n_epochs) :
        
        curr_iter = (epoch_ix + 1) * n_iters_per_temperate
        
        x_opt, f_opt = run_basinhopping(x, predict_func, step_func, n_iters=n_iters_per_temperate, temperature=temperature)
    
        seq_opt = acgt_encoder.decode(x_opt.reshape((seq_len, 4)))
        score_opt = -f_opt
        tracked_scores.append(score_opt)
        
        if verbose :
            print("Iter " + str(curr_iter) + ", Temp = " + str(round(temperature, 4)) + ", Score = " + str(round(score_opt, 4)) + "...")

        _store_sequence(run_dir, file_prefix, seq_opt, curr_iter)
        
        # The next epoch starts from the best point found in this one.
        x = x_opt
        temperature = temperature_func(temperature, curr_iter)
    
    return seq_opt, np.array(tracked_scores)
        
        
def run_basinhopping(x, predict_func, step_func, n_iters=1000, temperature=1.0) :
    """Run ``scipy.optimize.basinhopping`` with a no-op local minimiser.

    The local minimisation stage is replaced by a function that only
    evaluates the objective at the proposed point, so each iteration consists
    of one ``step_func`` proposal followed by the basin-hopping acceptance
    test at temperature ``temperature``.

    Returns
    -------
    (x_best, f_best)
        ``.x`` and ``.fun`` of the result returned by ``basinhopping``.
    """
    
    def _evaluate_only(fun, x0, args=(), **options) :
        # Report the starting point itself as the "local minimum".
        return OptimizeResult(fun=fun(x0), x=x0, nit=0, nfev=0, success=True)
    
    local_search_cfg = dict(method=_evaluate_only, options=dict(maxiter=0))
    
    result = basinhopping(
        predict_func,
        x,
        minimizer_kwargs=local_search_cfg,
        stepsize=None,
        niter=n_iters,
        T=temperature,
        take_step=step_func,
    )
    
    return result.x, result.fun


In [None]:
#Run the basinhopping algorithm
# For every predictor checkpoint, run n_sequences independent annealing runs and
# append the best-scoring final sequence to "<run_prefix>_sequences.csv".

# Number of predictor checkpoints to process (files "<model_basename>_<i>.h5", i = 0..n_models-1).
n_models = 1

#Specify file path to pre-trained predictor networks
base_path = 'predictor_models/single_predictors'
model_basename = 'wide'

# Every position is 'N', i.e. all 145 positions are free to mutate.
sequence_template = 'N' * 145

# Independent annealing runs per model, total iterations per run, and iterations per temperature level.
n_sequences = 10
n_iters = 1000
n_iters_per_temperate = 100
# NOTE(review): the prefix says "k2h", but get_predict_func currently has the
# "hepg2 - k562" line active; confirm the name matches the intended objective.
run_prefix = f"sim_anneal_df_{n_iters}_iters_k2h_max"

verbose = False

# Cooling schedule: temperature = t_init * t_min ** min(curr_iter / total_iters * exp_scale, 1.0).
# It decays from t_init towards t_init * t_min and stays there once
# curr_iter / total_iters reaches 1 / exp_scale (0.7 with the defaults).
# Note that t_min acts as a multiplicative factor on t_init, not as an absolute floor.
t_init = 0.1
t_func = lambda t, curr_iter, t_init=t_init, total_iters=n_iters, t_min=0.05, exp_scale=1./0.7: t_init * t_min**(min(float(curr_iter / total_iters) * exp_scale, 1.0))

# Iteration counts matching the entries of each score trajectory (0 = random start).
# NOTE(review): not used in this cell; presumably consumed by a later plotting cell. Confirm.
it_space = [0] + [(epoch_ix + 1) * n_iters_per_temperate for epoch_ix in range(n_iters // n_iters_per_temperate)]


######################################################
# The CSV is opened in append mode, so re-running this cell adds rows to an existing file.
with open(run_prefix + "_sequences.csv", "at") as f:
    for i in range(n_models):
        # Reset the Keras session before building the next predictor.
        K.clear_session()
        saved_predictor_model_path = f"{base_path}/{model_basename}_{i}.h5"
        saved_predictor = load_predictor_model(saved_predictor_model_path)
        print(f"Current model: {saved_predictor_model_path}")

        # Final sequence and score trajectory of every run for this model.
        optimized_seqs = []
        optimized_trajs = []

        for sequence_ix in range(n_sequences) :

            seq, scores = run_simulated_annealing(run_prefix, saved_predictor, sequence_template, acgt_encoder, n_iters=n_iters, n_iters_per_temperate=n_iters_per_temperate, temperature_init=t_init, temperature_func=t_func, verbose=verbose)

            optimized_seqs.append(seq)
            optimized_trajs.append(scores.reshape(1, -1))


        # Stack to shape (n_sequences, n_trajectory_points) and pick the run with the highest final score.
        optimized_trajs = np.concatenate(optimized_trajs, axis=0)
        sort_index = np.argsort(optimized_trajs[:,-1])[-1] # last index of optimized_trajs is the final score
        # Row format: sequence, final score, method label, model identifier.
        f.write(f"{optimized_seqs[sort_index]},{optimized_trajs[sort_index,-1]}, sim_anneal, {model_basename}_{i}\n")