In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_tree = os.walk('/kaggle/input')
for dirname, _, filenames in input_tree:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Attention Module**

In [None]:
from keras import backend as K, initializers, regularizers, constraints
from keras.engine.topology import Layer

class Attention(Layer):
  """Attention pooling over the step axis of a 3D input.

  For an input of shape (batch, step_dim, features) the layer learns one
  score per step, normalises the scores into weights that sum to 1 along the
  step axis, and returns the weighted sum of the steps, shape (batch, features).

  Args:
    step_dim: number of steps (axis 1) of the input; used to reshape scores.
    W_regularizer / b_regularizer: optional regularizers for kernel and bias.
    W_constraint / b_constraint: optional constraints for kernel and bias.
    bias: whether to add a learned per-step bias to the scores.
    return_attention: when True, `call` returns `[pooled, weights]` instead
      of only the pooled tensor.
    **kwargs: forwarded to the base `Layer` constructor.
  """

  def __init__(self, step_dim, W_regularizer=None, b_regularizer=None,
               W_constraint=None, b_constraint=None,
               bias=True, return_attention=False, **kwargs):
    # Initialise the base layer first so that it cannot reset attributes
    # (such as `supports_masking`) that are assigned below.
    super(Attention, self).__init__(**kwargs)

    self.supports_masking = True
    self.init = initializers.get('glorot_uniform')

    self.W_regularizer = regularizers.get(W_regularizer)
    self.b_regularizer = regularizers.get(b_regularizer)

    # Constraints must be resolved through the constraints module.
    self.W_constraint = constraints.get(W_constraint)
    self.b_constraint = constraints.get(b_constraint)

    self.step_dim = step_dim
    self.features_dim = 0
    self.bias = bias
    self.return_attention = return_attention

  def build(self, input_shape):
    """Create the score kernel (features,) and optional bias (steps,)."""
    assert len(input_shape) == 3

    self.W = self.add_weight(shape=(input_shape[-1],),
                             initializer=self.init,
                             name='{}_W'.format(self.name),
                             regularizer=self.W_regularizer,
                             constraint=self.W_constraint)

    self.features_dim = input_shape[-1]

    if self.bias:
      self.b = self.add_weight(shape=(input_shape[1],),
                               initializer='zeros',
                               name='{}_b'.format(self.name),
                               regularizer=self.b_regularizer,
                               constraint=self.b_constraint)
    else:
      self.b = None

    self.built = True

  def compute_mask(self, input, input_mask=None):
    """The step axis is collapsed, so no mask is propagated downstream."""
    return None

  def call(self, x, mask=None):
    """Return the attention-weighted sum of `x` over axis 1.

    Args:
      x: 3D tensor (batch, step_dim, features).
      mask: optional (batch, step_dim) mask; masked steps get zero weight.

    Returns:
      The pooled tensor (batch, features), or `[pooled, weights]` when
      `return_attention` is True.
    """
    features_dim = self.features_dim
    step_dim = self.step_dim

    # One scalar score per (sample, step): flatten steps, project, unflatten.
    eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                          K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

    if self.bias:
      eij += self.b

    eij = K.tanh(eij)

    a = K.exp(eij)

    if mask is not None:
      # Multiply (not add) so that masked steps contribute zero weight.
      a *= K.cast(mask, K.floatx())

    # Epsilon guards against division by zero when every step is masked.
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

    weighted_input = x * K.expand_dims(a)

    result = K.sum(weighted_input, axis=1)

    if self.return_attention:
      return [result, a]

    return result

  def compute_output_shape(self, input_shape):
    """Output is (batch, features), plus (batch, steps) for the weights."""
    if self.return_attention:
      return [(input_shape[0], input_shape[-1]),
              (input_shape[0], input_shape[1])]
    else:
      return input_shape[0], input_shape[-1]

  def get_config(self):
    """Serialise constructor arguments so a saved model can be reloaded."""
    config = {
        'step_dim': self.step_dim,
        'W_regularizer': regularizers.serialize(self.W_regularizer),
        'b_regularizer': regularizers.serialize(self.b_regularizer),
        'W_constraint': constraints.serialize(self.W_constraint),
        'b_constraint': constraints.serialize(self.b_constraint),
        'bias': self.bias,
        'return_attention': self.return_attention,
    }
    base_config = super(Attention, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

# **Encoding Helpers**

In [None]:
# 1-based column index for each amino-acid letter; '-' is the gap/padding symbol.
aa_idx = {residue: position
          for position, residue in enumerate('ACDEFGHIKLMNPQRSTVWY-', start=1)}

def onehot(seq, k=15, mask=True):
  """Encode `seq` as a flattened (k * 21,) noisy one-hot vector.

  The sequence is truncated or right-padded with '-' to exactly `k` symbols.
  Each position becomes a 21-wide row of small uniform noise in
  [0.001, 0.01) with a 1 in the column of its symbol; symbols missing from
  `aa_idx` are treated as '-'. When `mask` is truthy, '-' positions are
  replaced by a row filled with -400.
  """
  residues = list(seq)[:k]
  residues += ['-'] * (k - len(residues))

  rows = []
  for position in range(k):
    # Noise is drawn for every position (even ones that end up masked).
    encoding = np.random.uniform(low=0.001, high=0.01, size=21)
    symbol = residues[position]
    if symbol not in aa_idx:
      symbol = '-'

    if symbol == '-' and mask:
      encoding = np.full(21, -400.0)
    else:
      encoding[aa_idx[symbol] - 1] = 1

    rows.append(encoding)

  return np.array(rows).flatten()

# **Data Generator Class**

In [None]:
from collections import namedtuple
from tensorflow.keras.utils import Sequence

# Record type with the four per-sample fields: sequence, hla, y_val, pep_length.
DATA_ENTRY = namedtuple(
    typename="data_entry_ic50",
    field_names="sequence, hla, y_val, pep_length",
)

class DataGenerator(Sequence):
  """Keras `Sequence` that yields batches of encoded (epitope, hla) pairs.

  Each sample is indexed positionally: `sample[0]` is the peptide sequence,
  `sample[1]` the HLA sequence and `sample[3]` the target value.
  NOTE(review): index 3 is read as the target; confirm this matches the
  column order of the table the samples come from.

  Args:
    batch_size: number of samples per batch.
    samples: indexable collection of samples; shuffled IN PLACE every epoch.
    y_cat: "binding" keeps targets as floats; any other value casts to int.
    mask: forwarded to `onehot` to control masking of padding positions.
  """

  def __init__(self, batch_size, samples, y_cat, mask=True):
    super().__init__()
    self.batch_size = batch_size
    self.samples = samples
    self.y_cat = y_cat
    self.mask = mask
    self.epitope_enc_map = {}
    self.hla_enc_map = {}
    self.init_data()
    self.on_epoch_end()

  def init_data(self):
    """Pre-compute one encoding per distinct peptide and per distinct HLA."""
    for sample in self.samples:
      sequence = sample[0]
      hla = sample[1]
      # Encode each distinct string once; repeats reuse the cached array.
      if sequence not in self.epitope_enc_map:
        self.epitope_enc_map[sequence] = onehot(sequence, 9, self.mask).reshape(1, 9, 21)
      if hla not in self.hla_enc_map:
        self.hla_enc_map[hla] = onehot(hla, 34, self.mask).reshape(1, 34, 21)

  def __len__(self):
    """Number of batches per epoch (the last batch may be smaller)."""
    # Plain int(): the `np.int` alias no longer exists in current NumPy.
    return int(np.ceil(len(self.samples) / float(self.batch_size)))

  def on_epoch_end(self):
    """Reshuffle the samples in place so batches differ between epochs."""
    np.random.shuffle(self.samples)

  def __getitem__(self, idx):
    """Return batch `idx` as `({'epitope': ..., 'hla': ...}, [targets])`."""
    hla_alleles = []
    epitopes = []
    y_vals = []
    batch_sample = self.samples[idx * self.batch_size : (idx+1) * self.batch_size]

    for sample in batch_sample:
      y_val = float(sample[3])

      epitopes.append(self.epitope_enc_map[sample[0]])
      hla_alleles.append(self.hla_enc_map[sample[1]])

      # Regression ("binding") keeps the float; other targets become ints.
      if self.y_cat != "binding":
        y_vals.append(int(y_val))
      else:
        y_vals.append(y_val)

    ret_in = {'epitope': np.array(epitopes), 'hla': np.array(hla_alleles)}
    ret_out = np.array(y_vals)

    return (ret_in, [ret_out],)

# **MODEL 1: BiLSTM**

In [None]:
from keras import *
#from keras import Input
#from keras.layers import Bidrectional

def model_biLSTM():
    """Build the two-layer bidirectional LSTM model with attention pooling.

    Architecture: Input(40, 21) -> BiLSTM(64) -> BiLSTM(64) -> Attention(40)
    -> Dense(64, relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Returns:
        An uncompiled Keras `Model`.

    NOTE(review): the model has a single unnamed (40, 21) input; confirm it
    matches the inputs the training generator feeds it.
    """
    # Imported locally so the function does not depend on which names the
    # wildcard `from keras import *` happens to expose.
    from keras.layers import Bidirectional, CuDNNLSTM, Dense, Dropout, Input
    from keras.models import Model

    x = Input(shape=(40, 21))
    y = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(x)
    y = Bidirectional(CuDNNLSTM(units=64, return_sequences=True))(y)
    y = Attention(40)(y)
    rel = Dense(64, activation="relu")(y)
    drop = Dropout(0.1)(rel)
    out = Dense(1, activation="sigmoid")(drop)
    model = Model(inputs=x, outputs=out)
    return model


# **MODEL 2: LSTM**

In [None]:
# Value marking padded timesteps; the Masking layer skips rows equal to it.
MASK_VALUE = -400
def model_LSTM():
    """Build the masked single-layer LSTM model.

    Architecture: Input(40, 21) -> Masking(MASK_VALUE) -> LSTM(64)
    -> Dense(64) -> Dropout(0.2) -> tanh -> Dense(1, sigmoid).

    Returns:
        An uncompiled Keras `Model`.

    NOTE(review): `return_sequences=True` is kept from the original, so the
    model emits one prediction per timestep; confirm this is intended for a
    scalar target.
    """
    # Imported locally so the function does not depend on which names the
    # wildcard `from keras import *` happens to expose.
    from keras.layers import Activation, Dense, Dropout, Input, LSTM, Masking
    from keras.models import Model

    x = Input(shape=(40, 21))
    y = Masking(mask_value=MASK_VALUE)(x)
    # Plain LSTM is used because the layer must consume the mask produced above.
    y = LSTM(units=64, return_sequences=True)(y)
    dense = Dense(64)(y)
    drop = Dropout(0.2)(dense)
    act = Activation('tanh')(drop)
    out = Dense(1, activation='sigmoid')(act)
    model = Model(inputs=x, outputs=out)
    return model

# **Cross Validation Code**

In [None]:
import keras
from sklearn.model_selection import KFold
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import Sequence
from keras.initializers import glorot_uniform

def train_gen_crossval(nn_model, training_data, y_cat, out_file, mask=True,
                       batch_size=256):
  """Run 10-fold cross-validation and write test predictions to `out_file`.

  For every fold the non-test rows are split 80/20 into train/validation,
  a fresh model is trained with early stopping, the best checkpoint is
  reloaded, and one `hla,epitope,real,predicted` line is written per test row.

  Args:
    nn_model: zero-argument callable returning an uncompiled Keras model.
    training_data: DataFrame whose rows are read positionally as
      [0] peptide sequence, [1] HLA (pseudo-sequence), [3] target.
      NOTE(review): confirm the column order against the CSV.
    y_cat: "binding" trains with mean squared error; any other value
      trains with binary cross-entropy and classification metrics.
    out_file: path of the text file receiving the per-sample predictions.
    mask: forwarded to the encoders so train and test share the encoding.
    batch_size: samples per generator batch (default 256).
  """
  # Split data into training, test sets
  splits = list(KFold(n_splits=10, shuffle=True, random_state=20).split(training_data))

  print('we splitted')

  # One handle for all folds; the context manager closes it even on error.
  with open(out_file, "w") as f:
    for fold, (train_index, test_index) in enumerate(splits):
      keras.backend.clear_session()

      # get train_data, validation_data
      trains = training_data.iloc[train_index, :].values
      train_samples, validation_samples = split_set(trains, 0.2)
      print("split training set")

      test_samples = training_data.iloc[test_index, :].values
      print("got test samples")

      train_generator = DataGenerator(batch_size, train_samples, y_cat, mask)
      print("made training generator:", train_generator)

      validation_generator = DataGenerator(batch_size, validation_samples, y_cat, mask)
      print("made validation generator:", validation_generator)

      model = nn_model()
      print("made model")

      # compile model
      if y_cat == 'binding':
        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss=["mean_squared_error"],
                      metrics=['mean_squared_error'])
      else:
        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss=['binary_crossentropy'],
                      metrics=[keras.metrics.BinaryAccuracy(),
                               keras.metrics.TruePositives(),
                               keras.metrics.TrueNegatives(),
                               keras.metrics.FalsePositives(),
                               keras.metrics.FalseNegatives()])

      weights_path = './' + 'weights_{}.h5'.format(fold)
      print("Model compiled")

      # Keep only the best model (lowest val_loss) seen during this fold.
      ckpt = ModelCheckpoint(weights_path, save_best_only=True,
                             save_weights_only=False, verbose=1,
                             monitor='val_loss', mode='auto')
      early = EarlyStopping(monitor='val_loss', patience=5, verbose=1, min_delta=0.001)

      model.fit(train_generator, epochs=1000,
                steps_per_epoch=len(train_generator),
                validation_steps=len(validation_generator),
                validation_data=validation_generator,
                callbacks=[ckpt, early])
      # loads the best weights saved by the checkpoint
      model.load_weights(weights_path)

      hlas = []
      epitopes = []
      ys = []

      # Encode the test rows exactly like the generators do (same mask flag).
      for sample in test_samples:
        hlas.append(onehot(sample[1], 34, mask).reshape(1, 34, 21))
        epitopes.append(onehot(sample[0], 9, mask).reshape(1, 9, 21))
        ys.append(sample[3])

      results = model.predict({'epitope': np.array(epitopes),
                               'hla': np.array(hlas)})

      results = np.array(results)
      print(results.shape)

      # Write the raw HLA / peptide strings (not their encodings) so every
      # record stays on a single comma-separated line.
      for row, sample in enumerate(test_samples):
        real = ys[row]
        predict_y = results[row]
        print("real: {}, predict: {}".format(real, predict_y))
        f.write("{},{},{},{}\n".format(sample[1], sample[0], real, predict_y))

      print("Finish fold {}...".format(fold))
      print('\n'*8)


# **Cross Validation Helpers**

In [None]:
def split_set(samples, ratio):
  """Shuffle `samples` in place and split them into two consecutive parts.

  Args:
    samples: mutable sequence or array; it is shuffled IN PLACE.
    ratio: fraction of the samples assigned to the second (held-out) part.

  Returns:
    `(first, second)` where `first` holds ceil(len * (1 - ratio)) samples
    and `second` holds the remainder.
  """
  # Imported here so the function works even when no notebook cell
  # has imported `math`.
  import math

  # A single shuffle already yields a uniformly random order.
  np.random.shuffle(samples)

  train_count = math.ceil(len(samples) * (1 - ratio))
  return samples[:train_count], samples[train_count:]

def to_np(arr):
  """Collect the items of any iterable into a NumPy array."""
  return np.array([item for item in arr])

# Training code

In [None]:
# Load the binding training table; the first CSV column becomes the index.
df_train_binding = pd.read_csv(
    "../input/binding-train-pseudo/binding_train_pseudo.csv",
    index_col=0,
)

In [None]:
# Cross-validate the masked LSTM on the binding data.
train_gen_crossval(
    model_LSTM,
    df_train_binding,
    "binding",
    "test_lstm_output.txt",
    mask=True,
)

In [None]:
# Cross-validate the BiLSTM + attention model on the binding data (no masking).
train_gen_crossval(
    model_biLSTM,
    df_train_binding,
    "binding",
    "test_bilstm_output.txt",
    mask=False,
)

In [None]:
from keras.models import *
from keras.utils.generic_utils import CustomObjectScope

def main(binding_model, immunogenicity_model, out_file, test_X, test_y=None, 
         hla_encoding_method="pseudo", attention=False):
  """Score every sample with a binding and an immunogenicity model.

  Loads both saved models, encodes each sample with `onehot`, and writes one
  `sample[0],sample[1],<binding> (log_ic50),<immunogenicity> (binary)` line
  per sample to `<out_file>.txt`.

  Args:
    binding_model: path of the saved binding-affinity model.
    immunogenicity_model: path of the saved immunogenicity model.
    out_file: output path without the `.txt` extension.
    test_X: iterable of samples; `sample[0]` is encoded as the HLA and
      `sample[1]` as the peptide.
    test_y: unused; kept for interface compatibility.
    hla_encoding_method: unused; kept for interface compatibility.
    attention: set True when the saved models contain the custom
      `Attention` layer, so it can be resolved while loading.

  NOTE(review): the input names 'protein'/'ligand' and the default-length
  `onehot` encodings are kept from the original; confirm they match the
  input names and shapes of the saved models.
  """
  # The custom layer is only registered when the caller asks for it.
  custom_objects = {'Attention': Attention} if attention else None
  BA_model = load_model(binding_model, custom_objects=custom_objects)
  IMM_model = load_model(immunogenicity_model, custom_objects=custom_objects)

  # Open for writing; the context manager flushes and closes the file.
  with open("%s.txt" % (out_file), "w") as outfile:
    for sample in test_X:
      hla = onehot(sample[0])
      peptide = onehot(sample[1])
      model_inputs = {
          'protein': np.array([hla]),
          'ligand': np.array([peptide]),
      }
      bind_out = BA_model.predict(model_inputs)
      imm_out = IMM_model.predict(model_inputs)
      # One record per line; [0][0] picks the single prediction of the batch.
      outfile.write('{},{},{} (log_ic50),{} (binary)\n'.format(
          sample[0], sample[1], bind_out[0][0], imm_out[0][0]))