In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# 1-based position of every symbol in the 21-letter alphabet: the 20 standard
# amino acids in alphabetical order of their one-letter code, then '-' which
# the encoders below use for padding and for unknown symbols.
aa_idx = {symbol: position
          for position, symbol in enumerate('ACDEFGHIKLMNPQRSTVWY-', start=1)}

def blosum(seq, k=15):
  """Encode a peptide as a flat vector of BLOSUM scores of length ``k * 21``.

  The sequence is truncated, or right-padded with '*', to exactly ``k``
  symbols. Each symbol is looked up in ``blosum_mat`` and columns 0-19 plus
  column 23 of its row are kept.

  NOTE(review): ``blosum_mat`` is not defined in the visible part of the
  notebook. It has to map every symbol (including '*') to a row of at least
  24 scores; presumably columns 0-19 are the standard residues and column 23
  is '*' -- confirm against the matrix that is actually loaded.

  Args:
    seq: peptide string.
    k: number of positions to encode. Defaults to 15, like ``onehot`` and
      ``softhot``, so that ``encode_seq`` can call all encoders the same way.

  Returns:
    1-D numpy array with ``k * 21`` entries.

  Raises:
    KeyError: presumably, if a symbol is missing from ``blosum_mat``.
  """
  # The per-sequence debug print was dropped: it flooded the output when the
  # encoder was applied to a whole column.
  s = list(seq)
  if len(s) < k:
    s = s + ['*'] * (k - len(s))
  else:
    s = s[:k]
  kept_columns = list(range(20)) + [23]
  rows = pd.DataFrame([blosum_mat[symbol] for symbol in s])
  return rows.iloc[:, kept_columns].values.flatten()

def onehot(seq, k=15):
  """One-hot encode a peptide into a flat vector of length ``k * 21``.

  The sequence is truncated, or right-padded with '-', to exactly ``k``
  symbols. Each position becomes a 21-wide row holding a single 1 at
  ``aa_idx[symbol] - 1``; symbols missing from ``aa_idx`` are encoded as '-'.

  Args:
    seq: peptide string.
    k: number of positions to encode.

  Returns:
    1-D float numpy array with ``k * 21`` entries.
  """
  # The per-sequence debug print was dropped: it flooded the output when the
  # encoder was applied to a whole column.
  s = list(seq)
  if len(s) < k:
    s = s + ['-'] * (k - len(s))
  else:
    s = s[:k]

  rows = []
  for pos in range(k):
    symbol = s[pos] if s[pos] in aa_idx else '-'
    row = np.zeros(21)
    row[aa_idx[symbol] - 1] = 1
    rows.append(row)
    # NOTE: the original rebound `row` to a vector of -400000 for '-'
    # positions *after* appending it, so that value never reached the output.
    # The dead assignment was removed; padding stays a regular one-hot row.
  return np.array(rows).flatten()

def softhot(seq, k=15):
  """Label-smoothed one-hot encoding of a peptide, flat with length ``k * 21``.

  The sequence is truncated, or right-padded with '-', to exactly ``k``
  symbols. Each position becomes a 21-wide row filled with ``0.1 / 20`` and
  with 0.9 at ``aa_idx[symbol] - 1``; symbols missing from ``aa_idx`` are
  encoded as '-'.

  Args:
    seq: peptide string.
    k: number of positions to encode.

  Returns:
    1-D float numpy array with ``k * 21`` entries.
  """
  s = list(seq)
  if len(s) < k:
    s = s + ['-'] * (k - len(s))
  else:
    s = s[:k]

  rows = []
  for pos in range(k):
    symbol = s[pos] if s[pos] in aa_idx else '-'
    row = np.ones(21) * (0.1 / 20)
    row[aa_idx[symbol] - 1] = 0.9
    rows.append(row)
    # NOTE: the original rebound `row` to a vector of -400000 for '-'
    # positions *after* appending it, so that value never reached the output.
    # The dead assignment was removed; padding stays a regular smoothed row.
  return np.array(rows).flatten()

def encode_seq(method, peptide_seqs):
  """Apply the encoder named by `method` to every sequence in a Series.

  Args:
    method: one of 'ONEHOT', 'BLOSUM' or 'SOFTHOT'.
    peptide_seqs: pandas Series of peptide strings.

  Returns:
    Series with one encoded vector per input sequence.

  Raises:
    KeyError: if `method` is not one of the names above.
  """
  encoder = {'ONEHOT': onehot, 'BLOSUM': blosum, 'SOFTHOT': softhot}[method]
  return peptide_seqs.apply(encoder)

In [None]:
def AAC(peptide_seq):
    """Amino-acid composition: fraction of each of the 20 standard residues.

    Symbols that are not in ``aa_idx`` are ignored. The gap symbol '-' is
    ignored as well: it sits at index 21 of ``aa_idx`` and would otherwise add
    a 21st bin, making the result longer than the 20 features per sequence
    that ``process_input_df`` stacks into a fixed-width matrix.

    Args:
        peptide_seq: peptide or pseudo-sequence string.

    Returns:
        Float numpy array of length 20, ordered like ``aa_idx``. The entries
        sum to 1, or are all 0 when the sequence holds no standard residue
        (instead of dividing by zero).
    """
    n_amino_acids = 20
    # aa_idx is 1-based; shift to 0-based bins and keep standard residues only.
    bins = [aa_idx[symbol] - 1 for symbol in peptide_seq if symbol in aa_idx]
    bins = [b for b in bins if b < n_amino_acids]
    if not bins:
        return np.zeros(n_amino_acids)

    counts = np.bincount(np.array(bins, dtype=int), minlength=n_amino_acids)
    return counts / len(bins)

In [None]:
# Map AAC encoding onto each pseudo-seq, HLA seq
def pseudo_to_AAC(df):
    """Return a Series with the AAC vector of every entry in ``df['HLA']``."""
    hla_column = df['HLA']
    return hla_column.apply(AAC)

def to_np(vals):
    """Stack an iterable of per-row values into a single numpy array."""
    return np.array([item for item in vals])

def process_input_df(df):
    """Turn a frame of peptide/HLA pairs into a feature matrix and a target vector.

    Reads the 'Sequence', 'HLA' and 'Y_val' columns. Both sequence columns are
    encoded row by row with ``AAC`` and stacked with ``to_np``.

    Returns:
        (X_full, y): X_full has the peptide encoding first and the HLA
        pseudo-sequence encoding second along axis 1 (40 columns when each
        encoding has length 20); y is the 'Y_val' column as a numpy array.
    """
    def encode_column(column):
        # Series of per-row vectors -> one 2-D array.
        return to_np(df[column].apply(AAC).values)

    hla_features = encode_column('HLA')
    peptide_features = encode_column('Sequence')
    features = np.concatenate((peptide_features, hla_features), axis=1)
    targets = df['Y_val'].values

    return features, targets

In [None]:
# Load the binding training set, keep peptides of length <= 9 and encode them.
df_train_binding = pd.read_csv("../input/binding-train-pseudo/binding_train_pseudo.csv", index_col=0)
df_train_binding = df_train_binding.loc[df_train_binding['Pep_Length'] <= 9]
X, y = process_input_df(df_train_binding)


In [None]:
# Sanity check: expect (n_samples, 40) -- peptide encoding followed by HLA encoding.
X.shape

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
from keras.wrappers.scikit_learn import KerasRegressor
from keras.models import Sequential
from keras.layers import Dense

def wider_model(hidden_units=20, input_dim=40):
    """Build and compile a one-hidden-layer regression network.

    Architecture:
      * input: ``input_dim`` features (default 40),
      * hidden: ``hidden_units`` fully connected ReLU units (default 20),
      * output: a single unit without activation.

    Compiled with mean-squared-error loss and the Adam optimizer.

    Targets listed in the original notes for this network:
      - binding strength between 0 and 1 inclusive
      - 0 (no binding) or 1 (binding)
      - 0 (no immunogen) or 1 (immunogen)
    """
    layers = [
        Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

def model_30():
    """Zero-argument builder: the regression network with 30 hidden units."""
    return wider_model(30)

def model_40():
    """Zero-argument builder: the regression network with 40 hidden units."""
    return wider_model(40)

def model_50():
    """Zero-argument builder: the regression network with 50 hidden units."""
    return wider_model(50)

def model_60():
    """Zero-argument builder: the regression network with 60 hidden units."""
    return wider_model(60)

def model_70():
    """Zero-argument builder: the regression network with 70 hidden units."""
    return wider_model(70)

def model_80():
    """Zero-argument builder: the regression network with 80 hidden units."""
    return wider_model(80)


In [None]:
def wider_classif_model(hidden_units=20, input_dim=40):
    """Build and compile a one-hidden-layer binary classifier.

    Architecture:
      * input: ``input_dim`` features (default 40),
      * hidden: ``hidden_units`` fully connected sigmoid units (default 20),
      * output: a single sigmoid unit, i.e. a value in [0, 1].

    Compiled with binary cross-entropy loss and the Adam optimizer.

    Targets listed in the original notes for this network:
      - binding strength between 0 and 1 inclusive
      - 0 (no binding) or 1 (binding)
      - 0 (no immunogen) or 1 (immunogen)
    """
    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, kernel_initializer='normal', activation='sigmoid'))
    # 'binary_crossentropy' treats the network output as a probability (its
    # default is from_logits=False). The output layer used to be linear, which
    # fed unbounded values into the loss; squash it with a sigmoid.
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model

def model_40():
    """Zero-argument builder: the classifier network with 40 hidden units.

    NOTE(review): this reuses the name of the regression builder defined in an
    earlier cell and replaces it once this cell has run.
    """
    return wider_classif_model(40)

def model_60():
    """Zero-argument builder: the classifier network with 60 hidden units.

    NOTE(review): this reuses the name of the regression builder defined in an
    earlier cell and replaces it once this cell has run.
    """
    return wider_classif_model(60)

def model_80():
    """Zero-argument builder: the classifier network with 80 hidden units.

    NOTE(review): this reuses the name of the regression builder defined in an
    earlier cell and replaces it once this cell has run.
    """
    return wider_classif_model(80)

# NOTE(review): builds a default classifier and binds it to `x`; `x` is not read
# anywhere in the visible notebook -- looks like a leftover smoke test.
x = wider_classif_model()

In [None]:
# evaluate model with standardized dataset

def evaluator(func, i, X, y, scoring='neg_mean_squared_error',
              epochs=5, batch_size=5, n_splits=5):
    """Cross-validate a Keras model inside a standardising pipeline and print the scores.

    Args:
        func: zero-argument callable returning a compiled Keras model; passed
            to ``KerasRegressor`` as ``build_fn``.
        i: integer label printed in front of the results (the callers pass the
            number of hidden units).
        X: feature matrix.
        y: target vector.
        scoring: scikit-learn scoring name handed to ``cross_val_score``.
        epochs: training epochs per fold.
        batch_size: mini-batch size.
        n_splits: number of (unshuffled) K-fold splits.

    Returns:
        None. The mean, the standard deviation and the per-fold scores are printed.

    NOTE(review): the estimator is always a ``KerasRegressor``, also when a
    classification metric such as 'roc_auc' is requested -- confirm the scorer
    accepts it.
    """
    print("Got model")
    pipeline = Pipeline([
        ('standardize', StandardScaler()),
        ('mlp', KerasRegressor(build_fn=func, epochs=epochs, batch_size=batch_size, verbose=1)),
    ])
    kfold = KFold(n_splits=n_splits)
    results = cross_val_score(pipeline, X, y, cv=kfold, scoring=scoring)
    # Label the summary with the metric actually used; it used to say "MSE"
    # even when another scoring (e.g. 'roc_auc') was requested.
    print("%d: %.5f (%.5f) %s" % (i, results.mean(), results.std(), scoring))
    print(("%d:" % i), results)

In [None]:
# Baseline regression run: 20 hidden units, scored with the default (negative MSE).
evaluator(func=wider_model, i=20, X=X, y=y)

In [None]:
# By the time this cell runs, `model_40` refers to the classifier builder (its
# second definition shadows the regression one), so build the 40-unit
# regression network explicitly for this MSE evaluation.
evaluator(lambda: wider_model(40), 40, X, y)

In [None]:
# By the time this cell runs, `model_60` refers to the classifier builder (its
# second definition shadows the regression one), so build the 60-unit
# regression network explicitly for this MSE evaluation.
evaluator(lambda: wider_model(60), 60, X, y)

In [None]:
# By the time this cell runs, `model_80` refers to the classifier builder (its
# second definition shadows the regression one), so build the 80-unit
# regression network explicitly for this MSE evaluation.
evaluator(lambda: wider_model(80), 80, X, y)

In [None]:
# Cut-off that turns a continuous score into a 0/1 label.
# NOTE(review): presumably 500 is an affinity cut-off and 50000 the scale of a
# 1 - log(value)/log(50000) score transform -- confirm against how Y_val was built.
import math
THRESHOLD = 1 - math.log(500, 50000)

def binarize_immuno(score):
    """Return 1 when `score` is at least THRESHOLD, otherwise 0."""
    return 1 if score >= THRESHOLD else 0

# Binarised copy of the training frame: Y_val becomes 0/1 via binarize_immuno,
# every other column is left untouched.
df_train_binding_bin = df_train_binding.assign(
    Y_val=df_train_binding['Y_val'].apply(binarize_immuno)
)

df_train_binding_bin

In [None]:
from sklearn.metrics import roc_auc_score
import keras

# Encode the binarised frame: only Y_val differs from df_train_binding, so
# X_bin should match X while y_bin holds the 0/1 labels.
X_bin, y_bin = process_input_df(df_train_binding_bin)

def new_evaluator(mod, i, X, y, y_binary, out_file):
    """K-fold evaluate a model trained on continuous targets, scored by ROC AUC on binary labels.

    For each of 5 unshuffled folds a fresh model is built with ``mod(i)``,
    trained on the training part of ``(X, y)`` and scored on the held-out part
    against ``y_binary``.

    Args:
        mod: callable taking the number of hidden units and returning a
            compiled Keras model (e.g. ``wider_model``).
        i: number of hidden units passed to ``mod``.
        X: feature matrix, indexable by row.
        y: continuous training targets, aligned with ``X``.
        y_binary: 0/1 labels aligned with ``X``, used only for scoring.
        out_file: path of the text file that receives one AUC line per fold.

    Returns:
        None. The AUCs are printed and written to ``out_file``.

    NOTE(review): the folds are not shuffled (same as ``evaluator``);
    ``roc_auc_score`` fails on a fold whose labels are all one class.
    """
    print("Got model")
    splits = 5

    X = np.asarray(X)
    y = np.asarray(y)
    y_binary = np.asarray(y_binary)

    # `with` closes (and flushes) the metrics file even if a fold fails; the
    # handle used to be left open.
    with open(out_file, "w") as f:
        # A separate `fold` counter keeps the `i` argument intact: the loop
        # used to overwrite it, so models were built with 0..4 hidden units.
        for fold, (train_idx, test_idx) in enumerate(KFold(n_splits=splits).split(X)):
            keras.backend.clear_session()

            model = mod(i)
            # Train and score on disjoint rows; every "fold" used to fit and
            # predict on the full data set.
            model.fit(X[train_idx], y[train_idx], batch_size=32, epochs=5)
            y_predict = np.ravel(model.predict(X[test_idx]))

            score = roc_auc_score(y_binary[test_idx], y_predict)
            print("AUC:", score)
            f.write("AUC on fold %d : %f\n" % (fold + 1, score))


In [None]:
# Regression network with 20 hidden units, scored by AUC against the binarised labels.
new_evaluator(mod=wider_model, i=20, X=X, y=y, y_binary=y_bin, out_file="mlp_metrics.txt")

In [None]:
# ROC AUC needs 0/1 ground truth, so evaluate on the binarised labels rather
# than on the continuous scores in `y`.
evaluator(wider_classif_model, 20, X_bin, y_bin, scoring='roc_auc')

In [None]:
# Build the classifier with the 40 hidden units the label advertises (passing
# `wider_classif_model` itself silently uses its default of 20), and score
# ROC AUC against the binarised labels rather than the continuous `y`.
evaluator(lambda: wider_classif_model(40), 40, X_bin, y_bin, scoring='roc_auc')

In [None]:
# Build the classifier with the 60 hidden units the label advertises (passing
# `wider_classif_model` itself silently uses its default of 20), and score
# ROC AUC against the binarised labels rather than the continuous `y`.
evaluator(lambda: wider_classif_model(60), 60, X_bin, y_bin, scoring='roc_auc')

In [None]:
# Build the classifier with the 80 hidden units the label advertises (passing
# `wider_classif_model` itself silently uses its default of 20), and score
# ROC AUC against the binarised labels rather than the continuous `y`.
evaluator(lambda: wider_classif_model(80), 80, X_bin, y_bin, scoring='roc_auc')

# **Debugging Work**

In [None]:
# Debugging: reload and re-filter the training frame, then show the size of the
# binarised frame built earlier.
df_train_binding = pd.read_csv("../input/binding-train-pseudo/binding_train_pseudo.csv", index_col=0)
df_train_binding = df_train_binding.loc[df_train_binding['Pep_Length'] <= 9]
df_train_binding_bin.shape