In [None]:
import random
import time

import joblib
import numpy as np
import pandas as pd
from IPython.utils import io
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the three input tables: protein descriptors, drug descriptors and the
# DrugBank drug-target table. All paths are relative to the working directory.
df_proteins, df_drugs, df_dpi = map(
    pd.read_csv,
    (
        'protein descriptors pydpi.csv',
        'drugbank drugs mordred + ECFP6 dropped columns.csv',
        'all drugbank targets.csv',
    ),
)

In [None]:
# Expand every target row into one [drug, protein] pair per listed drug.
# 'Drug IDs' holds a '; '-separated string of drug identifiers and 'STRING'
# holds the protein identifier of that row.
all_combos = [
    [drug_id, protein_id]
    for id_field, protein_id in zip(df_dpi['Drug IDs'], df_dpi['STRING'])
    for drug_id in id_field.split('; ')
]

In [None]:
# Column labels of the protein table; a protein is usable only if it is one of them.
targets = list(df_proteins.columns)

In [None]:
# Map each drug ID to the row of df_drugs that holds its descriptors.
# When an ID occurs more than once, the last occurrence wins.
drug_index = {
    drug_id: row_number
    for row_number, drug_id in enumerate(df_drugs['IDs'])
}

In [None]:
# Hold out 1000 randomly chosen drugs as the "unseen drug" validation space.
# NOTE(review): `drug_names` is only assigned in a later cell of this
# notebook; confirm the intended execution order on a fresh kernel.
val_drugspace = random.sample(list(set(drug_names)), 1000)
_held_out = set(val_drugspace)

# Positive validation pairs: known interactions of a held-out drug for which
# the drug has a descriptor row and the protein is a known target column.
drugset, targetset, out = [], [], []
for drug, protein in all_combos:
    if drug not in _held_out or drug not in drug_index:
        continue
    if protein not in targets:
        # Report proteins that have no descriptor column.
        print(protein)
        continue
    drugset.append(drug)
    targetset.append(protein)
    out.append(1)
    
# Add one random negative (drug, target) pair per positive so the validation
# labels are balanced. A pair is accepted as negative when the target is not
# among the drug's known interactions in `repeats`.
# NOTE(review): `target_names` and `repeats` are only assigned in later cells
# of this notebook; confirm the intended execution order on a fresh kernel.
count = len(drugset) * 2
if len(drugset) < count and not (val_drugspace and target_names):
    # Previously a bare `except` swallowed the resulting IndexError and the
    # loop never terminated.
    raise ValueError('cannot sample negatives from an empty drug or target pool')
while len(drugset) < count:
    d1 = random.choice(val_drugspace)
    t1 = random.choice(target_names)
    # Skip drugs without an interaction record and pairs that are known positives.
    if d1 not in repeats or t1 in repeats[d1]:
        continue
    drugset.append(d1)
    targetset.append(t1)
    out.append(0)
    # Report progress once per accepted sample rather than once per attempt.
    if len(drugset) % 1000 == 0:
        print(len(drugset))
    
# Assemble the validation feature table: each row is the drug's descriptor
# values followed by the protein's descriptor values.
table = []
kept_labels = []
for r in range(len(drugset)):
    if r % 500 == 0:
        print(r)
    try:
        index1 = drug_index[drugset[r]]
    except KeyError:
        print('drug not found')
        continue
    row1 = df_drugs.loc[index1].drop('IDs').to_list()
    try:
        row2 = df_proteins[targetset[r]].to_list()
    except KeyError:
        print('protein not found')
        continue
    table.append(tuple(row1 + row2))
    # Record the label only for rows that were actually built, so labels and
    # rows stay aligned even when a pair is skipped above.
    kept_labels.append(out[r])

# Drug descriptor names come from the drug table's columns; protein
# descriptor names come from the 'IDs' column of the protein table.
headers = df_drugs.columns.drop('IDs').to_list() + df_proteins['IDs'].to_list()
headers = [str(x) for x in headers]
val_set = pd.DataFrame(table, columns=headers)

del table

out = kept_labels
val_results = out

# Keep only the validation columns that also occur in the reference frame
# `df` (its first two columns are ignored). The reference names are collected
# once into a set instead of rebuilding a list for every column.
# NOTE(review): `df` is only assigned in a later cell of this notebook;
# confirm the intended execution order on a fresh kernel.
reference_columns = set(df.columns.to_list()[2:])
drop = [col for col in val_set.columns if col not in reference_columns]

val_set = val_set.drop(columns=drop)

In [None]:
# Positive training pairs: known interactions of every drug that is NOT in
# the held-out validation space, provided the drug has a descriptor row and
# the protein is a known target column.
_held_out = set(val_drugspace)
drugset, targetset, out = [], [], []
for drug, protein in all_combos:
    if drug in _held_out or drug not in drug_index:
        continue
    if protein not in targets:
        # Report proteins that have no descriptor column.
        print(protein)
        continue
    drugset.append(drug)
    targetset.append(protein)
    out.append(1)

In [None]:
# Unique drugs and targets that occur in the positive training pairs; these
# lists are the pools that random negative pairs are drawn from.
drug_names = list(set(drugset))
target_names = list(set(targetset))
print(f'Drug count: {len(drug_names)}, Target count: {len(target_names)}')

In [None]:
# Known targets per drug, collected from every pair in all_combos
# (duplicates are kept). Used to reject known positives when sampling negatives.
repeats = {}
for drug_id, protein_id in all_combos:
    repeats.setdefault(drug_id, []).append(protein_id)

In [None]:
# Add one random negative (drug, target) pair per positive so the training
# labels are balanced. A pair is accepted as negative when the target is not
# among the drug's known interactions in `repeats`.
count = len(drugset) * 2
if len(drugset) < count and not (drug_names and target_names):
    # Previously a bare `except` swallowed the resulting IndexError and the
    # loop never terminated.
    raise ValueError('cannot sample negatives from an empty drug or target pool')
while len(drugset) < count:
    d1 = random.choice(drug_names)
    t1 = random.choice(target_names)
    # Skip drugs without an interaction record and pairs that are known positives.
    if d1 not in repeats or t1 in repeats[d1]:
        continue
    drugset.append(d1)
    targetset.append(t1)
    out.append(0)
    # Report progress once per accepted sample rather than once per attempt.
    if len(drugset) % 1000 == 0:
        print(len(drugset))

In [None]:
# construct dataset
#
# Each row is the drug's descriptor values followed by the protein's
# descriptor values, in the order given by drugset / targetset.
table = []
kept_labels = []
for r in range(len(drugset)):
    if r % 500 == 0:
        print(r)
    try:
        index1 = drug_index[drugset[r]]
    except KeyError:
        print('drug not found')
        continue
    row1 = df_drugs.loc[index1].drop('IDs').to_list()
    try:
        row2 = df_proteins[targetset[r]].to_list()
    except KeyError:
        print('protein not found')
        continue
    table.append(tuple(row1 + row2))
    # Record the label only for rows that were actually built, so labels and
    # rows stay aligned even when a pair is skipped above.
    kept_labels.append(out[r])

# Drug descriptor names come from the drug table's columns; protein
# descriptor names come from the 'IDs' column of the protein table.
headers = df_drugs.columns.drop('IDs').to_list() + df_proteins['IDs'].to_list()
headers = [str(x) for x in headers]
file = pd.DataFrame(table, columns=headers)

del table

# Keep only the columns that also occur in the reference frame `df` (its
# first two columns are ignored); the names are collected once into a set.
# NOTE(review): `df` is only assigned in a later cell of this notebook;
# confirm the intended execution order on a fresh kernel.
reference_columns = set(df.columns.to_list()[2:])
drop = [col for col in file.columns if col not in reference_columns]

file = file.drop(columns=drop)

# `out` is rebound to the aligned labels because the next cell inserts it
# into `file` as the 'Binding' column.
out = kept_labels
results = out

In [None]:
# Put the 0/1 interaction label in the first column of the feature table.
# Assigning when the column already exists keeps the cell re-runnable
# (DataFrame.insert raises if the column is already present).
if 'Binding' in file.columns:
    file['Binding'] = out
else:
    file.insert(0, 'Binding', out)
# NOTE(review): the statement that used to follow,
#   pd.DataFrame(file, columns=['Binding'], <drug columns> + <protein columns>)
# was not valid Python (positional argument after a keyword argument). After
# the insert the layout is already 'Binding' followed by the feature columns,
# so no re-indexing is performed; confirm no other column order was intended.

In [None]:
# Scaler for the feature matrix; it is fitted after the train/test split.
scaler = StandardScaler()
# errors='ignore': no visible cell creates a 'Combination' column, so an
# unconditional drop would raise KeyError. It is still removed when present.
x = file.drop(columns=['Binding', 'Combination'], errors='ignore')
y = file['Binding']

In [None]:
# Random 80/20 split. The scaler learns its statistics from the training
# part only and the same transform is then applied to the test part.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)
x_train = scaler.fit(x_train).transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Persist the fitted scaler so the same transform can be reapplied later.
# joblib exposes dump(), not save(); dump() returns the list of written files.
s = joblib.dump(scaler, 'scaler.save')

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import tensorflow_addons as tfa

In [None]:
# Feed both feature matrices to Keras with the same dtype.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
# One-hot encode the 0/1 labels. The width is fixed at 2 so a split that
# happens to contain a single class still matches the 2-unit output layer.
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

In [None]:
# Two-hidden-layer MLP with dropout. The last layer is linear, i.e. the
# network outputs one logit per class.
n_cols = x_train.shape[1]
model = Sequential([
    Dense(n_cols + 1, activation='relu', input_shape=(n_cols,)),
    Dropout(0.2),
    # Integer division: the unit count must be an int, not the float n_cols / 2.
    Dense(n_cols // 2, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='linear'),
])

In [None]:
# The model emits logits, so both the loss and the AUC metric are told to
# expect logits. AUC otherwise requires predictions inside [0, 1].
model.compile(
    optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=[
        tf.keras.metrics.AUC(from_logits=True),
        tfa.metrics.F1Score(num_classes=2),
    ],
)

In [None]:
# Train for at most 50 epochs; the early-stopping callback (patience 3)
# watches the (x_test, y_test) split passed as validation data.
early_stopping_monitor = EarlyStopping(patience=3)
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping_monitor],
)

In [None]:
# temp scaling code obtained here: https://sourajit16-02-93.medium.com/neural-network-calibration-46997f8c872c

# Outputs of the trained network on the held-out split and their one-hot
# labels. `y_pred` was never assigned in the notebook, so it is computed here.
y_pred = tf.convert_to_tensor(model.predict(x_test))
y_true = tf.convert_to_tensor(y_test)

# Scalar temperature, optimised while the network weights stay fixed.
temp = tf.Variable(initial_value=1.0, trainable=True)

def compute_loss():
    """Return the mean softmax cross-entropy of the temperature-scaled outputs.

    Divides `y_pred` by the current value of `temp` and compares the result
    with `y_true`.
    """
    y_pred_model_w_temp = tf.math.divide(y_pred, temp)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred_model_w_temp)
    )
    return loss

optimizer = tf.optimizers.Adam(learning_rate=0.01)

print('Temperature Initial value: {}'.format(temp.numpy()))

# 300 Adam steps on the single temperature variable.
for _ in range(300):
    optimizer.minimize(compute_loss, var_list=[temp])

print('Temperature Final value: {}'.format(temp.numpy()))

In [None]:
# 5-fold cross-validation; samples are shuffled before being assigned to folds.
kf = KFold(shuffle=True, n_splits=5)

In [None]:
from sklearn.utils import shuffle

# Randomly permute the rows of the assembled table; the result is the frame
# the cross-validation arrays are built from.
df = shuffle(file)

In [None]:
# Split the shuffled frame into a feature matrix and a label vector.
# The label column built in this notebook is 'Binding'. 'Synergy' and
# 'Combination' are not created by any visible cell, so they are used or
# dropped only when present.
label_column = 'Binding' if 'Binding' in df.columns else 'Synergy'
file = np.array(df.drop(columns=['Binding', 'Combination', 'Synergy'], errors='ignore'))
results = np.array(df[label_column])

In [None]:
from sklearn.metrics import balanced_accuracy_score, multilabel_confusion_matrix, f1_score, confusion_matrix, accuracy_score, roc_auc_score, precision_score, recall_score, average_precision_score, roc_curve, auc, precision_recall_curve

In [None]:
# Per-fold metric accumulators.
acc = []
roc = []
prc = []
prec = []
f1 = []
rec = []
spec = []
# Per-fold ROC and precision-recall curve points.
curve = []
prc_curve = []
# Per-fold artefacts, keyed by 'model<k>' / 'scaler<k>' / 'set<k>'.
models = {}
scalers = {}
test_sets = {}
test_results = {}
iteration = 0

for train, val in kf.split(file, results):
    iteration += 1
    x_train = file[train]
    y_train = to_categorical(results[train])
    x_val = file[val]
    y_val = results[val]

    # Fit the scaler on the training fold only, then apply it to the held-out fold.
    scaler = StandardScaler()

    x_train = scaler.fit_transform(x_train).astype('float32')
    x_val = scaler.transform(x_val).astype('float32')

    model = Sequential()
    n_cols = x_train.shape[1]
    model.add(Dense(n_cols + 1, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.2))
    # Integer division: the unit count must be an int, not the float n_cols / 2.
    model.add(Dense(n_cols // 2, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # NOTE(review): `Temperature` is not defined in any visible cell; confirm
    # where this layer comes from.
    model.add(Temperature())

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),
                  loss='categorical_crossentropy',
                  metrics=[tf.keras.metrics.AUC(), tfa.metrics.F1Score(num_classes=2)])

    early_stopping_monitor = EarlyStopping(patience=3)
    model.fit(x_train, y_train, validation_split=0.2, epochs=50, batch_size = 32, callbacks=[early_stopping_monitor])

    predictions = model.predict(x_val)

    # Score of the positive class (column 1) and the hard label derived from
    # it; ties go to class 1, as in the original element-wise loop.
    positive_scores = predictions[:, 1]
    binary_pred = np.where(predictions[:, 0] > predictions[:, 1], 0, 1)

    acc.append(accuracy_score(y_val, binary_pred))
    # Ranking metrics need continuous scores; hard 0/1 labels collapse the
    # curve to a single operating point.
    roc.append(roc_auc_score(y_val, positive_scores))
    prc.append(average_precision_score(y_val, positive_scores))
    f1.append(f1_score(y_val, binary_pred))
    rec.append(recall_score(y_val, binary_pred))
    prec.append(precision_score(y_val, binary_pred))
    # Specificity is the recall of the negative class.
    spec.append(recall_score(y_val, binary_pred, pos_label=0))

    print(roc[-1])

    fpr, tpr, thresholds = roc_curve(y_val, positive_scores)
    curve.append([fpr, tpr, thresholds])
    precision, recall, thresholds = precision_recall_curve(y_val, positive_scores)
    prc_curve.append([precision, recall, thresholds])

    models[f'model{iteration}'] = model
    scalers[f'scaler{iteration}'] = scaler
    test_sets[f'set{iteration}'] = x_val
    test_results[f'set{iteration}'] = y_val