In [None]:
import ast
import gc
import itertools
import random
import time

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from IPython.utils import io
from keras.callbacks import EarlyStopping
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.metrics import average_precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# obtain datasets

# Drug-pair table; later cells read its 'Drug 1' and 'Drug 2' columns.
df_combos = pd.read_csv(filepath_or_buffer='synthetic drugcomb combos.csv')

In [None]:
# Reduce every row to an unordered drug pair and keep one copy of each pair.
# A frozenset treats (A, B) and (B, A) as the same key, so a single pass is
# enough to de-duplicate; a row naming the same drug twice collapses to a
# one-element list.  The order of `combos` is arbitrary (set iteration order).
combos = [
    list(pair)
    for pair in set(map(frozenset, zip(df_combos['Drug 1'], df_combos['Drug 2'])))
]

In [None]:
# One-element entries come from rows whose two drugs were identical; they are
# not real pairs, so collect them in `drop` and keep everything else.
drop = [pair for pair in combos if len(pair) == 1]
combos = [pair for pair in combos if len(pair) != 1]

In [None]:
# Every drug that occurs in at least one pair, in first-appearance order
# (dict keys preserve insertion order and drop repeats).
drugspace = list(dict.fromkeys(
    drug for pair in combos for drug in (pair[0], pair[1])
))

In [None]:
# Independent copy of the full drug list: `all_drugs` is shuffled in place by
# a later cell, and `drugspace` is re-bound to subsets of it, so the two names
# must not share one list object.
all_drugs = list(drugspace)

In [None]:
# Load the per-drug descriptor table and remove low-variance feature columns.
descriptors = pd.read_csv('drugcomb db drug descriptors.csv')
descriptor_features = descriptors.drop(columns='IDs')

# VarianceThreshold removes features whose variance is below 0.02.  The 'IDs'
# column is left out of the fit, so it is never a candidate for removal.
var = VarianceThreshold(0.02)
var.fit(descriptor_features)
cols = var.get_feature_names_out(input_features=descriptor_features.columns)

retained = set(cols)
drop = [name for name in descriptor_features.columns if name not in retained]

descriptors = descriptors.drop(columns=drop)

In [None]:
# Load the per-drug target table and remove low-variance feature columns.
targets = pd.read_csv('drugcomb targets.csv')
target_features = targets.drop(columns='IDs')

# VarianceThreshold removes features whose variance is below 0.02.  The 'IDs'
# column is left out of the fit, so it is never a candidate for removal.
var = VarianceThreshold(0.02)
var.fit(target_features)
cols = var.get_feature_names_out(input_features=target_features.columns)

retained = set(cols)
drop = [name for name in target_features.columns if name not in retained]
targets = targets.drop(columns=drop)

In [None]:
# Labels for one combined feature row: the descriptor and target columns of the
# first drug (each frame's first column is skipped; presumably 'IDs', confirm),
# followed by the same labels suffixed with ' - 1' for the second drug.
single_drug_names = descriptors.columns.to_list()[1:] + targets.columns.to_list()[1:]
temp = [label + ' - 1' for label in single_drug_names]
col_names = single_drug_names + temp

In [None]:
# Lookup tables from drug ID to the row label holding that drug, one per frame.
# If an ID occurs more than once, the last row wins.
index_dict = {descriptors.loc[row, 'IDs']: row for row in range(len(descriptors))}

tar_dict = {targets.loc[row, 'IDs']: row for row in range(len(targets))}

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# set validation dataset

def create_validation_set(val_drugspace):
    """Build a labelled feature table restricted to the drugs in ``val_drugspace``.

    Positives are the entries of ``combos`` whose two drugs both belong to
    ``val_drugspace``.  The same number of negatives is drawn at random from
    pairs of those drugs that are not listed in ``combos``.  A pair is skipped
    when either drug has no row in ``descriptors`` or ``targets``.

    Parameters
    ----------
    val_drugspace : sequence
        Drug identifiers that make up the held-out drug space.

    Returns
    -------
    val_set : pandas.DataFrame
        One row per kept pair, with columns ``col_names``.
    val_results : list of int
        1 for positives, 0 for negatives, aligned row-for-row with ``val_set``.
    val_combos : list of list
        The ``[drug_a, drug_b]`` pair behind each row of ``val_set``.
    """
    drug_pool = list(val_drugspace)

    # Partner lists, limited to pairs that lie entirely inside the drug pool.
    partners = {drug: [] for drug in drug_pool}
    for pair in combos:
        if pair[0] in partners and pair[1] in partners:
            partners[pair[0]].append(pair[1])
            partners[pair[1]].append(pair[0])

    print('setting positive combos')
    positive_keys = {
        frozenset((drug, partner)) for drug in drug_pool for partner in partners[drug]
    }
    positive_pairs = [list(key) for key in positive_keys]

    # Never ask for more negatives than there are non-combination pairs,
    # otherwise the sampling loop below could not terminate.
    n_drugs = len(partners)
    available = n_drugs * (n_drugs - 1) // 2 - len(positive_keys)
    n_wanted = min(len(positive_pairs), max(available, 0))

    negative_pairs = []
    negative_keys = set()  # accepted unordered pairs, for O(1) duplicate checks
    while len(negative_pairs) < n_wanted:
        first = random.choice(drug_pool)
        second = random.choice(drug_pool)
        key = frozenset((first, second))
        if first == second or key in positive_keys or key in negative_keys:
            continue
        negative_keys.add(key)
        negative_pairs.append([first, second])
        if len(negative_pairs) % 1000 == 0:
            print(len(negative_pairs))

    def featurize(pairs):
        """Return (kept_pairs, matrix); row i of matrix describes kept_pairs[i]."""
        kept, rows = [], []
        for position, pair in enumerate(pairs):
            if position % 1000 == 0:
                print(position)
            first, second = pair[0], pair[1]
            # Best effort: drop pairs for which a drug has no feature rows.
            if any(drug not in index_dict or drug not in tar_dict for drug in (first, second)):
                continue
            # Layout: drug-1 descriptors, drug-1 targets, drug-2 descriptors,
            # drug-2 targets.  Each drug's targets are read with its own index;
            # every other cell that builds rows for `col_names` must use the
            # same layout.
            rows.append(
                descriptors.loc[index_dict[first]].drop('IDs').to_list()
                + targets.loc[tar_dict[first]].drop('IDs').to_list()
                + descriptors.loc[index_dict[second]].drop('IDs').to_list()
                + targets.loc[tar_dict[second]].drop('IDs').to_list()
            )
            kept.append(pair)
        matrix = np.array(rows) if rows else np.empty((0, len(col_names)))
        return kept, matrix

    print('constructing dataset')
    positive_pairs, positive_rows = featurize(positive_pairs)
    negative_pairs, negative_rows = featurize(negative_pairs)

    features = np.concatenate((positive_rows, negative_rows), axis=0)
    labels = [1] * len(positive_rows) + [0] * len(negative_rows)
    print('dataset shape:')
    print(features.shape)

    return pd.DataFrame(features, columns=col_names), labels, positive_pairs + negative_pairs


# Hold out 500 randomly chosen drugs; `drugspace` temporarily names the same list
# until the training drug space is derived from it.
val_drugspace = random.sample(all_drugs, 500)
drugspace = val_drugspace

val_set, val_results, val_combos = create_validation_set(val_drugspace)

In [None]:
# Training drug space: every drug that was not held out for validation,
# in the order it appears in `all_drugs`.
held_out_drugs = set(val_drugspace)
drugspace = [drug for drug in all_drugs if drug not in held_out_drugs]

In [None]:
# For every drug in the current drug space, list the drugs it is paired with.
# A pair is recorded only when both of its drugs belong to the drug space;
# the explicit membership test replaces a blanket `except:` that also hid
# unrelated errors and keyboard interrupts.
combo_dict = {drug: [] for drug in drugspace}
for pair in combos:
    first, second = pair[0], pair[1]
    if first in combo_dict and second in combo_dict:
        combo_dict[first].append(second)
        combo_dict[second].append(first)

In [None]:
# Flatten the partner lists into [drug, partner] pairs.  Each pair shows up
# twice here (once from each side); the next cell removes the duplicates.
existing_combinations = [
    [drug, partner]
    for drug in drugspace
    for partner in combo_dict[drug]
]

In [None]:
# Keep one copy of each unordered pair (frozenset ignores drug order).
existing_combinations = [list(pair) for pair in set(map(frozenset, existing_combinations))]

In [None]:
# Sample negatives: random pairs of distinct drugs that are not listed as
# partners in `combo_dict`, one negative per positive.
n_combos = []
seen_negatives = set()  # unordered pairs already accepted, for O(1) duplicate checks

# Cap the target at the number of non-combination pairs that actually exist;
# without the cap the loop can never finish when positives outnumber them
# (or when the drug space holds fewer than two drugs).
n_drugs = len(combo_dict)
known_pairs = sum(len(set(partners)) for partners in combo_dict.values()) // 2
available = n_drugs * (n_drugs - 1) // 2 - known_pairs
count = min(len(existing_combinations), max(available, 0))

while len(n_combos) < count:
    ingredient1 = random.choice(drugspace)
    ingredient2 = random.choice(drugspace)
    if ingredient1 == ingredient2:
        continue
    if ingredient2 in combo_dict[ingredient1] or ingredient1 in combo_dict[ingredient2]:
        continue
    pair_key = frozenset((ingredient1, ingredient2))
    if pair_key not in seen_negatives:
        seen_negatives.add(pair_key)
        n_combos.append([ingredient1, ingredient2])
        if len(n_combos) % 1000 == 0:
            print(len(n_combos))

In [None]:
# construct dataset

def _featurize_pairs(pairs):
    """Turn drug pairs into a 2-D feature array, one row per usable pair.

    Row layout: drug-1 descriptors, drug-1 targets, drug-2 descriptors,
    drug-2 targets (matching ``col_names``).  Pairs in which either drug has no
    row in ``descriptors`` or ``targets`` are skipped.  The position in
    ``pairs`` is printed every 1000 pairs as a progress indicator.
    """
    rows = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        first, second = pair[0], pair[1]
        # Best effort: drop pairs for which a drug has no feature rows.
        if any(drug not in index_dict or drug not in tar_dict for drug in (first, second)):
            continue
        # Each drug's targets are read with that drug's own index; every other
        # cell that builds rows for `col_names` must use the same layout.
        rows.append(
            descriptors.loc[index_dict[first]].drop('IDs').to_list()
            + targets.loc[tar_dict[first]].drop('IDs').to_list()
            + descriptors.loc[index_dict[second]].drop('IDs').to_list()
            + targets.loc[tar_dict[second]].drop('IDs').to_list()
        )
    if not rows:
        return np.empty((0, len(col_names)))
    return np.array(rows)


# Positives (label 1) come first, negatives (label 0) after them.
data = _featurize_pairs(existing_combinations)
results = [1] * len(data)

data1 = _featurize_pairs(n_combos)
results1 = [0] * len(data1)

file = np.concatenate((data, data1), axis=0)
results = results + results1
del data, data1
file.shape

In [None]:
# Attach the combined feature labels to the raw training array.
file = pd.DataFrame(data=file, columns=col_names)

In [None]:
# Augment the table with its mirror image: every row is repeated with the two
# drugs' feature blocks exchanged, and the labels are repeated to match.
names = descriptors.columns.to_list()[1:] + targets.columns.to_list()[1:]

first_drug_cols = [col for col in file.columns if col in names]
second_drug_cols = [col for col in file.columns if col not in names]

# Second-drug values relabelled as first-drug columns ...
as_first = file.drop(columns=first_drug_cols)
as_first.columns = first_drug_cols
# ... and first-drug values relabelled as second-drug columns.
as_second = file.drop(columns=second_drug_cols)
as_second.columns = second_drug_cols

mirrored = pd.concat([as_first, as_second], axis=1)
file = pd.concat([file, mirrored]).reset_index(drop=True)
del as_first, as_second, mirrored
results = results + results

In [None]:
from sklearn.model_selection import KFold

# Five shuffled folds; no random_state is set, so the split differs between runs.
kf = KFold(shuffle=True, n_splits=5)

In [None]:
from sklearn.utils import shuffle

# Shuffle features and labels with one shared permutation so rows stay aligned.
shuffled_pair = shuffle(file, results)
file, results = shuffled_pair[0], shuffled_pair[1]

In [None]:
# Plain numpy arrays so the fold index arrays from KFold can slice them directly.
file, results = np.array(file), np.array(results)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve

In [None]:
# 5 fold cross validation
# temp scaling code obtained here: https://sourajit16-02-93.medium.com/neural-network-calibration-46997f8c872c
#
# NOTE(review): within each fold the held-out split is used for early stopping,
# for fitting the temperature and for the reported metrics, so the numbers
# collected here are likely optimistic.

# Per-fold metrics, curves and artefacts, appended in fold order.
acc = []
roc = []
prc = []
prec = []
f1 = []
rec = []
spec = []
curve = []
prc_curve = []
models = {}
scalers = {}
test_sets = {}
test_results = {}
temperature = []
iteration = 0

for train, val in kf.split(file, results):
    iteration += 1
    x_train = file[train]
    # num_classes is fixed so the one-hot width is 2 even if a fold misses a class.
    y_train = to_categorical(results[train], num_classes=2)
    x_val = file[val]
    y_val = results[val]

    # Fit the scaler on the training part only, then apply it to the held-out part.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train).astype('float32')
    x_val = scaler.transform(x_val).astype('float32')

    # Two hidden layers (n_cols and n_cols / 2 units) and a 2-unit logit output.
    model = tf.keras.models.Sequential()
    n_cols = x_train.shape[1]
    model.add(tf.keras.layers.Dense(n_cols, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.2))
    model.add(tf.keras.layers.Dense(int(n_cols / 2), activation='relu'))
    model.add(Dropout(0.5))
    model.add(tf.keras.layers.Dense(2, activation='linear'))

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0005, momentum=0.5),
          loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
         metrics=[tf.keras.metrics.AUC(curve='roc'), 'accuracy', tfa.metrics.F1Score(num_classes=2)])

    y_test = to_categorical(y_val, num_classes=2)
    early_stopping_monitor = EarlyStopping(patience=3)
    model.fit(x_train, y_train, validation_data=(x_val, y_test), epochs=100, batch_size = 32, callbacks=[early_stopping_monitor])

    # Temperature scaling: learn one scalar that divides the logits so that the
    # softmax cross-entropy on the held-out split is minimised.
    temp = tf.Variable(initial_value=1.0, trainable=True)
    val_logits = model.predict(x_val)

    def compute_loss():
        scaled_logits = tf.math.divide(val_logits, temp)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.convert_to_tensor(y_test), scaled_logits))
        return loss

    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    print('Temperature Initial value: {}'.format(temp.numpy()))

    for _ in range(300):
        optimizer.minimize(compute_loss, var_list=[temp])

    print('Temperature Final value: {}'.format(temp.numpy()))

    temperature.append(temp.numpy())

    # Calibrated class probabilities; the logits from above are reused rather
    # than running the model on x_val a second time.
    predictions = tf.nn.softmax(val_logits / temp).numpy()
    positive_score = predictions[:, 1]

    # Class 0 only when it is strictly more probable; ties go to class 1.
    binary_pred = np.where(predictions[:, 0] > predictions[:, 1], 0, 1)

    acc.append(accuracy_score(y_val, binary_pred))
    # Ranking metrics take the positive-class score; feeding them hard 0/1
    # labels would evaluate a single operating point only.
    roc.append(roc_auc_score(y_val, positive_score))
    prc.append(average_precision_score(y_val, positive_score))
    f1.append(f1_score(y_val, binary_pred))
    rec.append(recall_score(y_val, binary_pred))
    prec.append(precision_score(y_val, binary_pred))
    spec.append(recall_score(y_val, binary_pred, pos_label=0))

    print(roc[-1])

    fpr, tpr, thresholds = roc_curve(y_val, positive_score)
    curve.append([fpr, tpr, thresholds])
    precision, recall, thresholds = precision_recall_curve(y_val, positive_score)
    prc_curve.append([precision, recall, thresholds])

    models[f'model{iteration}'] = model
    scalers[f'scaler{iteration}'] = scaler
    test_sets[f'set{iteration}'] = x_val
    test_results[f'set{iteration}'] = y_val

    del x_train, x_val

    # The model object stays referenced from `models`; only the loop name is dropped.
    del model

    tf.keras.backend.clear_session()

    gc.collect()

In [None]:
# Shuffle the drug list in place so the fixed-size chunks taken from it in the
# next cell are random subsets.  No seed is set, so the order differs per run.
random.shuffle(all_drugs)

In [None]:
# bootstrapping ensemble testing of one model, two model, four model, and combined
#
# For every 500-drug chunk of `all_drugs`:
#   1. the chunk is held out and turned into a validation set,
#   2. the remaining drugs give the training positives, which are used whole
#      (1 model), split in two (2 models) and split in four (4 models),
#   3. each ensemble, and the mean of the three ensembles, is scored on the
#      validation set.

import math


def _bootstrap_featurize(pairs):
    """Turn drug pairs into a 2-D feature array, one row per usable pair.

    Row layout: drug-1 descriptors, drug-1 targets, drug-2 descriptors,
    drug-2 targets (matching ``col_names``).  Pairs in which either drug has no
    row in ``descriptors`` or ``targets`` are skipped.  The position in
    ``pairs`` is printed every 1000 pairs as a progress indicator.
    """
    rows = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        first, second = pair[0], pair[1]
        # Best effort: drop pairs for which a drug has no feature rows.
        if any(drug not in index_dict or drug not in tar_dict for drug in (first, second)):
            continue
        # Each drug's targets are read with that drug's own index; every other
        # cell that builds rows for `col_names` must use the same layout.
        rows.append(
            descriptors.loc[index_dict[first]].drop('IDs').to_list()
            + targets.loc[tar_dict[first]].drop('IDs').to_list()
            + descriptors.loc[index_dict[second]].drop('IDs').to_list()
            + targets.loc[tar_dict[second]].drop('IDs').to_list()
        )
    if not rows:
        return np.empty((0, len(col_names)))
    return np.array(rows)


def _bootstrap_sample_negatives(drug_pool, partners, n_wanted):
    """Draw ``n_wanted`` distinct unordered pairs from ``drug_pool`` that are not partners.

    ``partners`` maps each drug of the pool to the drugs it is paired with.
    The request is capped at the number of non-partner pairs that exist, so
    the sampling loop always terminates.
    """
    n_drugs = len(partners)
    known_pairs = sum(len(set(linked)) for linked in partners.values()) // 2
    available = n_drugs * (n_drugs - 1) // 2 - known_pairs
    n_wanted = min(n_wanted, max(available, 0))

    negatives = []
    accepted = set()  # unordered pairs already drawn, for O(1) duplicate checks
    while len(negatives) < n_wanted:
        first = random.choice(drug_pool)
        second = random.choice(drug_pool)
        if first == second:
            continue
        if second in partners[first] or first in partners[second]:
            continue
        key = frozenset((first, second))
        if key in accepted:
            continue
        accepted.add(key)
        negatives.append([first, second])
        if len(negatives) % 1000 == 0:
            print(len(negatives))
    return negatives


def _bootstrap_fit_member(positive_pairs, negative_pairs):
    """Train one classifier and fit its softmax temperature.

    Returns ``[model, preprocessor, temperature]``: the trained Keras model,
    the fitted ColumnTransformer that must be applied to inputs first, and the
    scalar by which the model's logits are divided before the softmax.
    """
    print('constructing dataset')
    positive_rows = _bootstrap_featurize(positive_pairs)
    negative_rows = _bootstrap_featurize(negative_pairs)

    file = pd.DataFrame(np.concatenate((positive_rows, negative_rows), axis=0), columns=col_names)
    results = [1] * len(positive_rows) + [0] * len(negative_rows)
    print('dataset shape')
    print(file.shape)

    # Only the descriptor columns (of both drugs) are standardised; the
    # remaining columns are passed through unchanged and placed after them.
    scale_names = [x for x in descriptors.columns.to_list()[1:] if x in file.columns]
    scale_names = scale_names + [x + ' - 1' for x in scale_names]

    x_train, x_val, y_train, y_val = train_test_split(file, results, test_size=0.2)

    standard_transformer = Pipeline(steps=[
            ('standard', StandardScaler())])
    preprocessor = ColumnTransformer(
                remainder='passthrough', transformers=[
                    ('std', standard_transformer , scale_names),
                ])
    x_train = preprocessor.fit_transform(x_train).astype('float32')
    x_val = preprocessor.transform(x_val).astype('float32')

    # num_classes is fixed so the one-hot width is 2 even if a split misses a class.
    y_train = to_categorical(y_train, num_classes=2)
    y_test = to_categorical(y_val, num_classes=2)

    print('training model')
    model = tf.keras.models.Sequential()
    n_cols = x_train.shape[1]
    model.add(tf.keras.layers.Dense(n_cols, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.2))
    model.add(tf.keras.layers.Dense(int(n_cols / 2), activation='relu'))
    model.add(Dropout(0.5))
    model.add(tf.keras.layers.Dense(2, activation='linear'))

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0005, momentum=0.5),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
             metrics=[tf.keras.metrics.AUC(), 'accuracy', tf.keras.metrics.Precision()])

    early_stopping_monitor = EarlyStopping(patience=3)
    model.fit(x_train, y_train, validation_data=(x_val, y_test), epochs=100, batch_size = 32, callbacks=[early_stopping_monitor])

    # Temperature scaling: learn one scalar that divides the logits so that the
    # softmax cross-entropy on the 20 % split is minimised.
    temp = tf.Variable(initial_value=1.0, trainable=True)
    y_pred = model.predict(x_val)

    def compute_loss():
        y_pred_model_w_temp = tf.math.divide(y_pred, temp)
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.convert_to_tensor(y_test), y_pred_model_w_temp))
        return loss

    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    print('Temperature Initial value: {}'.format(temp.numpy()))

    for _ in range(300):
        optimizer.minimize(compute_loss, var_list=[temp])

    print('Temperature Final value: {}'.format(temp.numpy()))

    return [model, preprocessor, temp.numpy()]


def _bootstrap_ensemble_scores(members, frame):
    """Mean temperature-scaled softmax output of all ``[model, preprocessor, temperature]`` members."""
    if not members:
        raise ValueError('ensemble has no trained members')
    member_scores = [
        np.asarray(tf.nn.softmax(model.predict(preprocessor.transform(frame)) / member_temp))
        for model, preprocessor, member_temp in members
    ]
    return np.mean(np.array(member_scores), axis=0)


def _bootstrap_log_metrics(scores, truth, logs):
    """Append the metrics of one (n_rows, 2) probability array to the given result lists.

    ``logs`` is the tuple
    ``(acc, roc, prc, prec, f1, rec, spec, curve, prc_curve)`` of lists to extend.
    """
    acc_log, roc_log, prc_log, prec_log, f1_log, rec_log, spec_log, curve_log, prc_curve_log = logs

    # Class 0 only when it is strictly more probable; ties go to class 1.
    binary_pred = np.where(scores[:, 0] > scores[:, 1], 0, 1)

    acc_log.append(accuracy_score(truth, binary_pred))
    f1_log.append(f1_score(truth, binary_pred))
    rec_log.append(recall_score(truth, binary_pred))
    prec_log.append(precision_score(truth, binary_pred))
    spec_log.append(recall_score(truth, binary_pred, pos_label=0))

    fpr, tpr, thresholds = roc_curve(truth, scores[:, 1])
    curve_log.append([fpr, tpr, thresholds])
    roc_log.append(auc(fpr, tpr))
    precision, recall, thresholds = precision_recall_curve(truth, scores[:, 1])
    prc_curve_log.append([precision, recall, thresholds])
    prc_log.append(auc(recall, precision))


subsets = list(chunks(all_drugs, 500))

# Result lists, one entry per held-out subset.
# No suffix: mean of the three ensembles; 1 / 2 / 4: ensemble of that many models.
acc, roc, prc, prec, f1, rec, spec, curve, prc_curve = ([] for _ in range(9))
acc1, roc1, prc1, prec1, f11, rec1, spec1, curve1, prc_curve1 = ([] for _ in range(9))
acc2, roc2, prc2, prec2, f12, rec2, spec2, curve2, prc_curve2 = ([] for _ in range(9))
acc4, roc4, prc4, prec4, f14, rec4, spec4, curve4, prc_curve4 = ([] for _ in range(9))

logs_combined = (acc, roc, prc, prec, f1, rec, spec, curve, prc_curve)
logs_one = (acc1, roc1, prc1, prec1, f11, rec1, spec1, curve1, prc_curve1)
logs_two = (acc2, roc2, prc2, prec2, f12, rec2, spec2, curve2, prc_curve2)
logs_four = (acc4, roc4, prc4, prec4, f14, rec4, spec4, curve4, prc_curve4)

for held_out in subsets:
    val_drugspace = held_out
    val_set, val_results, val_combos = create_validation_set(val_drugspace)

    held_out_lookup = set(val_drugspace)
    drugspace = [x for x in all_drugs if x not in held_out_lookup]

    # Partner lists restricted to pairs that lie entirely in the training drug space.
    combo_dict = {drug: [] for drug in drugspace}
    for pair in combos:
        if pair[0] in combo_dict and pair[1] in combo_dict:
            combo_dict[pair[0]].append(pair[1])
            combo_dict[pair[1]].append(pair[0])

    # One entry per unordered positive pair.
    existing_combinations = [
        list(key)
        for key in {frozenset((drug, partner)) for drug in drugspace for partner in combo_dict[drug]}
    ]

    model1s = []
    model2s = []
    model4s = []

    for n_parts, members in ((1, model1s), (2, model2s), (4, model4s)):
        # Split the positives into at most n_parts chunks; each chunk trains one model.
        chunk_size = max(1, math.ceil(len(existing_combinations) / n_parts))
        for part in chunks(existing_combinations, chunk_size):
            # Both orders of every pair are presented to the model.
            positives = part + [[pair[1], pair[0]] for pair in part]

            # As many distinct negatives as positives in the chunk, also in both orders.
            negatives = _bootstrap_sample_negatives(drugspace, combo_dict, len(part))
            negatives = negatives + [[pair[1], pair[0]] for pair in negatives]

            members.append(_bootstrap_fit_member(positives, negatives))

    # Score every trained member, however many chunks the split produced.
    pred1 = _bootstrap_ensemble_scores(model1s, val_set)
    pred2 = _bootstrap_ensemble_scores(model2s, val_set)
    pred4 = _bootstrap_ensemble_scores(model4s, val_set)
    predictions = np.mean(np.array([pred1, pred2, pred4]), axis=0)

    y_val = val_results

    _bootstrap_log_metrics(predictions, y_val, logs_combined)
    _bootstrap_log_metrics(pred1, y_val, logs_one)
    _bootstrap_log_metrics(pred2, y_val, logs_two)
    _bootstrap_log_metrics(pred4, y_val, logs_four)

In [None]:
# train final models
#
# Fits three families of temperature-calibrated classifiers on the known drug
# combinations and stores each as [keras model, fitted ColumnTransformer, temperature]:
#   model1s - a single model trained on every known pair,
#   model2s - one model per chunk when the known pairs are split in 2,
#   model4s - one model per chunk when the known pairs are split in 4.
#
# NOTE(review): any other cell that builds feature rows for these models must use the
# same per-drug descriptor/target lookup as `_final_feature_rows`, otherwise the
# features fed to the models will not match what they were trained on.

import math  # used for the chunk size below; imported here so the cell runs on its own


def _final_sample_negative_pairs(drug_pool, known_partners, n_pairs):
    """Randomly draw ``n_pairs`` distinct unordered drug pairs that are not known combinations.

    Parameters
    ----------
    drug_pool : sequence of drug IDs to sample from (must support ``random.choice``).
    known_partners : dict mapping a drug ID to the list of its known partners.
    n_pairs : number of pairs to draw.

    Returns
    -------
    list of ``[drug_a, drug_b]`` lists, in sampling order.

    Raises
    ------
    ValueError
        If fewer than ``n_pairs`` non-combination pairs exist (the sampling loop
        could otherwise never terminate).
    """
    known_pairs = {frozenset((drug, partner))
                   for drug, partners in known_partners.items()
                   for partner in partners}
    n_drugs = len(set(drug_pool))
    n_available = n_drugs * (n_drugs - 1) // 2 - sum(len(pair) == 2 for pair in known_pairs)
    if n_pairs > n_available:
        raise ValueError('requested {} negative pairs but only {} non-combination pairs exist'
                         .format(n_pairs, n_available))

    sampled = []
    seen = set()  # set lookup instead of scanning the growing list for every draw
    while len(sampled) < n_pairs:
        first = random.choice(drug_pool)
        second = random.choice(drug_pool)
        pair = frozenset((first, second))
        if first == second or pair in known_pairs or pair in seen:
            continue
        seen.add(pair)
        sampled.append([first, second])
        if len(sampled) % 1000 == 0:
            print(len(sampled))
    return sampled


def _final_feature_rows(pairs):
    """Build one feature row per drug pair, skipping pairs with a missing lookup.

    A row is ``descriptors(drug 1) + targets(drug 1) + descriptors(drug 2) + targets(drug 2)``,
    where each part is the matching row of the ``descriptors`` / ``targets`` frame
    (located via ``index_dict`` / ``tar_dict``) with the ``'IDs'`` entry dropped.
    """
    rows = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        try:
            features = []
            for drug in (pair[0], pair[1]):
                # Each drug uses its OWN descriptor and target row.
                features += descriptors.loc[index_dict[drug]].drop('IDs').to_list()
                features += targets.loc[tar_dict[drug]].drop('IDs').to_list()
        except LookupError:
            # Drug without descriptor/target data: leave the pair out. Nothing is
            # removed from `pairs` while it is being iterated.
            continue
        rows.append(features)
    return rows


def _final_build_dataset(positive_pairs, negative_pairs):
    """Return ``(features, labels)``: a DataFrame with ``col_names`` columns and a 1/0 label list."""
    positive_rows = _final_feature_rows(positive_pairs)
    negative_rows = _final_feature_rows(negative_pairs)
    all_rows = positive_rows + negative_rows
    matrix = np.array(all_rows) if all_rows else np.empty((0, len(col_names)))
    features = pd.DataFrame(matrix, columns=col_names)
    labels = [1] * len(positive_rows) + [0] * len(negative_rows)
    return features, labels


def _final_train_calibrated_model(features, labels):
    """Train the two-logit network on an 80/20 split and fit a calibration temperature.

    Returns ``[model, preprocessor, temperature]`` where ``preprocessor`` is the fitted
    ColumnTransformer (standard-scales the descriptor columns of both drugs, passes the
    rest through) and ``temperature`` is the scalar that minimises softmax cross-entropy
    of ``logits / temperature`` on the held-out 20%.

    NOTE(review): every pair is present in both drug orders before the random split,
    so a pair can land in the training part while its mirror lands in the held-out part.
    """
    # Descriptor columns (everything after the first descriptors column) for drug 1,
    # plus the ' - 1'-suffixed names for the second drug.
    descriptor_cols = [name for name in descriptors.columns.to_list()[1:] if name in features.columns]
    scale_names = descriptor_cols + [name + ' - 1' for name in descriptor_cols]

    x_train, x_val, y_train, y_val = train_test_split(features, labels, test_size=0.2)

    preprocessor = ColumnTransformer(
        remainder='passthrough',
        transformers=[('std', Pipeline(steps=[('standard', StandardScaler())]), scale_names)])
    x_train = preprocessor.fit_transform(x_train).astype('float32')
    x_val = preprocessor.transform(x_val)

    # num_classes is pinned so a split that happens to contain one class still yields two columns.
    y_train = to_categorical(y_train, num_classes=2)
    y_val_onehot = to_categorical(y_val, num_classes=2)

    print('training model')
    n_cols = x_train.shape[1]
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(n_cols, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.2))
    model.add(tf.keras.layers.Dense(int(n_cols / 2), activation='relu'))
    model.add(Dropout(0.5))
    model.add(tf.keras.layers.Dense(2, activation='linear'))  # raw logits; softmax lives in the loss

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0005, momentum=0.5),
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=[tf.keras.metrics.AUC(), 'accuracy', tf.keras.metrics.Precision()])

    model.fit(x_train, y_train, validation_data=(x_val, y_val_onehot), epochs=100,
              batch_size=32, callbacks=[EarlyStopping(patience=3)])

    # Temperature scaling: only `temperature` is optimised, the network stays fixed.
    temperature = tf.Variable(initial_value=1.0, trainable=True)
    val_logits = model.predict(x_val)
    val_labels = tf.convert_to_tensor(y_val_onehot)

    def compute_loss():
        scaled_logits = tf.math.divide(val_logits, temperature)
        return tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(val_labels, scaled_logits))

    optimizer = tf.optimizers.Adam(learning_rate=0.01)

    print('Temperature Initial value: {}'.format(temperature.numpy()))
    for _ in range(300):
        optimizer.minimize(compute_loss, var_list=[temperature])
    print('Temperature Final value: {}'.format(temperature.numpy()))

    return [model, preprocessor, temperature.numpy()]


drugspace = all_drugs
model1s = []
model2s = []
model4s = []

# Partner lookup: drug -> drugs it appears with in `combos` (only pairs fully inside drugspace).
combo_dict = {drug: [] for drug in drugspace}
for pair in combos:
    if pair[0] in combo_dict and pair[1] in combo_dict:
        combo_dict[pair[0]].append(pair[1])
        combo_dict[pair[1]].append(pair[0])

# Every known combination exactly once, as an unordered pair.
existing_combinations = [
    list(pair)
    for pair in {frozenset((drug, partner)) for drug in drugspace for partner in combo_dict[drug]}
]

for n_splits, trained_models in ((1, model1s), (2, model2s), (4, model4s)):
    if n_splits == 1:
        subsets = [existing_combinations]
    else:
        # `chunks` is defined elsewhere in the notebook; presumably it yields consecutive
        # slices of at most `chunk_size` pairs - TODO confirm.
        chunk_size = math.ceil(len(existing_combinations) / n_splits)
        subsets = list(chunks(existing_combinations, int(chunk_size)))

    for subset in subsets:
        # Each pair is used in both drug orders: the model input is order-dependent.
        positive_pairs = list(subset) + [[pair[1], pair[0]] for pair in subset]

        # One sampled non-combination per known pair, then mirrored like the positives.
        sampled_pairs = _final_sample_negative_pairs(drugspace, combo_dict, len(subset))
        negative_pairs = sampled_pairs + [[pair[1], pair[0]] for pair in sampled_pairs]

        print('constructing dataset')
        features, labels = _final_build_dataset(positive_pairs, negative_pairs)
        print('dataset shape')
        print(features.shape)

        trained_models.append(_final_train_calibrated_model(features, labels))

In [None]:
def _val_sample_negative_pairs(drug_pool, known_partners, n_pairs):
    """Randomly draw ``n_pairs`` distinct unordered drug pairs that are not known combinations.

    ``drug_pool`` is the sequence sampled from, ``known_partners`` maps a drug ID to the
    list of its known partners. Returns a list of ``[drug_a, drug_b]`` lists.

    Raises ValueError when fewer than ``n_pairs`` non-combination pairs exist, because
    the sampling loop could otherwise never terminate.
    """
    known_pairs = {frozenset((drug, partner))
                   for drug, partners in known_partners.items()
                   for partner in partners}
    n_drugs = len(set(drug_pool))
    n_available = n_drugs * (n_drugs - 1) // 2 - sum(len(pair) == 2 for pair in known_pairs)
    if n_pairs > n_available:
        raise ValueError('requested {} negative pairs but only {} non-combination pairs exist'
                         .format(n_pairs, n_available))

    sampled = []
    seen = set()  # set lookup instead of scanning the growing list for every draw
    while len(sampled) < n_pairs:
        first = random.choice(drug_pool)
        second = random.choice(drug_pool)
        pair = frozenset((first, second))
        if first == second or pair in known_pairs or pair in seen:
            continue
        seen.add(pair)
        sampled.append([first, second])
        if len(sampled) % 1000 == 0:
            print(len(sampled))
    return sampled


def _val_feature_rows(pairs):
    """Build feature rows for ``pairs``; return ``(rows, kept_pairs)`` aligned index by index.

    A row is ``descriptors(drug 1) + targets(drug 1) + descriptors(drug 2) + targets(drug 2)``,
    each part being the matching ``descriptors`` / ``targets`` row (found through
    ``index_dict`` / ``tar_dict``) without its ``'IDs'`` entry. Pairs for which a lookup
    fails are left out of both returned lists.
    """
    rows = []
    kept_pairs = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        try:
            features = []
            for drug in (pair[0], pair[1]):
                # Each drug uses its OWN descriptor and target row.
                features += descriptors.loc[index_dict[drug]].drop('IDs').to_list()
                features += targets.loc[tar_dict[drug]].drop('IDs').to_list()
        except LookupError:
            # Drug without descriptor/target data: drop the pair. The input list is not
            # modified while being iterated, so no neighbouring pair is skipped.
            continue
        rows.append(features)
        kept_pairs.append(pair)
    return rows, kept_pairs


def create_validation_set(drugspace):
    """Build a labelled validation set from the known combinations inside ``drugspace``.

    Positives are the pairs from the notebook-level ``combos`` whose two drugs are both
    in ``drugspace`` (each unordered pair once). An equal number of negatives is sampled
    at random from pairs that are not known combinations. Pairs for which descriptor or
    target data cannot be looked up are dropped.

    NOTE(review): any other cell that builds feature rows for the models evaluated on
    this set must use the same per-drug descriptor/target lookup, otherwise the features
    will not match what those models were trained on.

    Parameters
    ----------
    drugspace : sequence of drug IDs (must support ``random.choice``).

    Returns
    -------
    val_set : pandas.DataFrame with columns ``col_names``, positives first, then negatives.
    val_results : list of labels (1 = known combination, 0 = sampled negative), one per row.
    val_combos : list of ``[drug_1, drug_2]``; entry ``k`` is the pair behind row ``k``.

    Raises
    ------
    ValueError
        If ``drugspace`` does not contain enough non-combination pairs to sample from.
    """
    # Partner lookup restricted to drugspace.
    combo_dict = {drug: [] for drug in drugspace}
    for pair in combos:
        if pair[0] in combo_dict and pair[1] in combo_dict:
            combo_dict[pair[0]].append(pair[1])
            combo_dict[pair[1]].append(pair[0])

    print('setting positive combos')
    existing_combinations = [
        list(pair)
        for pair in {frozenset((drug, partner)) for drug in drugspace for partner in combo_dict[drug]}
    ]

    n_combos = _val_sample_negative_pairs(drugspace, combo_dict, len(existing_combinations))

    print('constructing dataset')
    positive_rows, existing_combinations = _val_feature_rows(existing_combinations)
    negative_rows, n_combos = _val_feature_rows(n_combos)

    all_rows = positive_rows + negative_rows
    # Keep a 2-D shape even when every pair was dropped.
    file = np.array(all_rows) if all_rows else np.empty((0, len(col_names)))
    print('dataset shape:')
    print(file.shape)

    val_set = pd.DataFrame(file, columns=col_names)
    val_results = [1] * len(positive_rows) + [0] * len(negative_rows)
    val_combos = existing_combinations + n_combos

    return val_set, val_results, val_combos