In [None]:
import ast
import gc
import itertools
import random
import time
from collections import Counter

import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from IPython.utils import io
from keras.callbacks import EarlyStopping
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the table of known ingredient pairs; a later cell reads its 'Drug 1' and 'Drug 2' columns.

df_combos = pd.read_csv('herb synergy combos.csv')

In [None]:
# Herb table; a later cell reads its 'id' and 'Ingredients' columns.
df_herbs = pd.read_csv('updated herbs.csv')

In [None]:
# Ingredient table; a later cell reads its 'IDs' and 'Ingredient_Smile' columns.
df_ingredients = pd.read_csv('cancer ingredients.csv')

In [None]:
# Load the per-ingredient target matrix and keep only the feature columns that
# pass a VarianceThreshold of 0.02. The 'IDs' key column is always retained.
targets = pd.read_csv('herb targets.csv')
target_features = targets.drop(columns='IDs')

# Only the fitted selector is needed; the transformed matrix is not used.
var = VarianceThreshold(0.02)
var.fit(target_features)
cols = var.get_feature_names_out(input_features=target_features.columns)

# Set membership keeps the filter linear in the number of columns.
kept_target_cols = set(cols)
drop = [c for c in target_features.columns if c not in kept_target_cols]
targets = targets.drop(columns=drop)

In [None]:
# Load the per-ingredient descriptor / fingerprint matrix and keep only the
# feature columns that pass a VarianceThreshold of 0.02. The 'IDs' key column
# is always retained.
descriptors = pd.read_csv('cancer ingredients mordred descriptors + morgan fp dropped columns.csv')
descriptor_features = descriptors.drop(columns='IDs')

# Only the fitted selector is needed; the transformed matrix is not used.
var = VarianceThreshold(0.02)
var.fit(descriptor_features)
cols = var.get_feature_names_out(input_features=descriptor_features.columns)

# Set membership keeps the filter linear in the number of columns.
kept_descriptor_cols = set(cols)
drop = [c for c in descriptor_features.columns if c not in kept_descriptor_cols]
descriptors = descriptors.drop(columns=drop)

In [None]:
# Feature names of one pair row: every column after the first of `descriptors`
# and of `targets` for the first ingredient, then the same names suffixed with
# ' - 1' for the second ingredient.
base_names = descriptors.columns.to_list()[1:] + targets.columns.to_list()[1:]
col_names = base_names + [name + ' - 1' for name in base_names]

In [None]:
# Every value of the 'IDs' column of `descriptors`, in row order.
all_ingredients = [descriptors.at[pos, 'IDs'] for pos in range(len(descriptors))]

In [None]:
# One two-element list [Drug 1, Drug 2] per row of the combination table.
combos = df_combos.apply(
    lambda pair_row: [pair_row['Drug 1'], pair_row['Drug 2']],
    axis=1,
).tolist()

In [None]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of length ``n``.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [None]:
# Herb id -> list of ingredient names, obtained by splitting the
# 'Ingredients' cell on ', '.
herb_dict = {
    df_herbs.at[pos, 'id']: df_herbs.at[pos, 'Ingredients'].split(', ')
    for pos in range(len(df_herbs))
}

In [None]:
# Ingredient ID -> value of its 'Ingredient_Smile' cell.
ingredient_dict = {
    df_ingredients.at[pos, 'IDs']: df_ingredients.at[pos, 'Ingredient_Smile']
    for pos in range(len(df_ingredients))
}

In [None]:
# Ingredient ID -> row label in `descriptors` / `targets`, so that feature rows
# can be fetched with `.loc` without scanning the 'IDs' column.
index_dict = {descriptors.at[pos, 'IDs']: pos for pos in range(len(descriptors))}

tar_dict = {targets.at[pos, 'IDs']: pos for pos in range(len(targets))}

In [None]:
# set validation dataset
#
# Samples 500 ingredients, collects the known (positive) pairs among them,
# draws an equal number of pairs that are not known combinations (negatives),
# and builds one feature row per pair: descriptors + targets of the first
# ingredient followed by descriptors + targets of the second.
# NOTE: `random` is not seeded in this cell, so the sample differs between runs.

val_drugspace = random.sample(all_ingredients, 500)
drugspace = val_drugspace

# Map every sampled ingredient to its known partners inside the sample. Pairs
# with an ingredient outside the sample are ignored.
combo_dict = {ingredient: [] for ingredient in drugspace}
for pair in combos:
    if pair[0] in combo_dict and pair[1] in combo_dict:
        combo_dict[pair[0]].append(pair[1])
        combo_dict[pair[1]].append(pair[0])

print('setting positive combos')
existing_combinations = [
    [ingredient, partner]
    for ingredient in drugspace
    for partner in combo_dict[ingredient]
]
# De-duplicate unordered pairs. A self-pair collapses to a one-element list
# and is discarded when the rows are built below.
existing_combinations = list(map(list, set(map(frozenset, existing_combinations))))

# Ingredients without any partner in the sample are not used for negatives.
drugspace = [x for x in drugspace if combo_dict[x]]

print('setting negative combos')
# NOTE: this loop does not terminate if fewer than `count` distinct unknown
# pairs exist in `drugspace`.
n_combos = []
seen_negative = set()  # unordered pairs already drawn, for O(1) duplicate checks
count = len(existing_combinations)
while len(n_combos) < count:
    ingredient1 = random.choice(drugspace)
    ingredient2 = random.choice(drugspace)
    if ingredient1 == ingredient2:
        continue
    if ingredient2 in combo_dict[ingredient1] or ingredient1 in combo_dict[ingredient2]:
        continue
    pair_key = frozenset((ingredient1, ingredient2))
    if pair_key not in seen_negative:
        seen_negative.add(pair_key)
        n_combos.append([ingredient1, ingredient2])
        if len(n_combos) % 1000 == 0:
            print(len(n_combos))

print('constructing dataset')
feature_tables = []
kept_pairs = []
for pairs in (existing_combinations, n_combos):
    rows = []
    kept = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        try:
            row = (
                descriptors.loc[index_dict[pair[0]]].drop('IDs').to_list()
                + targets.loc[tar_dict[pair[0]]].drop('IDs').to_list()
                + descriptors.loc[index_dict[pair[1]]].drop('IDs').to_list()
                # The second ingredient's own target row (previously the first
                # ingredient's targets were repeated here).
                + targets.loc[tar_dict[pair[1]]].drop('IDs').to_list()
            )
        except (KeyError, IndexError):
            # Ingredient missing from a feature table, or a collapsed self-pair:
            # skip it. Building `kept` instead of deleting from `pairs` while
            # iterating keeps every pair aligned with its row.
            continue
        rows.append(row)
        kept.append(pair)
    feature_tables.append(np.array(rows))
    kept_pairs.append(kept)

data, data1 = feature_tables
existing_combinations, n_combos = kept_pairs

# Label 1 for known combinations, 0 for sampled negatives.
results = [1] * len(data) + [0] * len(data1)

file = np.concatenate((data, data1), axis=0)
del data, data1, feature_tables
print('dataset shape:')
print(file.shape)

val_set = pd.DataFrame(file, columns=col_names)
val_results = results
val_combos = existing_combinations + n_combos

In [None]:
# conduct ensemble learning
#
# Samples 15000 ingredients, splits them into ten chunks of 1500 and, for each
# processed chunk, builds a balanced positive/negative pair dataset, augments
# it with the ingredient order swapped, fits a scaler and a small dense
# network, and saves model, scaler and the held-out split to disk.
# Only the first two chunks are processed (`fullspace[:2]`).
# NOTE: `random` is not seeded in this cell, so chunks differ between runs.

fullspace = random.sample(all_ingredients, 1500 * 10)
fullspace = list(chunks(fullspace, 1500))
accuracies = []
rocs = []
models = {}
scalers = {}
end_drugspaces = []
counters = []
test_sets = {}
validation_pred= {}
validation_y = {}

# Feature names of a single ingredient; identical for every iteration.
names = descriptors.columns.to_list()[1:] + targets.columns.to_list()[1:]
name_set = set(names)

for index, chunk in enumerate(fullspace[:2]):
    print(f'iteration: {index + 1}')
    drugspace = chunk

    # Map every ingredient of the chunk to its known partners inside the chunk.
    combo_dict = {ingredient: [] for ingredient in drugspace}
    for pair in combos:
        if pair[0] in combo_dict and pair[1] in combo_dict:
            combo_dict[pair[0]].append(pair[1])
            combo_dict[pair[1]].append(pair[0])

    print('setting positive combos')
    existing_combinations = [
        [ingredient, partner]
        for ingredient in drugspace
        for partner in combo_dict[ingredient]
    ]
    # De-duplicate unordered pairs. A self-pair collapses to a one-element
    # list and is discarded when the rows are built below.
    existing_combinations = list(map(list, set(map(frozenset, existing_combinations))))

    # Ingredients without any partner in the chunk are not used for negatives.
    drugspace = [x for x in drugspace if combo_dict[x]]

    end_drugspaces.append(drugspace)

    # Number of known partners per remaining ingredient.
    counters.append(Counter({
        ingredient: len(partners)
        for ingredient, partners in combo_dict.items()
        if partners
    }))

    print('setting negative combos')
    # NOTE: this loop does not terminate if fewer than `count` distinct
    # unknown pairs exist in `drugspace`.
    n_combos = []
    seen_negative = set()  # unordered pairs already drawn
    count = len(existing_combinations)
    while len(n_combos) < count:
        ingredient1 = random.choice(drugspace)
        ingredient2 = random.choice(drugspace)
        if ingredient1 == ingredient2:
            continue
        if ingredient2 in combo_dict[ingredient1] or ingredient1 in combo_dict[ingredient2]:
            continue
        pair_key = frozenset((ingredient1, ingredient2))
        if pair_key not in seen_negative:
            seen_negative.add(pair_key)
            n_combos.append([ingredient1, ingredient2])
            if len(n_combos) % 1000 == 0:
                print(len(n_combos))

    print('constructing dataset')
    feature_tables = []
    kept_pairs = []
    for pairs in (existing_combinations, n_combos):
        rows = []
        kept = []
        for position, pair in enumerate(pairs):
            if position % 1000 == 0:
                print(position)
            try:
                row = (
                    descriptors.loc[index_dict[pair[0]]].drop('IDs').to_list()
                    + targets.loc[tar_dict[pair[0]]].drop('IDs').to_list()
                    + descriptors.loc[index_dict[pair[1]]].drop('IDs').to_list()
                    # The second ingredient's own target row (previously the
                    # first ingredient's targets were repeated here).
                    + targets.loc[tar_dict[pair[1]]].drop('IDs').to_list()
                )
            except (KeyError, IndexError):
                # Ingredient missing from a feature table, or a collapsed
                # self-pair: skip it without mutating the list being iterated.
                continue
            rows.append(row)
            kept.append(pair)
        feature_tables.append(np.array(rows))
        kept_pairs.append(kept)

    data, data1 = feature_tables
    existing_combinations, n_combos = kept_pairs

    # Label 1 for known combinations, 0 for sampled negatives.
    results = [1] * len(data) + [0] * len(data1)

    file = np.concatenate((data, data1), axis=0)
    del data, data1, feature_tables
    print('dataset shape')
    print(file.shape)

    file = pd.DataFrame(file, columns=col_names)

    # Augment with the ingredient order swapped: the ' - 1' columns are renamed
    # to the plain names and vice versa, then appended as extra rows.
    first_cols = [x for x in file.columns if x in name_set]
    second_cols = [x for x in file.columns if x not in name_set]
    temp1 = file.drop(columns=first_cols)
    temp1.columns = first_cols
    temp2 = file.drop(columns=second_cols)
    temp2.columns = second_cols

    file = pd.concat([file, pd.concat([temp1, temp2], axis=1)]).reset_index(drop=True)
    del temp1, temp2
    results = results * 2  # swapped rows carry the same labels

    # Only descriptor columns (both ingredients) are standardised; target
    # columns are passed through unchanged.
    scale_names = [x for x in descriptors.columns.to_list()[1:] if x in file.columns]
    scale_names = scale_names + [x + ' - 1' for x in scale_names]

    x_train, y_train = shuffle(file, results)
    del file

    scaler = StandardScaler()

    standard_transformer = Pipeline(steps=[
            ('standard', scaler)])

    preprocessor = ColumnTransformer(
            remainder='passthrough',
            transformers=[('std', standard_transformer , scale_names),])
    # The preprocessor is fitted on all rows, before the train/validation split.
    x_train = preprocessor.fit_transform(x_train)

    scalers['scaler' + str(index)] = preprocessor

    x_train = x_train.astype('float32')
    y_train = to_categorical(y_train)

    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

    print('training model')
    model = tf.keras.models.Sequential()
    n_cols = x_train.shape[1]
    model.add(tf.keras.layers.Dense(n_cols, activation='relu', input_shape=(n_cols,)))
    model.add(Dropout(0.2))
    model.add(tf.keras.layers.Dense(int(n_cols / 2), activation='relu'))
    model.add(Dropout(0.5))
    # model.add(tf.keras.layers.Dense(2, activation='linear'))
    model.add(tf.keras.layers.Dense(2, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.0005, momentum=0.5),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
             metrics=[tf.keras.metrics.AUC(), 'accuracy', tf.keras.metrics.Precision()])

    early_stopping_monitor = EarlyStopping(patience=3)
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=100, batch_size = 32, callbacks=[early_stopping_monitor])

    print('iteration finished')
    model.save(f'herb ensemble models/individual target models/softmax models/model{str(index)}.h5')
    joblib.dump(preprocessor, f'herb ensemble models/individual target models/softmax models/scaler{str(index)}.save')

    # ColumnTransformer emits the scaled columns first and the passthrough
    # columns afterwards, so the transformed matrix is no longer in `col_names`
    # order; label it with the order it actually has.
    scaled_names = set(scale_names)
    transformed_columns = scale_names + [c for c in col_names if c not in scaled_names]
    x_val = pd.DataFrame(x_val, columns=transformed_columns)

    # Convert the one-hot labels back to 0/1 for the saved hold-out split.
    y = [0 if one_hot[0] == 1 else 1 for one_hot in y_val]
    x_val.insert(0, 'Results', y)
    x_val.to_csv(f'herb ensemble models/individual target models/softmax models/set{str(index)}.csv', index=False)

    del model

    tf.keras.backend.clear_session()

    gc.collect()

In [None]:
# temp scaling code obtained here: https://sourajit16-02-93.medium.com/neural-network-calibration-46997f8c872c
#
# Fits one scalar `temp` by minimising the softmax cross-entropy between the
# one-hot encoded labels `Y` and `y_pred / temp`.
# NOTE(review): `y_pred` and `Y` are not defined in any cell visible here.
# `softmax_cross_entropy_with_logits` expects un-normalised logits, whereas the
# network built in the training cell ends in a softmax layer - confirm which
# model's outputs are meant to be calibrated.

temp = tf.Variable(initial_value=1.0, trainable=True)

def compute_loss():
    """Return the mean cross-entropy of `Y` against `y_pred` divided by `temp`."""
    y_pred_model_w_temp = tf.math.divide(y_pred, temp)
    # `to_categorical` is imported at the top of the notebook; the bare name
    # `keras` is never bound in this namespace.
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(tf.convert_to_tensor(to_categorical(Y)), y_pred_model_w_temp))
    return loss

optimizer = tf.optimizers.Adam(learning_rate=0.01)

print('Temperature Initial value: {}'.format(temp.numpy()))

# One optimiser step per prediction row.
for step in range(len(y_pred)):
    opts = optimizer.minimize(compute_loss, var_list=[temp])

print('Temperature Final value: {}'.format(temp.numpy()))

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, precision_score, recall_score, roc_curve, auc, precision_recall_curve

In [None]:
# evaluate on test sets from saved models
#
# For each of the ten saved models: load its test split, apply the model,
# rescale the outputs with that model's temperature and record the metrics.
# NOTE(review): `variables` (indexed per model as a temperature) is not defined
# in any cell visible here. The test sets are read from '.../test sets/' while
# the training cell writes to '.../softmax models/' - confirm the paths.

acc = []
roc = []
prc = []
prec = []
f1 = []
rec = []
spec = []
curve = []
prc_curve = []

for model_idx in range(0, 10):
    test_set = pd.read_csv(f'herb ensemble models/individual target models/test sets/set{model_idx}.csv')
    model = tf.keras.models.load_model(f'herb ensemble models/individual target models/model{model_idx}.h5')
    # The saved features are fed to the model as-is, so no scaler is looked up
    # here (the former `scalers[i]` lookup was unused).
    x = test_set.drop(columns='Results')
    y = test_set['Results'].to_list()
    temperature = variables[model_idx]

    logits = model.predict(x)
    # Materialise as a NumPy array for slicing and the sklearn metrics.
    predictions = tf.nn.softmax(logits / temperature).numpy()

    y_val = y

    # Class 1 is predicted when its probability reaches 0.5.
    binary_pred = [1 if row[1] >= 0.5 else 0 for row in predictions]

    acc.append(accuracy_score(y_val, binary_pred))
    f1.append(f1_score(y_val, binary_pred))
    rec.append(recall_score(y_val, binary_pred))
    prec.append(precision_score(y_val, binary_pred))
    # Specificity is the recall of the negative class.
    spec.append(recall_score(y_val, binary_pred, pos_label=0))

    fpr, tpr, thresholds = roc_curve(y_val, predictions[:,1])
    curve.append([fpr, tpr, thresholds])
    roc.append(auc(fpr, tpr))
    precision, recall, thresholds = precision_recall_curve(y_val, predictions[:,1])
    prc_curve.append([precision, recall, thresholds])
    prc.append(auc(recall, precision))

    # Release the model before loading the next one.
    del model

    tf.keras.backend.clear_session()

    gc.collect()

In [None]:
# Similarity-based weight adjustment (SBWA)

In [None]:
# Flatten all chunks into one ingredient list and compute one Morgan
# fingerprint (radius 3, 2048 bits) per ingredient. `mol2[k]` belongs to
# `combined_drugspace[k]`.
combined_drugspace = list(itertools.chain.from_iterable(fullspace))

drugspace_smi = [ingredient_dict[ingredient] for ingredient in combined_drugspace]

mol2 = [
    AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(smiles), 3, nBits=2048)
    for smiles in drugspace_smi
]

In [None]:
# For every chunk of `fullspace`: the partner map restricted to that chunk and
# the total number of partner entries in it (each known pair contributes two,
# one per direction).
combo_dict_list = []
dataset_sizes = []
for chunk_space in fullspace:
    chunk_partners = {ingredient: [] for ingredient in chunk_space}
    for pair in combos:
        # Explicit membership test instead of a bare `except:`; pairs with an
        # ingredient outside the chunk are ignored.
        if pair[0] in chunk_partners and pair[1] in chunk_partners:
            chunk_partners[pair[0]].append(pair[1])
            chunk_partners[pair[1]].append(pair[0])
    combo_dict_list.append(chunk_partners)
    dataset_sizes.append(sum(len(partners) for partners in chunk_partners.values()))

In [None]:
# evaluate on unseen drugs, sbwa vs. equal weights
#
# For each held-out model index: rebuild a validation set from that model's
# drugspace, predict with every other model, and score two ensembles -
# similarity-weighted (acc, roc, ...) and equally weighted (acc1, roc1, ...).
# NOTE(review): `df_models`, `variables` and integer-indexable `scalers` are
# not defined in any cell visible here, and the three create_* helpers are
# defined in cells below this one - confirm the intended run order.

acc = []
roc = []
prc = []
prec = []
f1 = []
rec = []
spec = []
curve = []
prc_curve = []

acc1 = []
roc1 = []
prc1 = []
prec1 = []
f11 = []
rec1 = []
spec1 = []
curve1 = []
prc_curve1 = []

n_models = 15
# Models are loaded in two batches and released after each batch.
model_batches = [range(0, 8), range(8, n_models)]

for val_index in range(n_models):
    print(f'iteration: {val_index + 1} / {n_models}')

    drugspace = ast.literal_eval(df_models.loc[val_index, 'End Drugspace'])
    val_set, val_results, val_combos = create_validation_set(drugspace)
    similarity_dict = create_similarity_dict(drugspace)
    similarity_list = create_similarity_list(val_combos, similarity_dict, val_index)

    all_predictions = []
    for batch in model_batches:
        models = {
            model_idx: tf.keras.models.load_model(f'herb ensemble models/individual descriptor models/model{model_idx}.h5')
            for model_idx in batch
        }
        for model_idx in batch:
            # The model belonging to the held-out drugspace does not vote.
            if model_idx == val_index:
                continue
            temp_pred = tf.nn.softmax(models[model_idx].predict(scalers[model_idx].transform(val_set)) / variables[model_idx])
            all_predictions.append(temp_pred)

        del models

        tf.keras.backend.clear_session()

        gc.collect()

    # Convert once; indexed below as [model, sample, class].
    stacked = np.array(all_predictions)

    # Similarity-weighted ensemble: one weight vector per validation pair.
    predictions = []
    for sample_idx in range(0, len(val_set)):
        if sample_idx % 1000 == 0:
            print(sample_idx)
        weights = [x[1] for x in similarity_list[sample_idx]]
        predictions.append(np.average(stacked[:, sample_idx], axis=0, weights=weights))

    del val_set

    weighted_predictions = np.array(predictions)
    equal_predictions = np.mean(stacked, axis=0)

    y_val = val_results

    metric_stores = (
        (weighted_predictions, acc, f1, rec, prec, spec, curve, roc, prc_curve, prc),
        (equal_predictions, acc1, f11, rec1, prec1, spec1, curve1, roc1, prc_curve1, prc1),
    )
    for (predictions, acc_store, f1_store, rec_store, prec_store, spec_store,
         curve_store, roc_store, prc_curve_store, prc_store) in metric_stores:
        # Class 0 only when its probability is strictly larger.
        binary_pred = [0 if probs[0] > probs[1] else 1 for probs in predictions]

        acc_store.append(accuracy_score(y_val, binary_pred))
        f1_store.append(f1_score(y_val, binary_pred))
        rec_store.append(recall_score(y_val, binary_pred))
        prec_store.append(precision_score(y_val, binary_pred))
        # Specificity is the recall of the negative class.
        spec_store.append(recall_score(y_val, binary_pred, pos_label=0))

        fpr, tpr, thresholds = roc_curve(y_val, predictions[:,1])
        curve_store.append([fpr, tpr, thresholds])
        roc_store.append(auc(fpr, tpr))
        precision, recall, thresholds = precision_recall_curve(y_val, predictions[:,1])
        prc_curve_store.append([precision, recall, thresholds])
        prc_store.append(auc(recall, precision))

        print(f'accuracy is {accuracy_score(y_val, binary_pred)}, precision is {precision_score(y_val, binary_pred)}, recall is {recall_score(y_val, binary_pred)}')

In [None]:
# SBWA helper functions

def create_similarity_dict(val_drugspace, threshold=0.70):
    """Find, for every drug, the training ingredients that resemble it.

    Each drug's SMILES (from ``ingredient_dict``) is turned into a Morgan
    fingerprint (radius 3, 2048 bits) and compared by Tanimoto similarity
    against the precomputed fingerprints in ``mol2``, which are positionally
    aligned with ``combined_drugspace``.

    Parameters
    ----------
    val_drugspace : iterable
        Ingredient IDs to look up; each must be a key of ``ingredient_dict``.
    threshold : float, default 0.70
        Minimum Tanimoto similarity for an ingredient to be kept.

    Returns
    -------
    dict
        ``{drug: [[ingredient_id, similarity], ...]}`` with each list sorted by
        descending similarity. The list is empty when nothing reaches
        ``threshold``.

    Raises
    ------
    KeyError
        If a drug is not present in ``ingredient_dict``.
    """
    similarity_dict = {}
    for drug in val_drugspace:
        query_mol = Chem.MolFromSmiles(ingredient_dict[drug])
        query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 3, nBits=2048)
        scores = DataStructs.BulkTanimotoSimilarity(query_fp, mol2)

        # Filter first, then sort: the sort is stable, so the result matches
        # sorting everything and cutting at the first score below threshold.
        similar_molecules = [
            [combined_drugspace[pos], score]
            for pos, score in enumerate(scores)
            if score >= threshold
        ]
        similar_molecules.sort(key=lambda entry: entry[1], reverse=True)

        similarity_dict[drug] = similar_molecules

    return similarity_dict

In [None]:
def create_similarity_list(val_combos, similarity_dict, val_index):
    """Compute normalised per-model ensemble weights for every validation pair.

    For each pair, the similar ingredients of both members are pooled. Every
    chunk of ``fullspace`` except ``val_index`` then receives

    * the baseline ``1 / n_models`` when none of those ingredients is in the
      chunk, or
    * ``s * (n / (0.001 * d)) + 1 / n_models`` otherwise, where ``s`` is the
      mean similarity of the matching ingredients, ``n`` the number of partner
      entries they have in ``combo_dict_list[chunk]`` and ``d`` is
      ``dataset_sizes[chunk]``.

    ``n_models`` is the number of chunks other than ``val_index`` (previously
    hard-coded as 9). The weights of each pair are scaled to sum to 1.

    Parameters
    ----------
    val_combos : list
        Pairs ``[id_a, id_b]``; both IDs must be keys of ``similarity_dict``.
    similarity_dict : dict
        ``{id: [[ingredient_id, similarity], ...]}``.
    val_index : int
        Index of the chunk/model to leave out.

    Returns
    -------
    list
        One entry per pair: ``[[chunk_index, weight], ...]`` in chunk order.
    """
    if not val_combos:
        return []

    # Built once: the candidate chunks with set views for O(1) membership.
    candidates = [
        (chunk_idx, set(chunk))
        for chunk_idx, chunk in enumerate(fullspace)
        if chunk_idx != val_index
    ]
    baseline = 1 / len(candidates)

    similarity_list = []
    for combo in val_combos:
        similar_molecules = similarity_dict[combo[0]] + similarity_dict[combo[1]]

        weights = []
        for chunk_idx, members in candidates:
            matches = [entry for entry in similar_molecules if entry[0] in members]
            if not matches:
                weights.append([chunk_idx, baseline])
                continue
            s = np.array([entry[1] for entry in matches]).mean()
            chunk_partners = combo_dict_list[chunk_idx]
            n = sum(len(chunk_partners[entry[0]]) for entry in matches)
            d = dataset_sizes[chunk_idx]
            weights.append([chunk_idx, s * (n / (0.001 * d)) + baseline])

        # Equivalent to 1 / total; the original arithmetic is kept so the
        # resulting floats are unchanged.
        multiplier = 100 / sum(entry[1] for entry in weights) * 0.01

        similarity_list.append([[entry[0], entry[1] * multiplier] for entry in weights])

    return similarity_list

In [None]:
def _partner_map(drugspace):
    """Map each ingredient of ``drugspace`` to its known partners within it."""
    partner_map = {ingredient: [] for ingredient in drugspace}
    for pair in combos:
        # Pairs with an ingredient outside the drugspace are ignored.
        if pair[0] in partner_map and pair[1] in partner_map:
            partner_map[pair[0]].append(pair[1])
            partner_map[pair[1]].append(pair[0])
    return partner_map


def _sample_negative_pairs(drugspace, partner_map, count):
    """Draw ``count`` distinct unordered pairs that are not known combinations.

    Does not terminate if fewer than ``count`` such pairs exist.
    """
    negatives = []
    seen = set()  # unordered pairs already drawn, for O(1) duplicate checks
    while len(negatives) < count:
        ingredient1 = random.choice(drugspace)
        ingredient2 = random.choice(drugspace)
        if ingredient1 == ingredient2:
            continue
        if ingredient2 in partner_map[ingredient1] or ingredient1 in partner_map[ingredient2]:
            continue
        pair_key = frozenset((ingredient1, ingredient2))
        if pair_key not in seen:
            seen.add(pair_key)
            negatives.append([ingredient1, ingredient2])
            if len(negatives) % 1000 == 0:
                print(len(negatives))
    return negatives


def _pair_feature_rows(pairs):
    """Build one feature row per pair and return ``(rows, kept_pairs)``.

    A row is descriptors + targets of the first ingredient followed by
    descriptors + targets of the second. Pairs that cannot be resolved are
    skipped, so ``kept_pairs[k]`` always belongs to ``rows[k]``.
    """
    rows = []
    kept = []
    for position, pair in enumerate(pairs):
        if position % 1000 == 0:
            print(position)
        try:
            row = (
                descriptors.loc[index_dict[pair[0]]].drop('IDs').to_list()
                + targets.loc[tar_dict[pair[0]]].drop('IDs').to_list()
                + descriptors.loc[index_dict[pair[1]]].drop('IDs').to_list()
                # The second ingredient's own target row (previously the first
                # ingredient's targets were repeated here).
                + targets.loc[tar_dict[pair[1]]].drop('IDs').to_list()
            )
        except (KeyError, IndexError):
            # Ingredient missing from a feature table, or a self-pair that
            # collapsed to one element during de-duplication.
            continue
        rows.append(row)
        kept.append(pair)
    return rows, kept


def create_validation_set(drugspace):
    """Build a balanced positive/negative pair dataset for ``drugspace``.

    Positive pairs are the known combinations (from ``combos``) whose members
    are both in ``drugspace``. An equal number of negative pairs is sampled at
    random from the ingredients that have at least one partner.

    Parameters
    ----------
    drugspace : list
        Ingredient IDs to build the set from.

    Returns
    -------
    val_set : pandas.DataFrame
        One row per pair with columns ``col_names``; positives first.
    val_results : list of int
        1 for each positive row, 0 for each negative row.
    val_combos : list
        The pair behind each row, in row order.
    """
    combo_dict = _partner_map(drugspace)

    print('setting positive combos')
    existing_combinations = [
        [ingredient, partner]
        for ingredient in drugspace
        for partner in combo_dict[ingredient]
    ]
    # De-duplicate unordered pairs.
    existing_combinations = list(map(list, set(map(frozenset, existing_combinations))))

    # Ingredients without any partner are not used for negatives.
    drugspace = [x for x in drugspace if combo_dict[x]]

    print('setting negative combos')
    n_combos = _sample_negative_pairs(drugspace, combo_dict, len(existing_combinations))

    print('constructing dataset')
    positive_rows, existing_combinations = _pair_feature_rows(existing_combinations)
    negative_rows, n_combos = _pair_feature_rows(n_combos)

    data = np.array(positive_rows)
    data1 = np.array(negative_rows)
    del positive_rows, negative_rows

    results = [1] * len(data) + [0] * len(data1)

    file = np.concatenate((data, data1), axis=0)
    del data, data1
    print('dataset shape:')
    print(file.shape)

    val_set = pd.DataFrame(file, columns=col_names)
    val_results = results
    val_combos = existing_combinations + n_combos

    return val_set, val_results, val_combos