### Create dataset for testing

In [7]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# --- SET YOUR CONFIGURATION ---
INPUT_CSV = r'C:\Users\andersen\PycharmProjects\DEEPDILI\Full_DeepDILI\PUBLIC_DILI - DILI Compound Generation and Scoring.csv'      # Your source file
SMILES_COL = 'SMILES'            # Name of your SMILES column
ID_COL = 'Compound'              # Name of your ID column
OUTPUT_SDF = "Mold2_input.sdf"

def generate_valid_mold2_sdf():
    df = pd.read_csv(INPUT_CSV)
    # Using SDWriter automatically ensures 7-column bond blocks
    writer = Chem.SDWriter(OUTPUT_SDF)

    for idx, row in df.iterrows():
        mol = Chem.MolFromSmiles(str(row[SMILES_COL]))
        if mol:
            # 1. Standardize for descriptors
            mol = Chem.AddHs(mol)
            AllChem.Compute2DCoords(mol)
            Chem.Kekulize(mol)

            # 2. Set ID in Title (Line 1) and Name tag
            mol.SetProp("_Name", str(row[ID_COL]))
            mol.SetProp("Name", str(row[ID_COL]))

            writer.write(mol)
    writer.close()

    # 3. Post-Processing: Strip the extra Line 3 to move V2000 up to Line 3
    with open(OUTPUT_SDF, 'r') as f:
        lines = f.readlines()

    cleaned_lines = []
    i = 0
    while i < len(lines):
        if i + 2 < len(lines) and "V2000" in lines[i+3]:
            # If the V2000 line is at index 3, it means index 2 is the extra line
            cleaned_lines.append(lines[i])   # Title
            cleaned_lines.append(lines[i+1]) # Software info
            cleaned_lines.append(lines[i+3]) # Move V2000 line up to Line 3
            i += 4
            while i < len(lines) and "$$$$" not in lines[i]:
                cleaned_lines.append(lines[i])
                i += 1
            if i < len(lines): cleaned_lines.append(lines[i])
        else:
            cleaned_lines.append(lines[i])
        i += 1

    with open(OUTPUT_SDF, 'w', newline='\n') as f:
        f.writelines(cleaned_lines)

generate_valid_mold2_sdf()

[17:40:08] Explicit valence for atom # 6 S, 7, is greater than permitted


In [1]:
###Loading packages
import os
#import numpy as np
import pandas as pd
import math
import itertools

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import class_weight
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold


from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

from os.path import isfile, join
from functools import reduce
from numpy.random import seed
seed(1)
import itertools
    
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings("ignore",category=FutureWarning)
    import numpy as np
    from keras.models import load_model

def model_predict(X, model, col_name, test_index):
    y_pred_prob = model.predict_proba(X)
    y_pred_prob = y_pred_prob[:, 1]

    ###save the predicted result to dataframe
    pred_result = pd.DataFrame()
    pred_result['id'] = test_index
    pred_result[col_name] = y_pred_prob
    
    return pred_result

def predictionDf(X_val, X_test):
    pred_val = pd.DataFrame()
    pred_val['id'] = X_val.index
    
    pred_test = pd.DataFrame()
    pred_test['id'] = X_test.index
    
    return pred_val, pred_test

def combinePredictions(X_val_s, X_test_s, X_val_index, X_test_index, pred_val, pred_test, clf, col_name):
    val_prob = model_predict(X_val_s, clf, col_name, X_val_index)
    pred_val = pd.merge(pred_val, val_prob, on='id', how='left') 

    test_prob = model_predict(X_test_s, clf, col_name, X_test_index)
    pred_test = pd.merge(pred_test, test_prob, on='id', how='left')
    return pred_val, pred_test 


def selectBaseClassifiers(knns, lrs, svms, rfs, xgboosts, selected_models):

    knns = knns[['id', *selected_models[selected_models.model == 'knn'].seed.unique()]]
    knns.columns = ['id', *['knn_' + str(col) for col in knns.columns[1:]]]

    lrs = lrs[['id', *selected_models[selected_models.model == 'lr'].seed.unique()]]
    lrs.columns = ['id', *['lr_' + str(col) for col in lrs.columns[1:]]]

    svms = svms[['id', *selected_models[selected_models.model == 'svm'].seed.unique()]]
    svms.columns = ['id', *['svm_' + str(col) for col in svms.columns[1:]]]

    rfs = rfs[['id', *selected_models[selected_models.model == 'rf'].seed.unique()]]
    rfs.columns = ['id', *['rf_' + str(col) for col in rfs.columns[1:]]]
 
    xgboosts = xgboosts[['id', *selected_models[selected_models.model == 'xgboost'].seed.unique()]]
    xgboosts.columns= ['id', *['xgboost_' + str(col) for col in xgboosts.columns[1:]]]
    
    prob_output = reduce(lambda x,y: pd.merge(x,y, on='id', how='left'), [knns, lrs, svms, rfs, xgboosts])
    return prob_output

def baseClassifiers(X, y, X_val, X_test, selected_models):
    #initial dataframe to hold prediction results
    pred_val_knn, pred_test_knn = predictionDf(X_val, X_test)
    pred_val_lr, pred_test_lr = predictionDf(X_val, X_test)
    pred_val_svm, pred_test_svm = predictionDf(X_val, X_test)
    pred_val_rf, pred_test_rf = predictionDf(X_val, X_test)
    pred_val_xgboost, pred_test_xgboost = predictionDf(X_val, X_test)

    for i, col_name in enumerate(train_index_df.columns[5:].values):
        print(i)
        ###get train dataset
        train_index = train_index_df[train_index_df[col_name] == 1].id.unique()

        ###get train dataset
        X_train = X[X.index.isin(train_index)]
        y_train = y[y.index.isin(train_index)]
            

        ### scale the input
        sc = MinMaxScaler()
        sc.fit(X_train)
        X_train = sc.transform(X_train)
        X_val_s = sc.transform(X_val)
        X_test_s = sc.transform(X_test)

        ##KNN
        ###fit model
        knn = KNeighborsClassifier(n_neighbors=7)
        knn.fit(X_train, y_train)
        ###predict test results
        pred_val_knn, pred_test_knn = combinePredictions(X_val_s, X_test_s, X_val.index, X_test.index, pred_val_knn, pred_test_knn, knn, col_name)

        ##LR
        ###fit model
        lr = LogisticRegression(C=0.1, max_iter=300, class_weight = 'balanced')
        lr.fit(X_train, y_train)
        ###predict test results
        pred_val_lr, pred_test_lr = combinePredictions(X_val_s, X_test_s, X_val.index, X_test.index, pred_val_lr, pred_test_lr, lr, col_name)

        ##SVM
        ###fit model
        svm = SVC(kernel='rbf', C=1, gamma='scale', probability=True,  class_weight = 'balanced', random_state=1)
        svm.fit(X_train, y_train)
        ###predict test results
        pred_val_svm, pred_test_svm = combinePredictions(X_val_s, X_test_s, X_val.index, X_test.index, pred_val_svm, pred_test_svm, svm, col_name)

        ##RF
        ###fit model
        rf = RandomForestClassifier(random_state=1, n_estimators=700, max_depth=11,  min_samples_leaf=5, class_weight='balanced', bootstrap = True, max_features='log2')
        rf.fit(X_train, y_train)
        ###predict test results
        pred_val_rf, pred_test_rf = combinePredictions(X_val_s, X_test_s, X_val.index, X_test.index, pred_val_rf, pred_test_rf, rf, col_name)

        ##XGBoost
        ###fit model
        xgboost = XGBClassifier(learning_rate=0.01, n_estimators=700, max_depth=11, subsample=0.7, scale_pos_weight=0.66)
        xgboost.fit(X_train, y_train)
        ###predict test results
        pred_val_xgboost, pred_test_xgboost = combinePredictions(X_val_s, X_test_s, X_val.index, X_test.index, pred_val_xgboost, pred_test_xgboost, xgboost, col_name)


    val_prob = selectBaseClassifiers(pred_val_knn, pred_val_lr, pred_val_svm, pred_val_rf, pred_val_xgboost, selected_models)
    test_prob = selectBaseClassifiers(pred_test_knn, pred_test_lr, pred_test_svm, pred_test_rf, pred_test_xgboost, selected_models)
    return val_prob, test_prob

def deepdili_predict(X, model, col_name, X_index):
    y_pred = model.predict(X)
    y_pred_class = np.where(y_pred > 0.5, 1, 0)
    pred_result = pd.DataFrame()
    pred_result['id'] = X_index
    pred_result['prob_'+col_name] = y_pred
    pred_result['class_'+col_name] = y_pred_class

    return pred_result

def dataPrep(df, mergedf, traindf, preddf):
    mergedf= mergedf.rename(columns={'PubChem':'CID'})
    df = pd.merge(df, mergedf.iloc[:,:4], on='CID', how='left')
    df = pd.merge(df, traindf[['CID', 'dilist_label', 'initial_approval_year']], on='CID', how='left')
    df['in_train'] = np.where(df.initial_approval_year < 1997, 1, 0)
    df = pd.merge(df, preddf[['prob_DeepDILI','class_DeepDILI', 'CID',]], on='CID', how='left')
    df = df[df.in_train == 0]
    df = df[['CompoundName','CID', 'Canonical SMILES', 'DILI_label', 'Prediction', 'prob_DeepDILI', 'class_DeepDILI']]
    return df

def measurements(y_test, y_pred):
    TN, FP, FN, TP = metrics.confusion_matrix(y_test, y_pred).ravel()
    acc = metrics.accuracy_score(y_test, y_pred)
    sensitivity = metrics.recall_score(y_test, y_pred)
    specificity = TN/(TN+FP)
    return [round(acc,3), round(sensitivity,3), round(specificity,3)]

def get_measurements(df, col1, col2, dataset, model_name):
    result = measurements(df[col1], df[col2])
    print('\033[1m{} model performance in {} dataset: \033[0m'.format(model_name, dataset))
    print("Accuracy:    {0:.3f}".format(result[0]))
    print("Sensitivity: {0:.3f}".format(result[1]))
    print("Specificity: {0:.3f}".format(result[2]))    




### DeepDILI predictions

In [3]:
###################################################################################################################
################################################### Data Preparation ##############################################
###################################################################################################################
###read train1(X) and train2(X_val) dataset
data = pd.read_csv('mold2_1002_full.csv',low_memory=False)
X_org,  y_org = data.iloc[:,4:], data['DILI_label']
X, X_val, y, y_val = train_test_split(X_org,  y_org, test_size=0.2, stratify=y_org, random_state=7)
cols = X.columns


### read test dataset
external = #NOVEL COMPOUNDS HERE
X_test = external[cols]

###import trainind data index (which was used in the training)
train_index_df = pd.read_csv('train_index.csv')
###import the selected models index
selected_models = pd.read_csv('selected_full_model_mcc.csv')

###################################################################################################################
################################################### DeepDILI prediction ###########################################
###################################################################################################################
val_prob, test_prob = baseClassifiers(X, y, X_val, X_test, selected_models)
###normalization
sc = StandardScaler()
sc.fit(val_prob.iloc[:, 1:])
test_prob_s = sc.transform(test_prob.iloc[:, 1:]) 
###import deepdili model
model_path = 'best_model.h5'
deepdili = load_model(model_path)
###make prediction
predictions = deepdili_predict(test_prob_s, deepdili, 'DeepDILI', test_prob.index)
###save the results
predictions = pd.concat([external, predictions], axis=1)
predictions.to_csv('/results/')

KeyError: "None of [Index(['D004', 'D005', 'D007', 'D013', 'D015', 'D016', 'D025', 'D027', 'D033',\n       'D034',\n       ...\n       'D768', 'D769', 'D770', 'D771', 'D772', 'D773', 'D774', 'D775', 'D776',\n       'D777'],\n      dtype='str', length=337)] are in the [columns]"