In [1]:
import os
import pickle
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import enlighten

from IPython.core.display import HTML
from IPython.display import SVG, Image, display

#RDKit related imports
from rdkit import RDLogger
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import Draw

#scikit imports
from sklearn.model_selection import train_test_split

#tensorflow
import tensorflow as tf
from tensorflow import keras
import tensorflow_decision_forests as tfdf

RDLogger.logger().setLevel(RDLogger.CRITICAL)



In [2]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.test.gpu_device_name()))

Num GPUs Available:  13


2022-08-03 14:54:02.523932: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-03 14:54:02.554644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:54:02.578448: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-08-03 14:54:02.578595: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

In [3]:
not_used_desc = ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'Ipc']
desc_calc = MolecularDescriptorCalculator([x for x in [x[0] for x in Descriptors.descList] if x not in not_used_desc])



In [4]:
cols = pd.read_csv('backup_df.csv', nrows=1).columns
dataframe = pd.read_csv('backup_df.csv', index_col=False, usecols=cols[1:])

In [5]:
PandasTools.AddMoleculeColumnToFrame(dataframe, 'PUBCHEM_CANONICAL_SMILES', 'ROMol', includeFingerprints=True)

In [6]:
RF_dataset = dataframe.copy()

In [9]:
mol_descs = []
activity_binary = []
activity_regressor = []
ECFP6s = []
topols = []
atom_pairs = []



pbar = enlighten.Counter(total=len(RF_dataset.index), desc='Calculating molecular descriptors', unit='ticks')

for ID, row in RF_dataset.iterrows():
    # print(i)
    nan_flag = False

    mol = row.ROMol# if i == max_radius else row[f'FRAG_R{i}']          
    descriptor = desc_calc.CalcDescriptors(mol)
    for j in descriptor:
        if np.isnan(j):
            nan_flag = True
            break 
    if nan_flag:
        print(ID)
        print(f'Line {ID} contains NaN values, removing...')
        RF_dataset.drop(ID, inplace=True)
        continue

    ECFP6_vec = Chem.GetMorganFingerprintAsBitVect(mol, radius=3)
    topol_vec =  Chem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol)
    atpair_vec = Chem.GetHashedAtomPairFingerprintAsBitVect(mol)

    ECFP6s.append(ECFP6_vec)
    topols.append(topol_vec)
    atom_pairs.append(atpair_vec)



    # print(descriptor)
    mol_descs.append(descriptor)
    activity_binary.append(row.PUBCHEM_ACTIVITY_OUTCOME)
    activity_regressor.append(row.PUBCHEM_ACTIVITY_SCORE)
    pbar.update()

In [15]:
print(len(topols))

305449


In [18]:
feat_lists = [mol_descs, ECFP6s, topols, atom_pairs]

for ID, i in enumerate(feat_lists):
    newlist = feat_lists.remove(ID)
    for j in newlist:
        print(len(i))
        print(len(j))
        assert len(i) == len(j)




SyntaxError: incomplete input (979687259.py, line 10)

In [9]:
print(len(mol_descs))
print(len(activity_binary))
print(len(activity_regressor))
print(len(desc_calc.descriptorNames))

305449
305449
305449
203


In [10]:
arr = np.array(mol_descs, dtype=np.float64)

In [11]:


model_df = pd.DataFrame(arr, columns=desc_calc.descriptorNames)
model_df['PUBCHEM_ACTIVITY_OUTCOME'] = activity_binary
model_df['PUBCHEM_ACTIVITY_SCORE'] = activity_regressor

model_df.to_csv('RF_TF_desc.csv', index=False)

In [12]:
try:
    len(model_df.index)
except:
    model_df = pd.read_csv('RF_TF_desc.csv')

In [None]:
# model_df.drop('Ipc', axis=1, inplace=True)

In [14]:
activity_binary = np.ravel(model_df['PUBCHEM_ACTIVITY_OUTCOME'])
activity_regressor = np.ravel(model_df['PUBCHEM_ACTIVITY_SCORE'])
del model_df['PUBCHEM_ACTIVITY_OUTCOME']
del model_df['PUBCHEM_ACTIVITY_SCORE']

In [15]:
model_df = model_df.to_numpy()

In [16]:
#split the descriptors
X_train, X_test, Y_BIN_train, Y_BIN_test, Y_REG_train, Y_REG_test = train_test_split(model_df, activity_binary, activity_regressor, random_state=20)

In [22]:
#remove inactives from the training set
print(Y_BIN_test)

actives = 0
for i in Y_BIN_test:
    if i == 'Active':
        actives += 1 

print(actives)
print(len(Y_BIN_test))



['Inactive' 'Inactive' 'Inactive' ... 'Inactive' 'Inactive' 'Inactive']
369
76363


In [33]:
y_test_no_inactives = np.array([i for i in Y_BIN_test if i == 'Active'])

act_index = np.argwhere(Y_BIN_test=='Inactive')
x_test_no_inactives = np.delete(X_test, act_index, 0)


In [34]:
print(y_test_no_inactives.shape)
print(x_test_no_inactives.shape)

(369,)
(369, 203)


In [24]:
#BINARY
test_descriptor_df = pd.DataFrame(X_test, columns=desc_calc.descriptorNames)

test_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y_BIN_test

train_descriptor_df = pd.DataFrame(X_train, columns=desc_calc.descriptorNames)
train_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y_BIN_train

In [38]:
test_descriptor_df2 = pd.DataFrame(x_test_no_inactives, columns=desc_calc.descriptorNames)
test_descriptor_df2['PUBCHEM_ACTIVITY_OUTCOME'] = y_test_no_inactives

In [39]:
#parameters
NUM_TREES = 2000

MIN_EXAMPLES = 6

MAX_DEPTH = 5

SUBSAMPLE = 0.65

SAMPLING_METHOD = "RANDOM"

VALIDATION_RATIO = 0.1

In [40]:
def run_RF_model(model, train_data, test_data, num_epochs=1, batch_size=1):
    train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        train_data, label='PUBCHEM_ACTIVITY_OUTCOME'
    )
    test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label='PUBCHEM_ACTIVITY_OUTCOME'
    )
    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

In [41]:
def show_feature_usage():
    feature_usages = []

    #there are only numerical descriptors
    for feature_name in desc_calc.descriptorNames:
        usage = tfdf.keras.FeatureUsage(name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
        feature_usages.append(usage)

    return feature_usages

In [42]:
def create_GBT_model():
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        features=show_feature_usage(),
        exclude_non_specified_features=True,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
    )
    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return gbt_model

In [43]:
gbt_model1 = create_GBT_model()

Use /tmp/tmp_9pe0_4s as temporary training directory


2022-07-27 12:06:10.056556: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 12:06:10.056742: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 12:06:10.056835: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 12:06:10.057096: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-27 12:06:10.057194: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

In [44]:
gbt_model2 = create_GBT_model()

Use /tmp/tmpxsoc_fda as temporary training directory


In [None]:
run_RF_model(gbt_model1, train_data=train_descriptor_df, test_data=test_descriptor_df)

In [None]:
run_RF_model(gbt_model2, train_data=train_descriptor_df, test_data=test_descriptor_df2)

In [None]:
print(gbt_model1.summary())

In [None]:
train_descriptor_df = pd.DataFrame(X_test, columns=[desc_calc.descriptorNames])
train_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y1_test
train_descriptor_df['PUBCHEM_ACTIVITY_SCORE'] = Y2_test