In [16]:
import os
import pickle
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

import enlighten

from IPython.core.display import HTML
from IPython.display import SVG, Image, display

#RDKit related imports
from rdkit import RDLogger
from rdkit.Chem import PandasTools, AllChem as Chem, Descriptors
from rdkit.ML.Descriptors.MoleculeDescriptors import MolecularDescriptorCalculator
from rdkit.Chem import Draw

#scikit imports
from sklearn.model_selection import train_test_split

#tensorflow
import tensorflow as tf
from tensorflow import keras
import tensorflow_decision_forests as tfdf

RDLogger.logger().setLevel(RDLogger.CRITICAL)

In [24]:
import tensorflow as tf
print("Num GPUs Available: ", len(tf.test.gpu_device_name()))

Num GPUs Available:  13


2022-07-21 15:19:19.897007: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 15:19:19.897200: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 15:19:19.897296: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 15:19:19.897426: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-07-21 15:19:19.897522: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from S

In [34]:
not_used_desc = ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge', 'Ipc']
desc_calc = MolecularDescriptorCalculator([x for x in [x[0] for x in Descriptors.descList] if x not in not_used_desc])



In [3]:
cols = pd.read_csv('backup_df.csv', nrows=1).columns
dataframe = pd.read_csv('backup_df.csv', index_col=False, usecols=cols[1:])

In [4]:
PandasTools.AddMoleculeColumnToFrame(dataframe, 'PUBCHEM_CANONICAL_SMILES', 'ROMol', includeFingerprints=True)

In [5]:
RF_dataset = dataframe.copy()

In [6]:
mol_descs = []
activity_binary = []
activity_regressor = []


pbar = enlighten.Counter(total=len(RF_dataset.index), desc='Calculating molecular descriptors', unit='ticks')

for ID, row in RF_dataset.iterrows():
    # print(i)
    nan_flag = False

    mol = row.ROMol# if i == max_radius else row[f'FRAG_R{i}']          
    descriptor = desc_calc.CalcDescriptors(mol)
    for j in descriptor:
        if np.isnan(j):
            nan_flag = True
            break 
    if nan_flag:
        print(ID)
        print(f'Line {ID} contains NaN values, removing...')
        RF_dataset.drop(ID, inplace=True)
        continue
    # print(descriptor)
    mol_descs.append(descriptor)
    activity_binary.append(row.PUBCHEM_ACTIVITY_OUTCOME)
    activity_regressor.append(row.PUBCHEM_ACTIVITY_SCORE)
    pbar.update()

9561
Line 9561 contains NaN values, removing...
9637
Line 9637 contains NaN values, removing...
9644
Line 9644 contains NaN values, removing...
9667
Line 9667 contains NaN values, removing...
9669
Line 9669 contains NaN values, removing...
9693
Line 9693 contains NaN values, removing...
9701
Line 9701 contains NaN values, removing...
9708
Line 9708 contains NaN values, removing...
9709
Line 9709 contains NaN values, removing...
9722
Line 9722 contains NaN values, removing...
9967
Line 9967 contains NaN values, removing...
18834
Line 18834 contains NaN values, removing...
19009
Line 19009 contains NaN values, removing...
19021
Line 19021 contains NaN values, removing...
19039
Line 19039 contains NaN values, removing...
19051
Line 19051 contains NaN values, removing...
23705
Line 23705 contains NaN values, removing...
37288
Line 37288 contains NaN values, removing...
38542
Line 38542 contains NaN values, removing...
38557
Line 38557 contains NaN values, removing...
48679
Line 48679 conta

In [10]:
print(len(mol_descs))
print(len(activity_binary))
print(len(activity_regressor))
print(len(desc_calc.descriptorNames))

305449
305449
305449
204


In [15]:
arr = np.array(mol_descs, dtype=np.float64)

In [16]:


model_df = pd.DataFrame(arr, columns=desc_calc.descriptorNames)
model_df['PUBCHEM_ACTIVITY_OUTCOME'] = activity_binary
model_df['PUBCHEM_ACTIVITY_SCORE'] = activity_regressor

model_df.to_csv('RF_TF_desc.csv', index=False)

In [37]:
try:
    len(model_df.index)
except:
    model_df = pd.read_csv('RF_TF_desc.csv')

In [38]:
model_df.drop('Ipc', axis=1, inplace=True)

In [39]:
activity_binary = np.ravel(model_df['PUBCHEM_ACTIVITY_OUTCOME'])
activity_regressor = np.ravel(model_df['PUBCHEM_ACTIVITY_SCORE'])
del model_df['PUBCHEM_ACTIVITY_OUTCOME']
del model_df['PUBCHEM_ACTIVITY_SCORE']

In [40]:
model_df = model_df.to_numpy()

In [41]:
#split the descriptors
X_train, X_test, Y_BIN_train, Y_BIN_test, Y_REG_train, Y_REG_test = train_test_split(model_df, activity_binary, activity_regressor, random_state=20)

In [52]:
#BINARY
test_descriptor_df = pd.DataFrame(X_test, columns=desc_calc.descriptorNames)

test_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y_BIN_test

train_descriptor_df = pd.DataFrame(X_train, columns=desc_calc.descriptorNames)
train_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y_BIN_train

In [45]:
#parameters
NUM_TREES = 2000

MIN_EXAMPLES = 6

MAX_DEPTH = 5

SUBSAMPLE = 0.65

SAMPLING_METHOD = "RANDOM"

VALIDATION_RATIO = 0.1

In [53]:
def run_RF_model(model, train_data, test_data, num_epochs=1, batch_size=1):
    train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        train_data, label='PUBCHEM_ACTIVITY_OUTCOME'
    )
    test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset(
        test_data, label='PUBCHEM_ACTIVITY_OUTCOME'
    )
    model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size)
    _, accuracy = model.evaluate(test_dataset, verbose=0)
    print(f"Test accuracy: {round(accuracy * 100, 2)}%")

In [54]:
def show_feature_usage():
    feature_usages = []

    #there are only numerical descriptors
    for feature_name in desc_calc.descriptorNames:
        usage = tfdf.keras.FeatureUsage(name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL)
        feature_usages.append(usage)

    return feature_usages

In [55]:
def create_GBT_model():
    gbt_model = tfdf.keras.GradientBoostedTreesModel(
        features=show_feature_usage(),
        exclude_non_specified_features=True,
        num_trees=NUM_TREES,
        max_depth=MAX_DEPTH,
        min_examples=MIN_EXAMPLES,
        subsample=SUBSAMPLE,
        validation_ratio=VALIDATION_RATIO,
        task=tfdf.keras.Task.CLASSIFICATION,
    )
    gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")])
    return gbt_model

In [56]:
gbt_model = create_GBT_model()

Use /tmp/tmp3v9zzq3e as temporary training directory


In [57]:
run_RF_model(gbt_model, train_data=train_descriptor_df, test_data=test_descriptor_df)

  features_dataframe = dataframe.drop(label, 1)




  features_dataframe = dataframe.drop(label, 1)


Reading training dataset...
Training dataset read in 0:00:16.569998. Found 229086 examples.
Training model...
Model trained in 0:01:04.173103
Compiling model...


[INFO kernel.cc:1176] Loading model from path /tmp/tmp3v9zzq3e/model/ with prefix a28f6be13f91490d
[INFO abstract_model.cc:1248] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO kernel.cc:1022] Use fast generic engine


Model compiled.
Test accuracy: 99.5%


In [None]:
print(gbt_model.summary())

In [None]:
train_descriptor_df = pd.DataFrame(X_test, columns=[desc_calc.descriptorNames])
train_descriptor_df['PUBCHEM_ACTIVITY_OUTCOME'] = Y1_test
train_descriptor_df['PUBCHEM_ACTIVITY_SCORE'] = Y2_test