In [None]:
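# Optional one-time environment setup: uncomment to run.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# %pip install rdkit-pypi
# %pip install tensorflow_datasets
# %pip install --upgrade tensorflow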

In [5]:
#!pip install -e ../.

In [2]:
import rdkit


In [3]:
from feature_extractor import fingerprint_features

In [4]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

# Turn off tensorflow_datasets progress bars for the rest of the session.
tfds.disable_progress_bar()

In [5]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
    """Plot a training metric and, when recorded, its validation counterpart.

    Args:
        history: Object exposing a ``history`` mapping from metric name to a
            per-epoch sequence of values (presumably the object returned by
            Keras ``model.fit`` -- confirm against callers).
        metric: Name of the training metric, e.g. ``"accuracy"``. The
            validation series is looked up under ``"val_" + metric``.

    Raises:
        KeyError: If ``metric`` itself is missing from ``history.history``.
    """
    series = history.history
    val_metric = 'val_' + metric
    legend_entries = [metric]

    plt.plot(series[metric])
    # The validation series is optional: skip the curve (and its legend
    # entry) when it was not recorded instead of failing with KeyError.
    if val_metric in series:
        plt.plot(series[val_metric])
        legend_entries.append(val_metric)

    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend(legend_entries)

In [6]:
import numpy as np
def get_fingerprint(smiles):
    """Turn a SMILES string into a 0/1 NumPy vector of fingerprint bits.

    ``fingerprint_features(smiles)`` must return an object with a
    ``ToBitString()`` method yielding a string of ``'0'``/``'1'`` characters
    (presumably an RDKit bit vector -- confirm in ``feature_extractor``).

    Returns:
        1-D ``uint8`` array with one element per character of the bit string.
    """
    bit_string = fingerprint_features(smiles).ToBitString()
    # View the ASCII bytes directly, then shift '0'/'1' (48/49) down to 0/1.
    ascii_codes = np.frombuffer(bit_string.encode(), dtype='u1')
    return ascii_codes - ord('0')

In [7]:
import pandas as pd
# Load the raw table and discard the "mol_id" column.
data = (
    pd.read_csv("../data/0_raw/data.csv")
    .drop(columns="mol_id")
)
#data["fingerprint"] = data["smiles"].apply(get_fingerprint)

In [8]:
# Model inputs are the "smiles" column; the binary target is the "P1" column.
X, y = data["smiles"].values, data["P1"].values

In [34]:
from sklearn.model_selection import train_test_split
# First carve out 70 % for training, stratified on the label ...
X_train, X_not_train, y_train, y_not_train = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# ... then halve the remaining 30 % into validation and test, again stratified.
X_valid, X_test, y_valid, y_test = train_test_split(
    X_not_train,
    y_not_train,
    test_size=0.5,
    stratify=y_not_train,
    random_state=42,
)

In [35]:
# Split sizes, shown in the order (train, test, valid).
tuple(map(len, (X_train, X_test, X_valid)))

(3499, 750, 750)

In [36]:
# Featurise every SMILES string of each split; each result is a list of 1-D arrays.
X_train_fingerprint = list(map(get_fingerprint, X_train))
X_valid_fingerprint = list(map(get_fingerprint, X_valid))
X_test_fingerprint = list(map(get_fingerprint, X_test))

In [37]:
# Number of fingerprints per split, shown in the order (train, valid, test).
tuple(map(len, (X_train_fingerprint, X_valid_fingerprint, X_test_fingerprint)))

(3499, 750, 750)

In [38]:
# Stack each split's fingerprints into one float64 matrix.
# `np.float_` was removed in NumPy 2.0; `np.asarray(..., dtype=np.float64)`
# performs the same conversion and works on both NumPy 1.x and 2.x.
train_features = np.asarray(X_train_fingerprint, dtype=np.float64)
train_labels = y_train
valid_features = np.asarray(X_valid_fingerprint, dtype=np.float64)
valid_labels = y_valid
test_features = np.asarray(X_test_fingerprint, dtype=np.float64)
test_labels = y_test


In [39]:
# Features and labels of the training split must have the same length.
tuple(map(len, (train_features, y_train)))

(3499, 3499)

In [40]:
import tensorflow as tf
# Dense stack with dropout on 2048-wide inputs, ending in a sigmoid unit.
# NOTE: this `model` binding is replaced by the functional model built in the
# next cell.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(56, input_shape=(2048,)))
model.add(tf.keras.layers.Dropout(0.6))
model.add(tf.keras.layers.Dense(56, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=['accuracy'],
)

In [19]:
from tensorflow import keras
# Functional-API model: 2048 inputs -> 30 ReLU units -> 1 sigmoid output.
inputs = keras.Input(shape=(2048,))
x = keras.layers.Dense(units=30, activation="relu")(inputs)
outputs = keras.layers.Dense(units=1, activation='sigmoid')(x)
model = keras.Model(name="servier_model", inputs=inputs, outputs=outputs)
model.summary()

Model: "servier_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 2048)]            0         
                                                                 
 dense_3 (Dense)             (None, 30)                61470     
                                                                 
 dense_4 (Dense)             (None, 1)                 31        
                                                                 
Total params: 61,501
Trainable params: 61,501
Non-trainable params: 0
_________________________________________________________________


In [74]:



def make_model(metrics=None, output_bias=None):
    """Build and compile a small dense binary classifier on 2048-wide inputs.

    Architecture: Dense(16, relu) -> Dropout(0.5) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 1e-3) and binary cross-entropy.

    Args:
        metrics: Metrics passed to ``model.compile``. ``None`` (the default)
            means "use the module-level ``METRICS`` list". The name is looked
            up when the function is *called*, not when it is defined, so this
            definition does not raise ``NameError`` if the cell that creates
            ``METRICS`` has not been executed yet.
        output_bias: Optional initial value for the bias of the output layer.
            ``None`` selects the ``"zeros"`` initializer.

    Returns:
        The compiled ``keras.Sequential`` model.
    """
    if metrics is None:
        metrics = METRICS

    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        # Name the initializer explicitly instead of handing the layer None.
        bias_initializer = "zeros"

    model = keras.Sequential([
        keras.layers.Dense(16, activation='relu', input_shape=(2048,)),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1, activation='sigmoid',
                           bias_initializer=bias_initializer),
    ])

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [75]:
# Training budget: upper bound on epochs, and mini-batch size.
BATCH_SIZE = 128
EPOCHS = 100

# Stop once 'val_auc' has not improved for 10 epochs (higher is better) and
# roll the model back to the weights of its best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [80]:

from tensorflow import keras
# Metrics tracked during training and evaluation. The `name` values become
# the keys under which results are reported (e.g. 'auc').
METRICS = [
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
    keras.metrics.AUC(name='auc'),
    # Area under the precision-recall curve.
    keras.metrics.AUC(name='prc', curve='PR'),
]

def get_imbalance_params(y):
    """Derive an output-bias initial value and class weights from binary labels.

    Args:
        y: 1-D sequence of non-negative integer labels containing both
            class 0 and class 1 (and nothing else).

    Returns:
        A ``(initial_bias, class_weight)`` tuple where

        * ``initial_bias`` is a length-1 array holding ``log(pos / neg)``;
        * ``class_weight`` is ``{0: w0, 1: w1}`` with
          ``w = (1 / class_count) * (total / 2)``.

    Raises:
        ValueError: If the labels are not exactly the two classes 0 and 1, or
            if either class has no examples. Without this check a missing
            class 0 would yield a division by zero and inf/NaN results.
    """
    counts = np.bincount(y)
    if counts.size != 2 or not counts.all():
        raise ValueError(
            "get_imbalance_params expects binary labels with at least one "
            "example of class 0 and of class 1; got class counts "
            f"{counts.tolist()}"
        )

    neg, pos = counts
    total = neg + pos
    initial_bias = np.log([pos / neg])
    class_weight = {
        0: (1 / neg) * (total / 2.0),
        1: (1 / pos) * (total / 2.0),
    }
    return initial_bias, class_weight



def make_model(metrics=None, output_bias=None):
    """Build and compile the functional "servier_model" binary classifier.

    Architecture: Input(2048) -> Dense(30, relu) -> Dense(1, sigmoid),
    compiled with Adam (learning rate 1e-3) and binary cross-entropy.

    Args:
        metrics: Metrics passed to ``model.compile``. ``None`` (the default)
            means "use the module-level ``METRICS`` list", resolved when the
            function is called so the definition does not depend on
            ``METRICS`` already existing.
        output_bias: Optional initial value for the bias of the output layer.
            ``None`` selects the ``"zeros"`` initializer.

    Returns:
        The compiled ``keras.Model``.
    """
    if metrics is None:
        metrics = METRICS

    if output_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(output_bias)
    else:
        # Name the initializer explicitly instead of handing the layer None.
        bias_initializer = "zeros"

    inputs = keras.Input(shape=(2048,))
    hidden = keras.layers.Dense(30, activation="relu")(inputs)
    outputs = keras.layers.Dense(
        1, activation='sigmoid', bias_initializer=bias_initializer)(hidden)
    model = keras.Model(inputs=inputs, outputs=outputs, name="servier_model")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.BinaryCrossentropy(),
        metrics=metrics)

    return model

In [81]:
# Derive the imbalance corrections from the training labels only, then build
# the model with its output bias initialised accordingly.
initial_bias, class_weight = get_imbalance_params(y=y_train)
model = make_model(metrics=METRICS, output_bias=initial_bias)
model.summary()

Model: "servier_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_12 (InputLayer)       [(None, 2048)]            0         
                                                                 
 dense_28 (Dense)            (None, 30)                61470     
                                                                 
 dense_29 (Dense)            (None, 1)                 31        
                                                                 
Total params: 61,501
Trainable params: 61,501
Non-trainable params: 0
_________________________________________________________________


In [82]:
# Train with per-class weights, validating on the validation split after each
# epoch; `early_stopping` may end training before EPOCHS is reached.
weighted_history = model.fit(
    x=train_features,
    y=train_labels,
    validation_data=(valid_features, valid_labels),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight,
    callbacks=[early_stopping],
)

Epoch 1/100


2022-04-19 22:32:49.995500: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2022-04-19 22:32:51.341781: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 00017: early stopping


In [83]:
# Score the trained model on the test split, which was not used for fitting
# or early stopping.
res = model.evaluate(x=test_features, y=test_labels)

