<a href="https://colab.research.google.com/github/ddhackiisc/code/blob/master/DILI/DILI_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install padelpy

In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Logger used by install() to report progress.
logger = getLogger(__name__)
# getLogger() hands back the same logger object each time this cell is run, so
# unconditionally adding a handler would attach one more StreamHandler per
# re-run and every message would be printed multiple times.
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """Install RDKit through a private Miniconda tree.

    Usage when this code lives in a module named ``rdkit_installer``:
    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Steps: optionally append the Miniconda ``site-packages`` directory to
    ``sys.path``; return early if an ``rdkit`` package directory already
    exists there (unless ``force``); otherwise delete whatever is at
    ``conda_path``, download the Miniconda installer into the current working
    directory, run it, and ``conda install`` RDKit for the running Python
    version.

    Args:
        chunk_size: Number of bytes per chunk while streaming the installer
            to disk.
        file_name: Installer file name; appended to ``url_base`` and also used
            as the local download path.
        url_base: Base URL the installer is fetched from.
            NOTE(review): confirm this host still serves the installer.
        conda_path: Directory Miniconda is installed into. An existing
            directory or file at this path is removed before installing.
        rdkit_version: Exact RDKit version to request (``rdkit==<version>``);
            ``None`` lets conda choose.
        add_python_path: When true, append the Miniconda ``site-packages``
            directory to ``sys.path`` if it is not already present.
        force: Re-install even if an ``rdkit`` directory is already present.

    Returns:
        None.

    Raises:
        requests.HTTPError: The installer download returned an error status.
        subprocess.CalledProcessError: The Miniconda installer or
            ``conda install`` exited with a non-zero status.
        ImportError: RDKit could not be imported afterwards even though
            ``add_python_path`` was requested.
    """

    python_path = os.path.join(
        conda_path,
        "lib",
        # format() ignores the extra version_info fields beyond {0} and {1}.
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # Start from a clean target: the installer is run in batch mode against
    # conda_path, so anything already there is removed first.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    # With stream=True the connection stays checked out until the body is
    # fully read or the response is closed; the context manager guarantees the
    # release even when raise_for_status() or the file write fails.
    with requests.get(url, stream=True) as res:
        res.raise_for_status()
        with open(file_name, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    logger.info('done')

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        # Pin conda's Python to the running interpreter's exact version.
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    try:
        import rdkit
    except ImportError:
        # If the caller asked us to extend sys.path, a failed import is a real
        # installation problem. If they opted out, the package may simply not
        # be on sys.path, which should not turn a successful install into an
        # exception.
        if add_python_path:
            raise
        logger.info(
            "rdkit installed under {} but not importable from this "
            "interpreter (add_python_path=False)".format(python_path))
    else:
        logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# Run the installer with its default arguments when this code is executed
# directly (as opposed to being imported as a module).
if __name__ == "__main__":
    install()

In [30]:
import pandas as pd
import numpy as np
from padelpy import from_smiles
from rdkit import Chem
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras import backend as K

In [4]:
# Colab-only step: ask the user to upload files through google.colab's helper.
# Presumably this is how combined_train.csv / combined_test.csv (read in the
# next cell) are provided. TODO confirm the expected file names.
from google.colab import files
uploaded = files.upload()

In [16]:
# Training split: SMILES strings and their labels as NumPy arrays.
train_df = pd.read_csv("combined_train.csv")
train_smiles, train_labels = (
    train_df[column].to_numpy() for column in ("smiles", "label")
)

# Test split, read the same way.
test_df = pd.read_csv("combined_test.csv")
test_smiles, test_labels = (
    test_df[column].to_numpy() for column in ("smiles", "label")
)



In [6]:
def _canonical_smiles(smiles_list):
  """Return RDKit's canonical SMILES for every entry of `smiles_list`.

  Args:
    smiles_list: iterable of SMILES strings.

  Returns:
    A list of canonical SMILES strings, in the same order as the input.

  Raises:
    ValueError: if RDKit cannot parse one of the inputs. Entries are never
      dropped silently, because that would misalign them with their labels.
  """
  canonical = []
  for idx, smile in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
      # MolFromSmiles reports a parse failure by returning None; passing that
      # on to MolToSmiles fails with an error that does not name the input.
      raise ValueError("Could not parse SMILES at index {}: {!r}".format(idx, smile))
    canonical.append(Chem.MolToSmiles(mol))
  return canonical


train_smiles_can = _canonical_smiles(train_smiles)
test_smiles_can = _canonical_smiles(test_smiles)

In [7]:
def create_bit_vector(fingerprints, n_bits=881):
  """Convert a PubChem fingerprint mapping into a dense numeric vector.

  Args:
    fingerprints: mapping with keys 'PubchemFP0' ... 'PubchemFP<n_bits-1>'
      whose values can be converted with int() (presumably the dict returned
      by padelpy's from_smiles; verify against the caller).
    n_bits: number of fingerprint bits to read. Defaults to 881, the length
      this notebook uses everywhere.

  Returns:
    1-D float64 NumPy array of length `n_bits`, where element j is the
    integer value stored under 'PubchemFP<j>'.

  Raises:
    KeyError: if one of the expected keys is missing.
    ValueError: if a value cannot be converted with int().
  """
  # dtype=float keeps the float64 output that callers copy into their matrices.
  return np.array(
    [int(fingerprints['PubchemFP' + str(bit)]) for bit in range(n_bits)],
    dtype=float,
  )


def pubchem_fingerprints(smiles_array):
  """Build the PubChem fingerprint matrix for a sequence of SMILES strings.

  Each molecule is passed to `from_smiles` (fingerprints only, no
  descriptors, 120 s timeout) and the result is turned into a row with
  `create_bit_vector`. A progress line is printed per molecule.

  Args:
    smiles_array: sequence of SMILES strings.

  Returns:
    Float NumPy array of shape (len(smiles_array), 881), one row per molecule.
  """
  n_molecules = len(smiles_array)
  matrix = np.empty((n_molecules, 881))
  for row, smiles in enumerate(smiles_array):
    print("Processing Molecule {}".format(row))
    padel_result = from_smiles(smiles, descriptors=False, fingerprints=True, timeout=120)
    matrix[row, :] = create_bit_vector(padel_result)
  return matrix

In [None]:
# Compute the 881-column fingerprint matrices for both splits. This is the slow
# step of the notebook (one from_smiles call per molecule); the next cells
# save the results to CSV and reload them.
train_input = pubchem_fingerprints(train_smiles_can)
test_input = pubchem_fingerprints(test_smiles_can)

In [107]:
# Persist the fingerprint matrices as CSV files, because computing the
# fingerprints takes a long time.
for csv_name, fingerprint_matrix in (
    ('train_input.csv', train_input),
    ('test_input.csv', test_input),
):
  np.savetxt(csv_name, fingerprint_matrix, delimiter=',')

In [15]:
# Reload the saved fingerprint matrices (the files have no header row).
train_input, test_input = (
    pd.read_csv(csv_name, header=None).to_numpy()
    for csv_name in ('train_input.csv', 'test_input.csv')
)

In [78]:
# Build a simple feed-forward neural network:
# 881 fingerprint inputs -> two hidden layers of 200 ReLU units -> one sigmoid output.

# Seed NumPy and TensorFlow before any layer is created so that the weight
# initialisation is repeatable from one run of this cell to the next.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

inputs = keras.Input(shape=(881,), name='input')
dense = layers.Dense(200, activation='relu')(inputs)
dense = layers.Dense(200, activation='relu')(dense)
outputs = layers.Dense(1, activation='sigmoid', name='prediction')(dense)

model = keras.Model(inputs=inputs, outputs=outputs, name='Simple_NN')

In [79]:
# Print the layer-by-layer architecture and parameter counts as a sanity check.
model.summary()

Model: "Simple_NN"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           [(None, 881)]             0         
_________________________________________________________________
dense_18 (Dense)             (None, 200)               176400    
_________________________________________________________________
dense_19 (Dense)             (None, 200)               40200     
_________________________________________________________________
prediction (Dense)           (None, 1)                 201       
Total params: 216,801
Trainable params: 216,801
Non-trainable params: 0
_________________________________________________________________


In [93]:
# Binary classification setup: SGD with its default settings, binary
# cross-entropy loss, and accuracy as the reported metric.
sgd_optimizer = keras.optimizers.SGD()
bce_loss = keras.losses.BinaryCrossentropy()
model.compile(optimizer=sgd_optimizer, loss=bce_loss, metrics=['accuracy'])

In [96]:
# Train for 10 epochs in mini-batches of 16, holding out 20% of the training
# arrays for validation.
# NOTE(review): calling fit() again on the same `model` presumably continues
# from its current weights. Rebuild and recompile the model first when a
# from-scratch run is wanted.
fit = model.fit(
    train_input,
    train_labels,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
    verbose=2,
)

Epoch 1/10
23/23 - 0s - loss: 0.0149 - accuracy: 1.0000 - val_loss: 0.6825 - val_accuracy: 0.8242
Epoch 2/10
23/23 - 0s - loss: 0.0145 - accuracy: 1.0000 - val_loss: 0.6867 - val_accuracy: 0.8242
Epoch 3/10
23/23 - 0s - loss: 0.0146 - accuracy: 1.0000 - val_loss: 0.6849 - val_accuracy: 0.8242
Epoch 4/10
23/23 - 0s - loss: 0.0144 - accuracy: 1.0000 - val_loss: 0.6881 - val_accuracy: 0.8242
Epoch 5/10
23/23 - 0s - loss: 0.0141 - accuracy: 1.0000 - val_loss: 0.6909 - val_accuracy: 0.8242
Epoch 6/10
23/23 - 0s - loss: 0.0139 - accuracy: 1.0000 - val_loss: 0.6955 - val_accuracy: 0.8242
Epoch 7/10
23/23 - 0s - loss: 0.0137 - accuracy: 1.0000 - val_loss: 0.6930 - val_accuracy: 0.8242
Epoch 8/10
23/23 - 0s - loss: 0.0135 - accuracy: 1.0000 - val_loss: 0.6998 - val_accuracy: 0.8242
Epoch 9/10
23/23 - 0s - loss: 0.0136 - accuracy: 1.0000 - val_loss: 0.6983 - val_accuracy: 0.8242
Epoch 10/10
23/23 - 0s - loss: 0.0133 - accuracy: 1.0000 - val_loss: 0.6981 - val_accuracy: 0.8242


In [97]:
# Score the trained model on the held-out test split; evaluate() returns the
# loss followed by the compiled metric (accuracy).
test_scores = model.evaluate(test_input, test_labels, verbose=2)
test_loss, test_accuracy = test_scores[0], test_scores[1]
print("Test loss:", test_loss)
print("Test accuracy:", test_accuracy)

7/7 - 0s - loss: 3.7229 - accuracy: 0.2828
Test loss: 3.7228574752807617
Test accuracy: 0.28282827138900757
