<a href="https://colab.research.google.com/github/ddhackiisc/code/blob/master/DILI/DILI_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install padelpy

In [None]:
import sys
import os
import requests
import subprocess
import shutil
from logging import getLogger, StreamHandler, INFO


# Module-level logger used by install(). getLogger returns the same object on
# every call, so attach the stream handler only once: re-running this cell
# would otherwise stack handlers and print each message several times.
logger = getLogger(__name__)
if not logger.handlers:
    logger.addHandler(StreamHandler())
logger.setLevel(INFO)


def _download_file(url, dest, chunk_size):
    """Stream ``url`` into the file ``dest``, always releasing the connection."""
    res = requests.get(url, stream=True)
    try:
        res.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in res.iter_content(chunk_size):
                f.write(chunk)
    finally:
        # With stream=True the connection stays checked out until the
        # response is closed, so close it even when the download fails.
        res.close()


def install(
        chunk_size=4096,
        file_name="Miniconda3-latest-Linux-x86_64.sh",
        url_base="https://repo.continuum.io/miniconda/",
        conda_path=os.path.expanduser(os.path.join("~", "miniconda")),
        rdkit_version=None,
        add_python_path=True,
        force=False):
    """Install rdkit through a fresh Miniconda and make it importable.

    ```
    import rdkit_installer
    rdkit_installer.install()
    ```

    Steps: optionally append the Miniconda ``site-packages`` directory to
    ``sys.path``; return early if an ``rdkit`` directory is already there
    (unless ``force``); delete whatever currently lives at ``conda_path``;
    download and run the Miniconda installer; ``conda install`` rdkit pinned
    to the running interpreter's exact Python version; finally import rdkit
    and log its version.

    Args:
        chunk_size: Number of bytes per chunk while streaming the installer.
        file_name: Installer script name; appended to ``url_base`` and also
            used as the temporary download path in the working directory.
        url_base: Base URL the installer is fetched from.
        conda_path: Directory Miniconda is installed into. An existing
            directory or file at this path is removed first.
        rdkit_version: If given, install ``rdkit==<rdkit_version>``;
            otherwise install ``rdkit`` without a version pin.
        add_python_path: Append the Miniconda ``site-packages`` directory to
            ``sys.path`` when it is not already present.
        force: Re-install even when rdkit is already present.

    Returns:
        None.

    Raises:
        requests.HTTPError: If the installer download returns an error status.
        subprocess.CalledProcessError: If the installer or ``conda install``
            exits with a non-zero status.
    """
    import importlib

    python_path = os.path.join(
        conda_path,
        "lib",
        "python{0}.{1}".format(*sys.version_info),
        "site-packages",
    )

    if add_python_path and python_path not in sys.path:
        logger.info("add {} to PYTHONPATH".format(python_path))
        sys.path.append(python_path)

    if os.path.isdir(os.path.join(python_path, "rdkit")):
        logger.info("rdkit is already installed")
        if not force:
            return

        logger.info("force re-install")

    url = url_base + file_name
    python_version = "{0}.{1}.{2}".format(*sys.version_info)

    logger.info("python version: {}".format(python_version))

    # Start from a clean slate: the Miniconda installer is run with -p
    # conda_path and anything already at that path is discarded.
    if os.path.isdir(conda_path):
        logger.warning("remove current miniconda")
        shutil.rmtree(conda_path)
    elif os.path.isfile(conda_path):
        logger.warning("remove {}".format(conda_path))
        os.remove(conda_path)

    logger.info('fetching installer from {}'.format(url))
    _download_file(url, file_name, chunk_size)
    logger.info('done')

    logger.info('installing miniconda to {}'.format(conda_path))
    try:
        subprocess.check_call(["bash", file_name, "-b", "-p", conda_path])
    finally:
        # The installer script is only needed for this one call; do not
        # leave it behind in the working directory.
        os.remove(file_name)
    logger.info('done')

    logger.info("installing rdkit")
    subprocess.check_call([
        os.path.join(conda_path, "bin", "conda"),
        "install",
        "--yes",
        "-c", "rdkit",
        "python=={}".format(python_version),
        "rdkit" if rdkit_version is None else "rdkit=={}".format(rdkit_version)])
    logger.info("done")

    # python_path may have been put on sys.path before the directory existed,
    # so drop any stale importer caches before the first import of rdkit.
    importlib.invalidate_caches()
    import rdkit
    logger.info("rdkit-{} installation finished!".format(rdkit.__version__))


# When this code runs as the main module, perform the installation with the
# default arguments of install().
if __name__ == "__main__":
    install()

In [95]:
import pandas as pd
import numpy as np
from padelpy import from_smiles
from rdkit import Chem

In [49]:
# google.colab is only importable inside a Colab runtime.
from google.colab import files
# Prompt for a file upload and keep the returned value in `uploaded`.
# NOTE(review): presumably the uploaded files are written to the working
# directory read by the cells below — confirm.
uploaded = files.upload()

Saving openbabel-openbabel-2-4-0.tar.gz to openbabel-openbabel-2-4-0.tar.gz


In [96]:
def _read_split(csv_path):
  """Read one split CSV and return (frame, smiles array, label array)."""
  frame = pd.read_csv(csv_path)
  return frame, frame['smiles'].to_numpy(), frame['label'].to_numpy()


# Both splits share the same layout: a 'smiles' column and a 'label' column.
train_df, train_smiles, train_labels = _read_split("combined_train.csv")
test_df, test_smiles, test_labels = _read_split("combined_test.csv")



In [97]:
def canonicalize_smiles(smiles_list):
  """Return the RDKit canonical SMILES for every entry of ``smiles_list``.

  Raises:
    ValueError: If RDKit cannot parse an entry. ``Chem.MolFromSmiles``
      returns None in that case, which would otherwise fail later inside
      ``Chem.MolToSmiles`` without saying which molecule was at fault.
  """
  canonical = []
  for idx, smile in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smile)
    if mol is None:
      raise ValueError(
          "could not parse SMILES at index {}: {!r}".format(idx, smile))
    canonical.append(Chem.MolToSmiles(mol))
  return canonical


train_smiles_can = canonicalize_smiles(train_smiles)
test_smiles_can = canonicalize_smiles(test_smiles)

In [100]:
def create_bit_vector(fingerprints, n_bits=881, prefix='PubchemFP'):
  """Collect the numbered fingerprint entries of a mapping into a 1-D array.

  Args:
    fingerprints: Mapping that contains the keys ``prefix + '0'`` up to
      ``prefix + str(n_bits - 1)``; each value must be accepted by ``int()``.
      Any other keys are ignored.
    n_bits: Number of consecutive entries to read (default 881).
    prefix: Key prefix in front of the bit index (default 'PubchemFP').

  Returns:
    A float numpy array of shape ``(n_bits,)`` holding the integer-converted
    values in index order.

  Raises:
    KeyError: If one of the expected keys is missing.
    ValueError: If a value cannot be converted by ``int()``.
  """
  return np.array(
      [int(fingerprints[prefix + str(j)]) for j in range(n_bits)],
      dtype=float,
  )


def pubchem_fingerprints(smiles_array, timeout=120):
  """Build a fingerprint matrix with one row per SMILES string.

  Each entry is passed to ``from_smiles`` with descriptors disabled and
  fingerprints enabled, and the result is converted to a row by
  ``create_bit_vector``.

  Args:
    smiles_array: Sized iterable of SMILES strings. It is walked
      positionally, so any sequence type (list, array, pandas Series) works.
    timeout: Value forwarded as ``timeout`` to ``from_smiles`` for every
      molecule (default 120).

  Returns:
    A float numpy array of shape ``(len(smiles_array), 881)``.
  """
  n_bits = 881  # must equal the length of the vector from create_bit_vector
  fingerprints = np.empty((len(smiles_array), n_bits))
  for i, smiles in enumerate(smiles_array):
    print("Processing Molecule {}".format(i))
    pubc_fingerp = from_smiles(smiles, descriptors=False, fingerprints=True, timeout=timeout)
    fingerprints[i, :] = create_bit_vector(pubc_fingerp)
  return fingerprints

In [None]:
# Compute the fingerprint matrices for both splits. pubchem_fingerprints calls
# from_smiles once per molecule, so this cell processes the SMILES one by one.
train_input = pubchem_fingerprints(train_smiles_can)
test_input = pubchem_fingerprints(test_smiles_can)

In [107]:
# Save the fingerprints as CSV files because computing them takes a lot of time.
# Every entry went through int() in create_bit_vector, so writing with '%d' is
# lossless and much smaller than savetxt's default '%.18e' float format.
np.savetxt('train_input.csv', train_input, delimiter=',', fmt='%d')
np.savetxt('test_input.csv', test_input, delimiter=',', fmt='%d')