data_loader.py

In [2]:
import deepchem as dc

In [3]:
dc.splits.splitters

<module 'deepchem.splits.splitters' from '/home/ab/deepchem/deepchem/splits/splitters.py'>

In [4]:
help(dc.splits.splitters)

Help on module deepchem.splits.splitters in deepchem.splits:

NAME
    deepchem.splits.splitters - Contains an abstract base class that supports chemically aware data splits.

CLASSES
    builtins.object
        Splitter
            ButinaSplitter
            FingerprintSplitter
            IndexSplitter
            IndiceSplitter
            MaxMinSplitter
            MolecularWeightSplitter
            RandomGroupSplitter
            RandomSplitter
            RandomStratifiedSplitter
            ScaffoldSplitter
            SingletaskStratifiedSplitter
            SpecifiedSplitter
            TimeSplitterPDBbind
    
    class ButinaSplitter(Splitter)
     |  Class for doing data splits based on the butina clustering of a bulk tanimoto
     |  fingerprint matrix.
     |  
     |  Method resolution order:
     |      ButinaSplitter
     |      Splitter
     |      builtins.object
     |  
     |  Methods defined here:
     |  
     |  split(self, dataset, frac_train=None, frac_valid

In [5]:
from deepchem.splits.splitters import ScaffoldSplitter

In [6]:
help(ScaffoldSplitter)

Help on class ScaffoldSplitter in module deepchem.splits.splitters:

class ScaffoldSplitter(Splitter)
 |  Class for doing data splits based on the scaffold of small molecules.
 |  
 |  Method resolution order:
 |      ScaffoldSplitter
 |      Splitter
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, log_every_n=1000)
 |      Splits internal compounds into train/validation/test by scaffold.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Splitter:
 |  
 |  __init__(self, verbose=False)
 |      Creates splitter object.
 |  
 |  k_fold_split(self, dataset, k, directories=None, **kwargs)
 |      Parameters
 |      ----------
 |      dataset: Dataset
 |      Dataset to do a k-fold split
 |      
 |      k: int
 |      number of folds
 |      
 |      directories: list of str
 |      list of length 2*k filepaths to save the result disk-datasets
 |     

In [8]:
from deepchem.splits.splitters import SpecifiedSplitter

In [9]:
help(SpecifiedSplitter)

Help on class SpecifiedSplitter in module deepchem.splits.splitters:

class SpecifiedSplitter(Splitter)
 |  Class that splits data according to user specification.
 |  
 |  Method resolution order:
 |      SpecifiedSplitter
 |      Splitter
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, input_file, split_field, verbose=False)
 |      Provide input information for splits.
 |  
 |  split(self, dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1, log_every_n=1000)
 |      Splits internal compounds into train/validation/test by user-specification.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Splitter:
 |  
 |  k_fold_split(self, dataset, k, directories=None, **kwargs)
 |      Parameters
 |      ----------
 |      dataset: Dataset
 |      Dataset to do a k-fold split
 |      
 |      k: int
 |      number of folds
 |      
 |      directories: list of str
 |      list of length 2*k filepaths to 

In [10]:
from deepchem.splits.splitters import IndexSplitter

In [11]:
help(IndexSplitter)

Help on class IndexSplitter in module deepchem.splits.splitters:

class IndexSplitter(Splitter)
 |  Class for simple order based splits.
 |  
 |  Method resolution order:
 |      IndexSplitter
 |      Splitter
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  split(self, dataset, seed=None, frac_train=0.8, frac_valid=0.1, frac_test=0.1, log_every_n=None)
 |      Splits internal compounds into train/validation/test in provided order.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Splitter:
 |  
 |  __init__(self, verbose=False)
 |      Creates splitter object.
 |  
 |  k_fold_split(self, dataset, k, directories=None, **kwargs)
 |      Parameters
 |      ----------
 |      dataset: Dataset
 |      Dataset to do a k-fold split
 |      
 |      k: int
 |      number of folds
 |      
 |      directories: list of str
 |      list of length 2*k filepaths to save the result disk-datasets
 |      
 |      kwargs
 |      

In [12]:
from deepchem.splits.splitters import IndiceSplitter

In [13]:
help(IndiceSplitter)

Help on class IndiceSplitter in module deepchem.splits.splitters:

class IndiceSplitter(Splitter)
 |  Class for splits based on input order.
 |  
 |  Method resolution order:
 |      IndiceSplitter
 |      Splitter
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, verbose=False, valid_indices=None, test_indices=None)
 |      Parameters
 |      -----------
 |      valid_indices: list of int
 |          indices of samples in the valid set
 |      test_indices: list of int
 |          indices of samples in the test set
 |  
 |  split(self, dataset, seed=None, frac_train=0.8, frac_valid=0.1, frac_test=0.1, log_every_n=None)
 |      Splits internal compounds into train/validation/test in designated order.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Splitter:
 |  
 |  k_fold_split(self, dataset, k, directories=None, **kwargs)
 |      Parameters
 |      ----------
 |      dataset: Dataset
 |      Datas

In [14]:
from deepchem.splits.splitters import RandomGroupSplitter

In [15]:
help(RandomGroupSplitter)

Help on class RandomGroupSplitter in module deepchem.splits.splitters:

class RandomGroupSplitter(Splitter)
 |  Abstract base class for chemically aware splits.
 |  
 |  Method resolution order:
 |      RandomGroupSplitter
 |      Splitter
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, groups, *args, **kwargs)
 |      A splitter class that splits on groupings. An example use case is when there
 |      are multiple conformations of the same molecule that share the same topology.
 |      This splitter subsequently guarantees that resulting splits preserve groupings.
 |      
 |      Note that it doesn't do any dynamic programming or something fancy to try to
 |      maximize the choice such that frac_train, frac_valid, or frac_test is maximized.
 |      It simply permutes the groups themselves. As such, use with caution if the number
 |      of elements per group varies significantly.
 |      
 |      Parameters
 |      ----------
 |      groups: array li

In [20]:
from deepchem.utils.save import load_from_disk

In [21]:
import os
import unittest
import tempfile
import shutil

In [22]:
splittype="scaffold"


In [23]:
input_transforms=[]

In [25]:
output_transforms=["normalize"]

model_params={}

In [26]:
model_params={}

In [27]:
tasks=["log-solubility"]

In [28]:
task_type="regression"

In [89]:
import pandas as pd
data=pd.read_csv("/home/ab/deepchem/deepchem/models/tests/example.csv")

In [91]:
data.head()

Unnamed: 0,Compound ID,log-solubility,smiles
0,Amigdalin,0.974,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,2.885,Cc1occc1C(=O)Nc2ccccc2
2,citral,2.579,CC(C)=CCCC(C)=CC(=O)
3,Picene,6.618,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,2.232,c1ccsc1


In [43]:
# Path to the example CSV that is featurized and split in the cells below.
# A single-argument os.path.join() returns its argument unchanged, so the
# literal is assigned directly.
input_data = "/home/ab/deepchem/deepchem/models/tests/example.csv"

In [45]:
# Featurizer: circular fingerprints built with size=1024.
featurizer=dc.feat.CircularFingerprint(size=1024)
# CSV loader configured with the task list defined above and the "smiles"
# column as the SMILES field; presumably the `tasks` columns become the
# labels — confirm against the DeepChem CSVLoader docs.
loader = dc.data.CSVLoader(tasks=tasks, smiles_field="smiles",featurizer=featurizer)

In [55]:
dataset1=loader.featurize(input_data)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from /home/ab/deepchem/deepchem/models/tests/example.csv
Loading shard 1 of size 8192.
Featurizing sample 0
TIMING: featurizing shard 0 took 0.031 s
TIMING: dataset construction took 0.042 s
Loading dataset from disk.


In [49]:
# NOTE(review): appears redundant — in the cells shown here this instance is
# never used: the scaffold splits below go through `sf`, and `splitter` is
# rebound to an IndexSplitter in cell 76.
splitter = ScaffoldSplitter()

In [61]:
sf=ScaffoldSplitter()

In [65]:
sf.split(dataset1,frac_train=0.7,frac_valid=0.1,frac_test=0.2)

([9, 8, 7, 6, 5, 4, 3], [], [2, 1, 0])

In [72]:
# NOTE(review): despite the *_dataset names, split() returns three lists of
# sample indices (see the output of cell 65), not Dataset objects.
# Presumably Splitter.train_valid_test_split() is the call that returns
# datasets — confirm against the DeepChem docs.
train_dataset,valid_dataset,test_dataset=sf.split(dataset1,frac_train=0.6,frac_valid=0.2,frac_test=0.2)

In [73]:
len(valid_dataset)

2

In [76]:
splitter=IndexSplitter()

In [77]:
splitter.split(dataset1,frac_test=0.1,frac_train=0.7,frac_valid=0.2)

(range(0, 7), range(7, 9), range(9, 10))

In [78]:
# NOTE(review): split() returns (train, valid, test) in that order — see the
# output of cell 77 — so the names below are permuted: `test_data` receives
# the train indices, `train_data` the valid indices and `valid_data` the test
# indices. Intended unpacking order: train_data, valid_data, test_data.
test_data,train_data,valid_data=splitter.split(dataset1,frac_test=0.1,frac_train=0.7,frac_valid=0.2)

In [79]:
test_data

range(0, 7)

In [80]:
print(test_data)

range(0, 7)


In [81]:
# No valid_indices/test_indices are passed (both default to None in the
# signature shown by help(IndiceSplitter)), so the split in the next cell
# puts every sample in the train set; the frac_* arguments have no visible
# effect on that result.
splitter=IndiceSplitter()

In [86]:
splitter.split(dataset1,frac_train=0.8,frac_valid=0.1,frac_test=0.1)

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [], [])

In [97]:
# NOTE(review): misleading name — this holds the 'log-solubility' column
# values, not SMILES strings (those are in data['smiles']).
smiles=data['log-solubility'].values

In [98]:
# NOTE(review): this call raises the KeyError recorded below. The second
# positional parameter is `split_field` (see help(SpecifiedSplitter)), but an
# array of values is passed here and ends up being looked up as an index key.
# Presumably `split_field` should be the name of a column in the input CSV
# that assigns each row to a split — confirm against the DeepChem source;
# example.csv as displayed in cell 91 shows no such column.
splitter = SpecifiedSplitter(input_data,smiles)

KeyError: '[0.974 2.885 2.579 6.618 2.232 2.733 6.545 4.138 4.533 5.246] not in index'