Skip to content

Commit

Permalink
Merge pull request #3038 from chertianser/master
Browse files Browse the repository at this point in the history
Fixes to k-fold fingerprint splitting
  • Loading branch information
arunppsg committed Aug 25, 2022
2 parents 6aa4e2f + bf5cd64 commit db139bf
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 136 deletions.
254 changes: 134 additions & 120 deletions deepchem/splits/splitters.py
Expand Up @@ -83,12 +83,11 @@ def k_fold_split(self,
# to k-1.
frac_fold = 1. / (k - fold)
train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
fold_inds, rem_inds, _ = self.split(
rem_dataset,
frac_train=frac_fold,
frac_valid=1 - frac_fold,
frac_test=0,
**kwargs)
fold_inds, rem_inds, _ = self.split(rem_dataset,
frac_train=frac_fold,
frac_valid=1 - frac_fold,
frac_test=0,
**kwargs)
cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
cv_datasets.append(cv_dataset)
# FIXME: Incompatible types in assignment (expression has type "Dataset", variable has type "DiskDataset")
Expand Down Expand Up @@ -156,13 +155,12 @@ def train_valid_test_split(self,
A tuple of train, valid and test datasets as dc.data.Dataset objects.
"""
logger.info("Computing train/valid/test indices")
train_inds, valid_inds, test_inds = self.split(
dataset,
frac_train=frac_train,
frac_test=frac_test,
frac_valid=frac_valid,
seed=seed,
log_every_n=log_every_n)
train_inds, valid_inds, test_inds = self.split(dataset,
frac_train=frac_train,
frac_test=frac_test,
frac_valid=frac_valid,
seed=seed,
log_every_n=log_every_n)
if train_dir is None:
train_dir = tempfile.mkdtemp()
if valid_dir is None:
Expand Down Expand Up @@ -344,14 +342,15 @@ class RandomSplitter(Splitter):
"""

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Splits internal compounds randomly into train/validation/test.
Expand Down Expand Up @@ -424,14 +423,15 @@ def __init__(self, groups: Sequence):
"""
self.groups = groups

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
"""Return indices for specified split
Parameters
Expand Down Expand Up @@ -582,8 +582,8 @@ def split(self,

index = indices[i]
set_frac = [
1 if set_target[i][task] == 0 else
set_counts[i][task] / set_target[i][task] for i in range(3)
1 if set_target[i][task] == 0 else set_counts[i][task] /
set_target[i][task] for i in range(3)
]
set_index = np.argmin(set_frac)
set_inds[set_index].append(index)
Expand Down Expand Up @@ -683,14 +683,15 @@ def k_fold_split( # type: ignore [override]
fold_datasets.append(fold_dataset)
return fold_datasets

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Splits compounds into train/validation/test using stratified sampling.
Expand Down Expand Up @@ -761,14 +762,15 @@ class IndexSplitter(Splitter):
ordered (for example).
"""

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Splits internal compounds into train/validation/test in provided order.
Parameters
Expand Down Expand Up @@ -826,14 +828,15 @@ def __init__(self,
self.valid_indices = valid_indices
self.test_indices = test_indices

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Splits internal compounds into train/validation/test in designated order.
Expand Down Expand Up @@ -889,14 +892,15 @@ class MolecularWeightSplitter(Splitter):
This class requires RDKit to be installed.
"""

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""Splits on molecular weight.
Splits internal compounds into train/validation/test using the MW
Expand Down Expand Up @@ -961,14 +965,15 @@ class MaxMinSplitter(Splitter):
This class requires RDKit to be installed.
"""

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
"""
Splits internal compounds into train/validation/test using the MaxMin diversity algorithm.
Expand Down Expand Up @@ -1023,18 +1028,16 @@ def distance(i, j):
return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

picker = MaxMinPicker()
testIndices = picker.LazyPick(
distFunc=distance,
poolSize=num_datapoints,
pickSize=num_test,
seed=seed)

validTestIndices = picker.LazyPick(
distFunc=distance,
poolSize=num_datapoints,
pickSize=num_valid + num_test,
firstPicks=testIndices,
seed=seed)
testIndices = picker.LazyPick(distFunc=distance,
poolSize=num_datapoints,
pickSize=num_test,
seed=seed)

validTestIndices = picker.LazyPick(distFunc=distance,
poolSize=num_datapoints,
pickSize=num_valid + num_test,
firstPicks=testIndices,
seed=seed)

allSet = set(range(num_datapoints))
testSet = set(testIndices)
Expand Down Expand Up @@ -1071,14 +1074,15 @@ def __init__(self, cutoff: float = 0.6):
super(ButinaSplitter, self).__init__()
self.cutoff = cutoff

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
"""
Splits internal compounds into train and validation based on the butina
clustering algorithm. This splitting algorithm has an O(N^2) run time, where N
Expand Down Expand Up @@ -1129,8 +1133,10 @@ def split(self,
for i in range(1, nfps):
sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
dists.extend([1 - x for x in sims])
scaffold_sets = Butina.ClusterData(
dists, nfps, self.cutoff, isDistData=True)
scaffold_sets = Butina.ClusterData(dists,
nfps,
self.cutoff,
isDistData=True)
scaffold_sets = sorted(scaffold_sets, key=lambda x: -len(x))

train_cutoff = frac_train * len(dataset)
Expand Down Expand Up @@ -1211,14 +1217,15 @@ def __init__(self):
"""Create a FingerprintSplitter."""
super(FingerprintSplitter, self).__init__()

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
"""
Splits compounds into training, validation, and test sets based on the
Tanimoto similarity of their ECFP4 fingerprints. This splitting algorithm
Expand Down Expand Up @@ -1265,10 +1272,10 @@ def split(self,

# Split the second group into validation and test sets.

if valid_size == 0:
if valid_size == 0 or frac_valid == 0:
valid_inds = []
test_inds = test_valid_inds
elif test_size == 0:
elif test_size == 0 or frac_test == 0:
test_inds = []
valid_inds = test_valid_inds
else:
Expand Down Expand Up @@ -1298,6 +1305,10 @@ def _split_fingerprints(fps: List, size1: int,
DataStructs.BulkTanimotoSimilarity(fps[0], remaining_fp),
[0] * len(remaining_fp)
]
# Nothing to place in the second group (size2 == 0): return every index in the first group and an empty second group.
if size2 == 0:
return ((list(range(len(fps)))), [])

while len(remaining_fp) > 0:
# Decide which group to assign a molecule to.

Expand Down Expand Up @@ -1362,14 +1373,15 @@ class ScaffoldSplitter(Splitter):
This class requires RDKit to be installed.
"""

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = 1000
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = 1000
) -> Tuple[List[int], List[int], List[int]]:
"""
Splits internal compounds into train/validation/test by scaffold.
Expand Down Expand Up @@ -1415,7 +1427,8 @@ def split(self,
train_inds += scaffold_set
return train_inds, valid_inds, test_inds

def generate_scaffolds(self, dataset: Dataset,
def generate_scaffolds(self,
dataset: Dataset,
log_every_n: int = 1000) -> List[List[int]]:
"""Returns all scaffolds from the dataset.
Expand Down Expand Up @@ -1473,14 +1486,15 @@ def __init__(self, ids: Sequence[int], year_file: Optional[str] = None):
self.ids = ids
self.year_file = year_file

def split(self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
def split(
self,
dataset: Dataset,
frac_train: float = 0.8,
frac_valid: float = 0.1,
frac_test: float = 0.1,
seed: Optional[int] = None,
log_every_n: Optional[int] = None
) -> Tuple[List[int], List[int], List[int]]:
"""
Splits protein-ligand pairs in PDBbind into train/validation/test in time order.
Expand Down

0 comments on commit db139bf

Please sign in to comment.