Fixes to k-fold fingerprint splitting #3038

Merged · 3 commits · Aug 25, 2022
254 changes: 134 additions & 120 deletions deepchem/splits/splitters.py
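For orientation before the diff, a minimal usage sketch of the splitter this PR fixes. The toy NumpyDataset with SMILES ids is illustrative only, not taken from the PR:

    import numpy as np
    import deepchem as dc

    smiles = ["CCO", "CCN", "c1ccccc1", "CC(=O)O", "CCCC", "CCOC"]
    dataset = dc.data.NumpyDataset(X=np.zeros((len(smiles), 1)), ids=smiles)

    splitter = dc.splits.FingerprintSplitter()
    folds = splitter.k_fold_split(dataset, k=3)  # list of (train, cv) pairs
    for train, cv in folds:
        print(len(train), len(cv))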
@@ -83,12 +83,11 @@ def k_fold_split(self,
       # to k-1.
       frac_fold = 1. / (k - fold)
       train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
-      fold_inds, rem_inds, _ = self.split(
-          rem_dataset,
-          frac_train=frac_fold,
-          frac_valid=1 - frac_fold,
-          frac_test=0,
-          **kwargs)
+      fold_inds, rem_inds, _ = self.split(rem_dataset,
+                                          frac_train=frac_fold,
+                                          frac_valid=1 - frac_fold,
+                                          frac_test=0,
+                                          **kwargs)
       cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
       cv_datasets.append(cv_dataset)
       # FIXME: Incompatible types in assignment (expression has type "Dataset", variable has type "DiskDataset")
@@ -156,13 +155,12 @@ def train_valid_test_split(self,
       A tuple of train, valid and test datasets as dc.data.Dataset objects.
     """
     logger.info("Computing train/valid/test indices")
-    train_inds, valid_inds, test_inds = self.split(
-        dataset,
-        frac_train=frac_train,
-        frac_test=frac_test,
-        frac_valid=frac_valid,
-        seed=seed,
-        log_every_n=log_every_n)
+    train_inds, valid_inds, test_inds = self.split(dataset,
+                                                   frac_train=frac_train,
+                                                   frac_test=frac_test,
+                                                   frac_valid=frac_valid,
+                                                   seed=seed,
+                                                   log_every_n=log_every_n)
     if train_dir is None:
       train_dir = tempfile.mkdtemp()
     if valid_dir is None:
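A hedged usage sketch of train_valid_test_split as it appears after this reformat (using the random-split variant; directories fall back to temporary dirs when omitted):

    import numpy as np
    import deepchem as dc

    dataset = dc.data.NumpyDataset(X=np.random.rand(10, 4))
    train, valid, test = dc.splits.RandomSplitter().train_valid_test_split(
        dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    print(len(train), len(valid), len(test))  # 8 1 1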
@@ -344,14 +342,15 @@ class RandomSplitter(Splitter):

   """

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """
     Splits internal compounds randomly into train/validation/test.

@@ -424,14 +423,15 @@ def __init__(self, groups: Sequence):
     """
     self.groups = groups

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[List[int], List[int], List[int]]:
     """Return indices for specified split

     Parameters
@@ -582,8 +582,8 @@ def split(self,

         index = indices[i]
         set_frac = [
-            1 if set_target[i][task] == 0 else
-            set_counts[i][task] / set_target[i][task] for i in range(3)
+            1 if set_target[i][task] == 0 else set_counts[i][task] /
+            set_target[i][task] for i in range(3)
         ]
         set_index = np.argmin(set_frac)
         set_inds[set_index].append(index)
@@ -683,14 +683,15 @@ def k_fold_split( # type: ignore [override]
       fold_datasets.append(fold_dataset)
     return fold_datasets

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """
     Splits compounds into train/validation/test using stratified sampling.

@@ -761,14 +762,15 @@ class IndexSplitter(Splitter):
   ordered (for example).
   """

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Splits internal compounds into train/validation/test in provided order.

     Parameters
@@ -826,14 +828,15 @@ def __init__(self,
     self.valid_indices = valid_indices
     self.test_indices = test_indices

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """
     Splits internal compounds into train/validation/test in designated order.

@@ -889,14 +892,15 @@ class MolecularWeightSplitter(Splitter):
   This class requires RDKit to be installed.
   """

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
     """Splits on molecular weight.

     Splits internal compounds into train/validation/test using the MW
@@ -961,14 +965,15 @@ class MaxMinSplitter(Splitter):
   This class requires RDKit to be installed.
   """

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[List[int], List[int], List[int]]:
     """
     Splits internal compounds into train/validation/test using the MaxMin diversity algorithm.

@@ -1023,18 +1028,16 @@ def distance(i, j):
       return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

     picker = MaxMinPicker()
-    testIndices = picker.LazyPick(
-        distFunc=distance,
-        poolSize=num_datapoints,
-        pickSize=num_test,
-        seed=seed)
-
-    validTestIndices = picker.LazyPick(
-        distFunc=distance,
-        poolSize=num_datapoints,
-        pickSize=num_valid + num_test,
-        firstPicks=testIndices,
-        seed=seed)
+    testIndices = picker.LazyPick(distFunc=distance,
+                                  poolSize=num_datapoints,
+                                  pickSize=num_test,
+                                  seed=seed)
+
+    validTestIndices = picker.LazyPick(distFunc=distance,
+                                       poolSize=num_datapoints,
+                                       pickSize=num_valid + num_test,
+                                       firstPicks=testIndices,
+                                       seed=seed)

     allSet = set(range(num_datapoints))
     testSet = set(testIndices)
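The LazyPick calls rewrapped above come from RDKit's MaxMin diversity picker. A self-contained sketch with the same keyword arguments; the toy SMILES are illustrative:

    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker

    smiles = ["CCO", "CCCC", "c1ccccc1", "CC(=O)O", "CCN"]
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, 1024)
        for s in smiles
    ]

    def distance(i, j):
        # Same Dice-based distance the splitter uses.
        return 1 - DataStructs.DiceSimilarity(fps[i], fps[j])

    picker = MaxMinPicker()
    picks = picker.LazyPick(distFunc=distance,
                            poolSize=len(fps),
                            pickSize=2,
                            seed=42)
    print(list(picks))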
@@ -1071,14 +1074,15 @@ def __init__(self, cutoff: float = 0.6):
     super(ButinaSplitter, self).__init__()
     self.cutoff = cutoff

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[List[int], List[int], List[int]]:
     """
     Splits internal compounds into train and validation based on the Butina
     clustering algorithm. This splitting algorithm has an O(N^2) run time, where N
@@ -1129,8 +1133,10 @@ def split(self,
     for i in range(1, nfps):
       sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
       dists.extend([1 - x for x in sims])
-    scaffold_sets = Butina.ClusterData(
-        dists, nfps, self.cutoff, isDistData=True)
+    scaffold_sets = Butina.ClusterData(dists,
+                                       nfps,
+                                       self.cutoff,
+                                       isDistData=True)
     scaffold_sets = sorted(scaffold_sets, key=lambda x: -len(x))

     train_cutoff = frac_train * len(dataset)
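The ClusterData call above is RDKit's Butina clustering over a flattened lower-triangle distance list. A compact sketch mirroring the loop in the diff, on assumed toy molecules:

    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem
    from rdkit.ML.Cluster import Butina

    smiles = ["CCO", "CCCO", "c1ccccc1", "Cc1ccccc1"]
    fps = [
        AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(s), 2, 1024)
        for s in smiles
    ]

    # Flattened lower-triangle distance list, as built in the loop above.
    dists = []
    for i in range(1, len(fps)):
        sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
        dists.extend([1 - x for x in sims])

    clusters = Butina.ClusterData(dists, len(fps), 0.6, isDistData=True)
    print(sorted(clusters, key=lambda c: -len(c)))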
@@ -1211,14 +1217,15 @@ def __init__(self):
     """Create a FingerprintSplitter."""
     super(FingerprintSplitter, self).__init__()

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[List[int], List[int], List[int]]:
     """
     Splits compounds into training, validation, and test sets based on the
     Tanimoto similarity of their ECFP4 fingerprints. This splitting algorithm
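For the ECFP4/Tanimoto measure the docstring refers to, Morgan fingerprints with radius 2 are the usual RDKit equivalent; a two-molecule sketch:

    from rdkit import Chem, DataStructs
    from rdkit.Chem import AllChem

    a = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles("CCO"), 2, 2048)
    b = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles("CCN"), 2, 2048)
    print(DataStructs.TanimotoSimilarity(a, b))  # 1.0 would mean identical bit vectors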
@@ -1265,10 +1272,10 @@ def split(self,

     # Split the second group into validation and test sets.

-    if valid_size == 0:
+    if valid_size == 0 or frac_valid == 0:
       valid_inds = []
       test_inds = test_valid_inds
-    elif test_size == 0:
+    elif test_size == 0 or frac_test == 0:
       test_inds = []
       valid_inds = test_valid_inds
     else:
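The added frac_valid/frac_test checks matter because the integer set sizes are presumably derived by truncation, so the size that absorbs the rounding remainder can be nonzero even when its fraction is exactly zero. A hedged illustration of that assumed computation:

    n = 7
    frac_train, frac_valid, frac_test = 0.8, 0.2, 0.0
    train_size = int(frac_train * n)          # 5
    valid_size = int(frac_valid * n)          # 1
    test_size = n - train_size - valid_size   # 1, despite frac_test == 0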
@@ -1298,6 +1305,10 @@ def _split_fingerprints(fps: List, size1: int,
       DataStructs.BulkTanimotoSimilarity(fps[0], remaining_fp),
       [0] * len(remaining_fp)
   ]
+  # If the second set is empty, assign every fingerprint to the first set.
+  if size2 == 0:
+    return list(range(len(fps))), []
+
   while len(remaining_fp) > 0:
     # Decide which group to assign a molecule to.

@@ -1362,14 +1373,15 @@ class ScaffoldSplitter(Splitter):
   This class requires RDKit to be installed.
   """

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = 1000
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = 1000
+  ) -> Tuple[List[int], List[int], List[int]]:
     """
     Splits internal compounds into train/validation/test by scaffold.

@@ -1415,7 +1427,8 @@ def split(self,
       train_inds += scaffold_set
     return train_inds, valid_inds, test_inds

-  def generate_scaffolds(self, dataset: Dataset,
+  def generate_scaffolds(self,
+                         dataset: Dataset,
                          log_every_n: int = 1000) -> List[List[int]]:
     """Returns all scaffolds from the dataset.

@@ -1473,14 +1486,15 @@ def __init__(self, ids: Sequence[int], year_file: Optional[str] = None):
     self.ids = ids
     self.year_file = year_file

-  def split(self,
-            dataset: Dataset,
-            frac_train: float = 0.8,
-            frac_valid: float = 0.1,
-            frac_test: float = 0.1,
-            seed: Optional[int] = None,
-            log_every_n: Optional[int] = None
-           ) -> Tuple[List[int], List[int], List[int]]:
+  def split(
+      self,
+      dataset: Dataset,
+      frac_train: float = 0.8,
+      frac_valid: float = 0.1,
+      frac_test: float = 0.1,
+      seed: Optional[int] = None,
+      log_every_n: Optional[int] = None
+  ) -> Tuple[List[int], List[int], List[int]]:
     """
     Splits protein-ligand pairs in PDBbind into train/validation/test in time order.
