-
Notifications
You must be signed in to change notification settings - Fork 4
Creates a new way to test multiple selections for each protein; parallelizes LDDT script #116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
34eb10e
cfa619a
50ef437
ba249f1
51ee774
2945059
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,4 +1,5 @@ | ||
| import argparse | ||
| import itertools | ||
| import re | ||
| import sys | ||
| import traceback | ||
|
|
@@ -9,9 +10,9 @@ | |
| from atomworks.io.transforms.atom_array import ensure_atom_array_stack | ||
| from atomworks.io.utils.io_utils import load_any | ||
| from biotite.structure import AtomArray, AtomArrayStack | ||
| from joblib import delayed, Parallel | ||
| from loguru import logger | ||
| from sampleworks.eval.constants import OCCUPANCY_LEVELS | ||
| from sampleworks.eval.eval_dataclasses import ProteinConfig | ||
| from sampleworks.eval.eval_dataclasses import Experiment, ProteinConfig | ||
| from sampleworks.eval.grid_search_eval_utils import parse_args, scan_grid_search_results | ||
| from sampleworks.eval.structure_utils import get_reference_atomarraystack | ||
| from sampleworks.metrics.lddt import AllAtomLDDT | ||
|
|
@@ -175,7 +176,11 @@ def translate_selection(selection: str) -> str: | |
| # current selection strings are pymol like, and we want to convert to atomworks/pandas like | ||
| # this should be a temporary measure only until we switch to atomworks style in the RSCC script | ||
|
|
||
| logger.warning( | ||
| if any(x in selection for x in ("==", ">", "<", "<=", ">=", " in ")): | ||
| # assume this is already atomworks/pandas style and ignore. | ||
| return selection | ||
|
|
||
| DeprecationWarning( | ||
| "DEPRECATED: translate_selection converts from some pymol-like selection strings to " | ||
| "AtomWorks selection strings, but is not guaranteed to be correct for all cases." | ||
| ) | ||
|
|
@@ -215,8 +220,7 @@ def main(args: argparse.Namespace): | |
| logger.info("Pre-loading reference structures for each protein for coordinate extraction") | ||
| reference_atom_arrays = {} | ||
| for protein_key, protein_config in protein_configs.items(): | ||
| # TODO: should the reference occupancy should be specified in the config? | ||
| for occ in OCCUPANCY_LEVELS: | ||
| for occ, sel in itertools.product(args.occupancies, protein_config.selection): | ||
| ref_path, reference_proteins = get_reference_atomarraystack(protein_config, occ) | ||
| if reference_proteins is None: | ||
| logger.warning( | ||
|
|
@@ -227,63 +231,144 @@ def main(args: argparse.Namespace): | |
| logger.info( | ||
| f"Loaded ref structure for {protein_key} and occupancy {occ}: {ref_path}" | ||
| ) | ||
| # ignoring the altloc_id and occupancy arrays | ||
| reference_protein_stack, _, _ = map_altlocs_to_stack(reference_proteins) | ||
| if reference_proteins is not None: | ||
| reference_atom_arrays[(protein_key, occ)] = reference_protein_stack | ||
| reference_protein_stack, _, _ = map_altlocs_to_stack( | ||
| reference_proteins, selection=translate_selection(sel), return_full_array=True | ||
| ) | ||
| # hierarchical dictionary cache makes it lighter weight to parallelize. | ||
| if (protein_key, occ) not in reference_atom_arrays: | ||
| reference_atom_arrays[(protein_key, occ)] = {} | ||
|
|
||
| reference_atom_arrays[(protein_key, occ)][sel] = reference_protein_stack | ||
| except Exception as e: | ||
| logger.error( | ||
| f"Error loading ref structure for {protein_key} and occupancy {occ}: {e}" | ||
| ) | ||
| logger.error(f" Traceback: {traceback.format_exc()}") | ||
|
|
||
| all_results = [] | ||
| for _i, _exp in enumerate(all_experiments): | ||
| result = _exp.__dict__.copy() | ||
| if _exp.protein not in protein_configs: | ||
| # Do the quick pass through all the "rows" of our output table to filter in those we can run. | ||
| filtered_experiments = [] | ||
| null_results = [] | ||
| for _exp in all_experiments: | ||
| if _exp.protein in protein_configs: | ||
| protein = _exp.protein | ||
| elif _exp.protein.upper() in protein_configs: | ||
| protein = _exp.protein.upper() | ||
| elif _exp.protein.lower() in protein_configs: | ||
| protein = _exp.protein.lower() | ||
| else: | ||
| # These we just skip over--we assume that the user has told us via the config file | ||
| # what results they are interested in. | ||
| logger.warning(f"Skipping protein with no configuration: {_exp.protein}") | ||
| continue | ||
|
|
||
| protein_config = protein_configs[_exp.protein] | ||
| protein_config = protein_configs[protein] | ||
| if protein_config.protein != protein: | ||
| raise ValueError( | ||
| f"Protein name mismatch: expected {protein_config.protein}, got {protein}, make " | ||
| f"sure you loaded your protein configs with ProteinConfig.from_csv()." | ||
| ) | ||
|
|
||
| atom_array_key = (_exp.protein, _exp.occ_a) | ||
| if atom_array_key not in reference_atom_arrays: | ||
| if (protein, _exp.occ_a) not in reference_atom_arrays: | ||
| logger.warning( | ||
| f"Skipping {_exp.protein_dir_name}: no reference atom array stack available " | ||
| f"for {_exp.protein} and occupancy {_exp.occ_a}." | ||
| f"for {_exp.protein}, occupancy {_exp.occ_a}." | ||
| ) | ||
| # record empty results for all selections, indicating they could not be computed. | ||
| for _sel in protein_config.selection: | ||
| exp_copy = _exp.__dict__.copy() | ||
| exp_copy["selection"] = _sel | ||
| null_results.append(exp_copy) | ||
| continue | ||
|
|
||
| try: | ||
| reference_atom_array_stack = reference_atom_arrays[atom_array_key] | ||
| # these shouldn't have altlocs, don't need altloc="all". | ||
| predicted_atom_array_stack = load_any(_exp.refined_cif_path) | ||
| clustering_results = nn_lddt_clustering( | ||
| reference_atom_array_stack, | ||
| ensure_atom_array_stack(predicted_atom_array_stack), | ||
| translate_selection(protein_config.selection), | ||
| ) | ||
| protein_reference_atom_arrays = reference_atom_arrays[(protein, _exp.occ_a)] | ||
| for _sel in protein_config.selection: | ||
| if _sel not in protein_reference_atom_arrays: | ||
| logger.warning( | ||
| f"Skipping {_exp.protein_dir_name}: no reference atom array stack available " | ||
| f"for {_exp.protein}, occupancy {_exp.occ_a} and selection '{_sel}'." | ||
| ) | ||
| exp_copy = _exp.__dict__.copy() | ||
| exp_copy["selection"] = _sel | ||
| null_results.append(exp_copy) | ||
| continue | ||
|
|
||
| result.update( | ||
| { | ||
| k: clustering_results[k] | ||
| for k in ("occupancies", "avg_silhouette", "avg_silhouette_to_ref") | ||
| } | ||
| ) | ||
| except Exception as e: | ||
| logger.error(f"Error processing experiment {_exp.exp_dir}: {e}") | ||
| logger.error(f" Traceback: {traceback.format_exc()}") | ||
| result["error"] = str(e) | ||
| result["avg_silhouette"] = np.nan | ||
| result["avg_silhouette_to_ref"] = np.nan | ||
| result["occupancies"] = [] | ||
| px_seln_refernce_atom_array = protein_reference_atom_arrays[_sel] | ||
| filtered_experiments.append((_exp, protein_config, px_seln_refernce_atom_array, _sel)) | ||
|
|
||
| all_results.append(result) | ||
| # now we can more easily parallelize this loop. | ||
| logger.debug("Starting LDDT evaluation loop. This may take a while...") | ||
| all_results = Parallel(n_jobs=args.n_jobs)( | ||
| delayed(process_exp_with_selection)( | ||
| _exp, protein_config, px_seln_refernce_atom_array, selection | ||
| ) | ||
| for _exp, protein_config, px_seln_refernce_atom_array, selection in filtered_experiments | ||
| ) | ||
|
|
||
| df = pd.DataFrame(all_results) | ||
| df = pd.DataFrame(null_results + all_results) | ||
| df.to_csv(grid_search_dir / "lddt_results.csv", index=False) | ||
|
|
||
|
|
||
| def process_exp_with_selection( | ||
| exp: Experiment, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm curious - how should we name these given that we will be interacting with the diffuse hub schema? should we align how we describe experiments for ourselves both here and there?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a good question. Personally I think the way to do this is to push all the metadata into the CIF files, and then we can do away with this kind of parsing. Oh, but you're asking whether we should rename the "Experiment" class here? Yes, it would make sense to rename it. |
||
| protein_config: ProteinConfig, | ||
| px_seln_refernce_atom_array: AtomArrayStack, | ||
| selection_string: str, | ||
| ) -> dict[str, str | float | list[float]]: | ||
| """ | ||
|
|
||
| Parameters | ||
| ---------- | ||
| exp: Experiment, a description of the structure generation experiment | ||
| protein_config: ProteinConfig, specifying the locations of reference structures and maps | ||
| px_seln_refernce_atom_array: AtomArrayStack, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. typo here that would be worth not propagating I think: px_seln_refernce_atom_array -> px_seln_reference_atom_array
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! |
||
| the atom array stack for the reference structure, which in principle could be fetched | ||
| using the protein_config, but for efficiency we load once previously and pass in here, | ||
| since this method will run many times in parallel using the same structure | ||
| selection_string: str, the selection string for the evaluation | ||
|
|
||
| Returns | ||
| ------- | ||
| A dictionary of results of LDDT-based clustering that can be collated in a dataframe. | ||
| In addition to the data in the `exp` object, this dictionary contains: | ||
| - occupancies: list[float], the occupancies of the selected atoms, computed as the | ||
| fraction of structures in the experiment that are closest to one or the other | ||
| altloc of the reference structure. | ||
| - avg_silhouette: float, the average silhouette score for the LDDT-based clustering | ||
| - avg_silhouette_to_ref: float, | ||
| a sort of silhouette score, measuring each structure's relative "closeness" to the | ||
| assigned reference altloc. | ||
| """ | ||
| logger.debug(f"Evaluating selection {selection_string} for protein {protein_config}") | ||
| result = exp.__dict__.copy() | ||
| result["selection"] = selection_string | ||
|
|
||
| try: | ||
| # generated structures shouldn't have altlocs, don't need altloc="all". | ||
| predicted_atom_array_stack = load_any(exp.refined_cif_path) | ||
| clustering_results = nn_lddt_clustering( | ||
| px_seln_refernce_atom_array, | ||
| ensure_atom_array_stack(predicted_atom_array_stack), | ||
| translate_selection(selection_string), | ||
| ) | ||
|
|
||
| lddt_result_keys = ("occupancies", "avg_silhouette", "avg_silhouette_to_ref") | ||
| result.update({k: clustering_results[k] for k in lddt_result_keys}) | ||
|
|
||
| logger.info( | ||
| f"Successfully processed {exp.protein_dir_name} w/ selection {selection_string}" | ||
| ) | ||
|
|
||
| except Exception as e: | ||
| logger.error(f"Error processing experiment {exp.exp_dir}: {e}") | ||
| logger.error(f" Traceback: {traceback.format_exc()}") | ||
| result["error"] = str(e) | ||
| result["avg_silhouette"] = np.nan | ||
| result["avg_silhouette_to_ref"] = np.nan | ||
| result["occupancies"] = [] | ||
|
|
||
| return result | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| args = parse_args("Evaluate LDDT on grid search results.") | ||
| main(args) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is probably sufficient, but I am not sure. Maybe it would be better to detect the PyMOL-like selection syntax? We should also write (or use if it exists) an atomworks/pandas <-> PyMOL style translator so that it is easy to port directly into PyMOL scripts.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'll make an issue to solve this. I have a translation method but it only handles some cases. We definitely need a more complete solution.