In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the package source take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import os
import pickle

import yaml
from ipyfilechooser import FileChooser

from cryo_challenge.data._validation.config_validators import (
    validate_config_mtm_analysis,
    validate_config_mtm_analysis_normalize,
    validate_config_mtm_data,
    validate_config_mtm_data_ground_truth,
    validate_config_mtm_data_mask,
    validate_config_mtm_data_submission,
    validate_input_config_mtm,
    validate_maptomap_result,
)
from cryo_challenge.data._validation.output_validators import MapToMapResultsValidator

After you preprocess your data (submission_*.pt), you can compute a map to map distance matrix, for various distance functions, by the following steps.

# 1. Create a config file
The config file (e.g. `config_files/config_map_to_map_distance_matrix.yaml`) has information about how to read the preprocessed submission and ground truth data, and how to perform the map to map distance analysis. 

Here is an example of the config file:

In [14]:
# Interactively pick the map-to-map config file (only *.yaml files are listed).
# An example of this file is available at ../config_files/config_map_to_map_distance_matrix.yaml
home_dir = os.path.expanduser("~")  # start browsing from the user's home directory
config_m2m_path = FileChooser(home_dir)
config_m2m_path.filter_pattern = "*.yaml"
display(config_m2m_path)

FileChooser(path='/mnt/home/gwoollard', filename='', title='', show_hidden=False, select_desc='Select', change…

In [15]:
# Load the YAML config chosen in the file chooser above.
# The chooser's value is None until a file has been selected; open(None) would
# fail with an opaque TypeError, so fail early with an actionable message.
if config_m2m_path.value is None:
    raise ValueError(
        "No config file selected: choose a .yaml file in the file chooser above, "
        "then re-run this cell."
    )

with open(config_m2m_path.value, "r") as file:
    config = yaml.safe_load(file)

# Show the parsed config as the cell output.
config

{'data': {'n_pix': 224,
  'psize': 2.146,
  'submission': {'fname': 'data/submission_0.pt',
   'volume_key': 'volumes',
   'metadata_key': 'populations',
   'label_key': 'id'},
  'ground_truth': {'volumes': 'data/maps_gt_flat.pt',
   'metadata': 'data/metadata.csv'},
  'mask': {'do': True, 'volume': 'data/mask_dilated_wide_224x224.mrc'}},
 'analysis': {'metrics': ['l2', 'corr', 'bioem', 'fsc'],
  'chunk_size_submission': 80,
  'chunk_size_gt': 190,
  'normalize': {'do': True, 'method': 'median_zscore'}},
 'output': 'results/map_to_map_distance_matrix_submission_0.pkl'}

The following docstrings explain what each entry of the config file means:

In [16]:
# Show the docstring for the top-level config entries (data, analysis, output).
validate_input_config_mtm?

[0;31mSignature:[0m [0mvalidate_input_config_mtm[0m[0;34m([0m[0mconfig[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the config dictionary for the MapToMap config.

data: dict, is the data part of the config.
analysis: dict, is the analysis part of the config.
output: str, is the path to the output file.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [17]:
# Show the docstring for the data.submission entries (fname, volume_key, metadata_key, label_key).
validate_config_mtm_data_submission?

[0;31mSignature:[0m [0mvalidate_config_mtm_data_submission[0m[0;34m([0m[0mconfig_data_submission[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the submission part of the config dictionary for the MapToMap config.

fname: str, is the path to the submission file (submission_*.pt).
volume_key: str, is the key in the submission file that contains the volume.
metadata_key: str, is the key in the submission file that contains the metadata.
label_key: str, is the key in the submission file that contains the (anonymizing) label.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [18]:
# Show the docstring for the data.ground_truth entries (volumes, metadata).
validate_config_mtm_data_ground_truth?

[0;31mSignature:[0m [0mvalidate_config_mtm_data_ground_truth[0m[0;34m([0m[0mconfig_data_ground_truth[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the ground truth part of the config dictionary for the MapToMap config.

volumes: str, is the path to the ground truth volume (.pt) file.
metadata: str, is the path to the ground truth metadata (.csv) file.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [19]:
# Show the docstring for the data.mask entries (do, volume).
validate_config_mtm_data_mask?

[0;31mSignature:[0m [0mvalidate_config_mtm_data_mask[0m[0;34m([0m[0mconfig_data_mask[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the mask part of the config dictionary for the MapToMap config.

do: bool, is a flag to indicate whether to use a mask.
volume: str, is the path to the mask volume (.mrc) file.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [20]:
# Show the docstring for the data entries (n_pix, psize, submission, ground_truth, mask).
validate_config_mtm_data?

[0;31mSignature:[0m [0mvalidate_config_mtm_data[0m[0;34m([0m[0mconfig_data[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the data part of the config dictionary for the MapToMap config.

n_pix: int, is the number of pixels in each dimension of the volume.
psize: float, is the pixel size of the volume in Angstroms.
submission: dict, is the submission part of the config.
ground_truth: dict, is the ground truth part of the config.
mask: dict, is the mask part of the config.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [21]:
# Show the docstring for the analysis entries (metrics, chunk sizes, normalize).
validate_config_mtm_analysis?

[0;31mSignature:[0m [0mvalidate_config_mtm_analysis[0m[0;34m([0m[0mconfig_analysis[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the analysis part of the config dictionary for the MapToMap config.

metrics: list, is a list of metrics to compute.
chunk_size_submission: int, is the chunk size for the submission volume.
chunk_size_gt: int, is the chunk size for the ground truth volume.
normalize: dict, is the normalize part of the analysis part of the config.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

In [22]:
# Show the docstring for the analysis.normalize entries (do, method).
validate_config_mtm_analysis_normalize?

[0;31mSignature:[0m [0mvalidate_config_mtm_analysis_normalize[0m[0;34m([0m[0mconfig_analysis_normalize[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the normalize part of the analysis part of the config dictionary for the MapToMap config.

do: bool, is a flag to indicate whether to normalize the volumes.
method: str, is the method to use for normalization.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

The whole config is validated in `cryo_challenge.data._validation.config_validators.validate_input_config_mtm`.

In [23]:
# Validate the whole config dict loaded above; per its signature the function returns None.
# NOTE(review): presumably it raises on an invalid config — confirm in config_validators.
validate_input_config_mtm(config)

# 2. Run `run_map2map_pipeline`

`cryo_challenge run_map2map_pipeline --config config_files/config_map_to_map_distance_matrix.yaml`

The output of this step (step #2) is validated with `cryo_challenge.data._validation.output_validators.MapToMapResultsValidator`. This validation happens automatically; it is repeated below only to serve as documentation.

In [24]:
# Locate the pipeline output named in the config, one directory above the
# working directory, and unpickle it.
# NOTE: pickle can execute arbitrary code on load; only open results files you produced or trust.
results_path = os.path.join("../", config["output"])
with open(results_path, "rb") as f:
    results_dict = pickle.load(f)

In [25]:
# Build the validator from the unpickled results; the instance is discarded (`_`)
# because the call is made only for the validation it performs.
# NOTE(review): presumably raises if the results dict is malformed — confirm in output_validators.
_ = MapToMapResultsValidator.from_dict(results_dict)

The fields of the output are explained below:

In [26]:
# Show the validator's init signature, which lists the fields of the results output.
MapToMapResultsValidator?

[0;31mInit signature:[0m
[0mMapToMapResultsValidator[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m:[0m [0mdict[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muser_submitted_populations[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorr[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ml2[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbioem[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfsc[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m-

For each map to map distance, there is a separate dict of results containing: 

In [27]:
# Show the docstring for the per-metric result dict (cost_matrix, user_submission_label, computed_assets).
validate_maptomap_result?

[0;31mSignature:[0m [0mvalidate_maptomap_result[0m[0;34m([0m[0moutput_dict[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the output dictionary of the map-to-map distance matrix computation.

cost_matrix: pd.DataFrame, is the cost matrix, with ground truth rows and submission columns.
user_submission_label: str, is the label of the submission.
computed_assets: dict, is a dictionary of computed assets, which can be re-used in other analyses.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function