In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before each
# cell runs, so edits to the package source are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import yaml
import pickle
from ipyfilechooser import FileChooser

from cryo_challenge.data import validate_input_config_disttodist, validate_config_dtd_optimal_q_kl
from cryo_challenge.data import DistributionToDistributionResultsValidator

# from cryo_challenge.data._validation.config_validators import validate_input_config_disttodist, validate_config_dtd_optimal_q_kl

After you compute the map to map distances between ground truth and your submission, you can quantify the similarity between the distributions, taking into account the population weights.

# 1. Create a config file
The config file (e.g. `config_files/config_distribution_to_distribution.yaml`) has information about how to compare the distributions, including how to pool nearby ground truth states and the hyperparameters for optimizing the submitted population weights.

Here is an example of the config file

In [None]:
# Pick the distribution-to-distribution config file with an interactive file browser.
# An example of this file is available at ../config_files/config_distribution_to_distribution.yaml
start_dir = os.path.expanduser("~")  # the browser opens in the user's home directory
config_d2d_path = FileChooser(start_dir)
config_d2d_path.filter_pattern = '*.yaml'  # restrict the listed files to YAML configs
display(config_d2d_path)

In [2]:
# Load the YAML config picked in the file chooser.
# `selected` is the full path (directory + file name); `selected_filename` is only the
# base name, so opening it fails unless the file is in the kernel's working directory.
selected_config_path = config_d2d_path.selected
if selected_config_path is None:
    # `selected` stays None until a file has been confirmed in the widget.
    raise ValueError(
        "No config file selected: choose a .yaml file in the file chooser and re-run this cell."
    )

with open(selected_config_path, "r") as file:
    # safe_load only builds plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(file)

# Display the parsed config as the cell output.
config

{'input_fname': 'results/map_to_map_distance_matrix_submission_0.pkl',
 'metrics': ['l2', 'corr', 'bioem', 'fsc'],
 'gt_metadata_fname': 'data/metadata.csv',
 'n_replicates': 30,
 'n_pool_microstate': 5,
 'replicate_fraction': 0.9,
 'cvxpy_solver': 'ECOS',
 'optimal_q_kl': {'n_iter': 100000, 'break_atol': 0.0001},
 'output_fname': 'results/distribution_to_distribution_submission_0.pkl'}

The docstrings below explain each entry of the config file.

In [4]:
# Show the validator's signature and docstring, which describes each top-level config entry.
validate_input_config_disttodist?

[0;31mSignature:[0m [0mvalidate_input_config_disttodist[0m[0;34m([0m[0mconfig[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the config dictionary.

input_fname: str, is the path to the map to map distance matrix (output from map2map_pipeline).
metrics: list, is a list of metrics to compute.
gt_metadata_fname: str, is the path to the ground truth metadata (.csv) file.
n_replicates: int, is the number of replicates to compute.
n_pool_microstate: int, is the number of microstates to pool (low values less than 3-5 can cause problems for optimization convergence in CVXPY numerical solvers).
replicate_fraction: float, is the fraction of the data to use for replicates.
cvxpy_solver: str, is the solver to use for CVXPY optimization.
optimal_q_kl: dict, is the optimal_q_kl part of the config.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_val

In [5]:
# Show the docstring for the nested `optimal_q_kl` section of the config.
validate_config_dtd_optimal_q_kl?

[0;31mSignature:[0m [0mvalidate_config_dtd_optimal_q_kl[0m[0;34m([0m[0mconfig_optimal_q_kl[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Validate the optimal_q_kl part of the config dictionary for the DistributionToDistribution config.

n_iter: int, is the number of iterations for the optimization.
break_atol: float, is the absolute tolerance for the optimization.
[0;31mFile:[0m      /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/config_validators.py
[0;31mType:[0m      function

The whole config is validated in `cryo_challenge.data._validation.config_validators.validate_input_config_disttodist`.

In [6]:
# Validate the loaded config. The function is annotated as returning None, so a silent
# cell means the check passed; presumably it raises on a missing or mistyped entry (TODO confirm).
validate_input_config_disttodist(config)

# 2. Run `run_distribution2distribution_pipeline`

`cryo_challenge run_distribution2distribution_pipeline --config config_files/config_distribution_to_distribution.yaml`

The output of the pipeline is validated with `cryo_challenge.data._validation.output_validators.DistributionToDistributionResultsValidator`. This happens automatically; the cells below only illustrate it to serve as documentation.

In [8]:
# `output_fname` from the config is resolved against the parent of the current working
# directory (presumably the repository root, one level above this notebook; verify).
results_path = os.path.join('../', config["output_fname"])

# NOTE: pickle can execute arbitrary code on load; only open results files you produced or trust.
with open(results_path, "rb") as results_file:
    results_dict = pickle.load(results_file)


In [9]:
# Build the validator from the loaded results. The instance is discarded (bound to `_`);
# only construction matters here, which presumably raises if the results are malformed (TODO confirm).
_ = DistributionToDistributionResultsValidator.from_dict(results_dict)

The fields of the output are explained below:

In [11]:
# Show the init signature of the results validator, listing the top-level output fields.
DistributionToDistributionResultsValidator?

[0;31mInit signature:[0m
[0mDistributionToDistributionResultsValidator[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mconfig[0m[0;34m:[0m [0mdict[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muser_submitted_populations[0m[0;34m:[0m [0mtorch[0m[0;34m.[0m[0mTensor[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mid[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfsc[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbioem[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0ml2[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcorr[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mdict[0m[0;34m,[0m [0mNoneType[0m[0;34

For each map to map distance, there is a dictionary of replicates

In [10]:
# Import the per-metric validator and display its signature and docstring.
from cryo_challenge.data import MetricDistToDistValidator
MetricDistToDistValidator?

[0;31mInit signature:[0m [0mMetricDistToDistValidator[0m[0;34m([0m[0mreplicates[0m[0;34m:[0m [0mdict[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Validate the output dictionary of one map to map metric in the the distribution-to-distribution pipeline.

replicates: dict, dictionary of replicates.
[0;31mFile:[0m           /mnt/ceph/users/gwoollard/repos/Cryo-EM-Heterogeneity-Challenge-1/src/cryo_challenge/data/_validation/output_validators.py
[0;31mType:[0m           type
[0;31mSubclasses:[0m     

For each replicate (subsampling the ground truth distribution), there is a separate dict of results containing: 

In [11]:
# Validators for a single replicate's results; judging by their names, one covers the
# EMD outputs and the other the KL-divergence outputs.
from cryo_challenge.data import ReplicateValidatorEMD, ReplicateValidatorKL

In [12]:
# Show the init signature, i.e. the fields stored per replicate for the EMD results.
ReplicateValidatorEMD?

[0;31mInit signature:[0m
[0mReplicateValidatorEMD[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mq_opt[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mEMD_opt[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransport_plan_opt[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mflow_opt[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mprob_opt[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mruntime_opt[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mEMD_submitted[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtransport_plan_submitted[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[

In [13]:
# Show the init signature, i.e. the fields stored per replicate for the KL-divergence results.
ReplicateValidatorKL?

[0;31mInit signature:[0m
[0mReplicateValidatorKL[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mq_opt[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mklpq_opt[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mklqp_opt[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mA[0m[0;34m:[0m [0mList[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mfloat[0m[0;34m][0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0miter_stop[0m[0;34m:[0m [0mint[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0meps_stop[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mklpq_submitted[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mklqp_submitted[0m[0;34m:[0m [0mfloat[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;32mNone[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Validate the output dictionary of one KL divergence in th