From fe82be178f8e0d7e4b0155e0ddda50102f1e47e4 Mon Sep 17 00:00:00 2001 From: fabrizio-credo <110419923+fabrizio-credo@users.noreply.github.com> Date: Tue, 22 Nov 2022 12:23:33 -0800 Subject: [PATCH] Docs/evaluator pages (#253) - Created evaluator parser - Restructured docs for better organization - Created how to make your evaluator documentation - Enabled docstring automatic testing --- credoai/evaluators/data_fairness.py | 3 +- credoai/evaluators/data_profiler.py | 3 +- credoai/evaluators/deepchecks.py | 8 +- credoai/evaluators/equity.py | 22 ++ credoai/evaluators/evaluator.py | 87 ++++-- credoai/evaluators/identity_verification.py | 86 +++--- credoai/evaluators/privacy.py | 3 +- credoai/evaluators/ranking_fairness.py | 3 +- credoai/evaluators/security.py | 3 +- credoai/evaluators/shap.py | 14 +- credoai/evaluators/survival_fairness.py | 4 + credoai/governance/__init__.py | 6 + docs/Makefile | 2 +- docs/README.md | 16 +- docs/autogeneration/formatter.py | 25 +- docs/autogeneration/pages/evaluators.py | 22 ++ docs/autogeneration/pages/metrics.py | 2 +- docs/conf.py | 15 +- docs/index.rst | 12 +- docs/{ => pages}/developer_guide.rst | 9 +- docs/{ => pages}/evaluators.rst | 41 ++- docs/pages/evaluators/dataequity.rst | 25 ++ docs/pages/evaluators/datafairness.rst | 28 ++ docs/pages/evaluators/dataprofiler.rst | 18 ++ docs/pages/evaluators/deepchecks.rst | 29 ++ docs/pages/evaluators/featuredrift.rst | 34 +++ .../pages/evaluators/identityverification.rst | 85 ++++++ docs/pages/evaluators/modelequity.rst | 6 + docs/pages/evaluators/modelfairness.rst | 34 +++ docs/pages/evaluators/modelprofiler.rst | 29 ++ docs/pages/evaluators/performance.rst | 29 ++ docs/pages/evaluators/privacy.rst | 9 + docs/pages/evaluators/rankingfairness.rst | 83 ++++++ docs/pages/evaluators/security.rst | 24 ++ docs/pages/evaluators/shapexplainer.rst | 60 ++++ docs/pages/evaluators/survivalfairness.rst | 6 + docs/pages/make_your_own.rst | 256 ++++++++++++++++++ docs/{ => pages}/metrics.rst | 20 +- docs/{ => pages}/setup.rst | 0 docs/{ => pages}/tutorials.rst | 6 +- docs/requirements.txt | 2 +- pytest.ini | 3 +- requirements.txt | 2 +- 43 files changed, 1045 insertions(+), 129 deletions(-) create mode 100644 docs/autogeneration/pages/evaluators.py rename docs/{ => pages}/developer_guide.rst (95%) rename docs/{ => pages}/evaluators.rst (82%) create mode 100644 docs/pages/evaluators/dataequity.rst create mode 100644 docs/pages/evaluators/datafairness.rst create mode 100644 docs/pages/evaluators/dataprofiler.rst create mode 100644 docs/pages/evaluators/deepchecks.rst create mode 100644 docs/pages/evaluators/featuredrift.rst create mode 100644 docs/pages/evaluators/identityverification.rst create mode 100644 docs/pages/evaluators/modelequity.rst create mode 100644 docs/pages/evaluators/modelfairness.rst create mode 100644 docs/pages/evaluators/modelprofiler.rst create mode 100644 docs/pages/evaluators/performance.rst create mode 100644 docs/pages/evaluators/privacy.rst create mode 100644 docs/pages/evaluators/rankingfairness.rst create mode 100644 docs/pages/evaluators/security.rst create mode 100644 docs/pages/evaluators/shapexplainer.rst create mode 100644 docs/pages/evaluators/survivalfairness.rst create mode 100644 docs/pages/make_your_own.rst rename docs/{ => pages}/metrics.rst (97%) rename docs/{ => pages}/setup.rst (100%) rename docs/{ => pages}/tutorials.rst (88%) diff --git a/credoai/evaluators/data_fairness.py b/credoai/evaluators/data_fairness.py index 4e55140f..09f892bb 100644 --- 
a/credoai/evaluators/data_fairness.py +++ b/credoai/evaluators/data_fairness.py @@ -33,7 +33,8 @@ class DataFairness(Evaluator): - """Data Fairness for Credo AI. + """ + Data Fairness for Credo AI. This evaluator performs a fairness evaluation on the dataset. Given a sensitive feature, it calculates a number of assessments: diff --git a/credoai/evaluators/data_profiler.py b/credoai/evaluators/data_profiler.py index bce94deb..b992afbb 100644 --- a/credoai/evaluators/data_profiler.py +++ b/credoai/evaluators/data_profiler.py @@ -16,7 +16,8 @@ class DataProfiler(Evaluator): - """Data profiling module for Credo AI. + """ + Data profiling module for Credo AI. This evaluator runs the pandas profiler on a data. Pandas profiler calculates a number of descriptive statistics about the data. diff --git a/credoai/evaluators/deepchecks.py b/credoai/evaluators/deepchecks.py index a163877c..28d59b98 100644 --- a/credoai/evaluators/deepchecks.py +++ b/credoai/evaluators/deepchecks.py @@ -11,7 +11,7 @@ class Deepchecks(Evaluator): """ - deepchecks evaluator + Deepchecks evaluator This evaluator enables running of deepchecks `checks` and passing the results to the Governance platform in the form of a deepchecks SuiteResult, cast to JSON format. @@ -49,7 +49,7 @@ def __init__( checks: Optional[List[BaseCheck]] = DEFAULT_CHECKS, ): super().__init__() - self.name = suite_name + self.suite_name = suite_name self.checks = checks def _setup(self): @@ -75,7 +75,7 @@ def _setup_deepchecks(self): if self.model: self.deepchecks_model = self.model.model_like - self.suite = Suite(name=self.name) + self.suite = Suite(name=self.suite_name) for check in self.checks: self.suite.add(check) # doing this as a for-loop list seems to be the only way @@ -94,7 +94,7 @@ def evaluate(self): self._setup_deepchecks() self.run_suite() - self.results = [DeepchecksContainer(self.name, self.suite_results)] + self.results = [DeepchecksContainer(self.suite_name, self.suite_results)] return self diff --git a/credoai/evaluators/equity.py b/credoai/evaluators/equity.py index 29d1d78c..ebd73810 100644 --- a/credoai/evaluators/equity.py +++ b/credoai/evaluators/equity.py @@ -318,6 +318,28 @@ def logit(x): class ModelEquity(DataEquity): + """ + Evaluates the equity of a model's predictions. + + This evaluator assesses whether model predictions are distributed equally across a sensitive + feature. Depending on the kind of outcome, different tests will be performed. + + - Discrete: chi-squared contingency tests, + followed by bonferronni corrected posthoc chi-sq tests + - Continuous: One-way ANOVA, followed by Tukey HSD posthoc tests + - Proportion (Bounded [0-1] continuous outcome): outcome is transformed to logits, then + proceed as normal for continuous + + Parameters + ---------- + use_predict_proba : bool, optional + Defines which predict method will be used, if True predict_proba will be used. + This methods outputs probabilities rather then class predictions. The availability + of predict_proba is dependent on the model under assessment. 
By default False + p_value : float, optional + The significance value to evaluate statistical tests, by default 0.01 + """ + def __init__(self, use_predict_proba=False, p_value=0.01): self.use_predict_proba = use_predict_proba super().__init__(p_value) diff --git a/credoai/evaluators/evaluator.py b/credoai/evaluators/evaluator.py index 1d004ef7..f2dd9afe 100644 --- a/credoai/evaluators/evaluator.py +++ b/credoai/evaluators/evaluator.py @@ -11,6 +11,13 @@ class Evaluator(ABC): Defines basic functions required from any evaluator object. + This class leverages the special method `__call__` to make artifacts + available in the class enclosure. + + .. automethod:: __call__ + .. automethod:: _init_artifacts + .. automethod:: _validate_arguments + .. automethod:: _setup """ def __init__(self): @@ -21,10 +28,22 @@ def __init__(self): @property def name(self): + """The name associated to the Evaluator, equals the class name.""" return self.__class__.__name__ @property def results(self): + """ + Container for all results. + + It is expected to be a list of EvidenceContainers. This is enforced in + the associated setter method. + + Raises + ------ + NotRunError + It indicates that results are missing, the evaluator was not run. + """ if self._results is not None: return self._results else: @@ -34,6 +53,7 @@ def results(self): @results.setter def results(self, results): + """Requires the results to be list of Evidence Containers""" if not isinstance(results, list): raise ValidationError("Results must be a list") for result in results: @@ -44,6 +64,19 @@ def results(self, results): @property @abstractmethod def required_artifacts(self): + """ + The required artifacts necessary for the functioning of the evaluator + + This set contains the :ref:`artifacts` that Lens can feed to + an evaluator, the accepted values are ``{"model", "assessment_data", "training_data", "data"}``. + + The string "data" means that the evaluator can be run on assessment and/or training data + (DataProfiler is an example). Lens will iterate over all the available artifacts internally. + + The set can also include the string "sensitive_feature". This is to indicate + that the evaluator depends on sensitive features. Lens will iterate over the available sensitive + features internally. + """ pass def __call__(self, **kwargs): @@ -51,32 +84,12 @@ def __call__(self, **kwargs): This method is used to pass the model, assessment_data and training_data artifacts to instantiated evaluator. - After objects are passed, it performs arguments validation and calls _setup - - >>> pipeline = Lens(model = model, assessment_data = dataset1) - - where a group of arguments shareable across multiple evaluators is passed. - This method inside a specific evaluator takes the required arguments and - makes them available to the evaluator instance. - - Requirements - ------------- - _shared_arg_assignment requires explicitly named arguments. - - Returns - ------- - self + The method is called internally by the Lens instance, which only passes the + artifacts specified in the property :meth:`required_artifacts`. - Implementation template - ----------------------- - The following code template provides an example of what the internal of this - method could look like: - - >>> self.model = kwargs['model'] - >>> self.assessment_dataset = kwargs['assessment_dataset'] - - where model and assessment_dataset are Lens() arguments. 
+ After the artifacts are passed, it performs arguments validation and calls :meth:`_setup` + At the end of these operation, the validated artifacts are available in the evaluator enclosure. """ self._init_artifacts(kwargs) self._validate_arguments() @@ -89,14 +102,23 @@ def evaluate(self): Execute any data/model processing required for the evaluator. Populates the self.results object. - - Returns - ------- - self """ return self def get_container_info(self, labels: dict = None, metadata: dict = None): + """ + Expands the base labels and metadata used to populate evidences. + + Parameters + ---------- + labels : dict, optional + The default labels can be expanded by the user when defining a new evaluator. + A label is in general any information necessary to identify evidences in the Credo AI Platform, + therefore, by default None. + metadata : dict, optional + Any extra info the user wants to associate to the evidences. Compared + to labels these are not necessary for evidence identification, by default None. + """ info = self._base_container_info() if labels: info["labels"].update(labels) @@ -105,6 +127,7 @@ def get_container_info(self, labels: dict = None, metadata: dict = None): return info def _base_container_info(self): + """Extract basic info to populate labels and metadata.""" meta = {**self.metadata, **self._get_artifacts()} labels = {"evaluator": self.name} if "dataset_type" in meta: @@ -112,6 +135,11 @@ def _base_container_info(self): return {"labels": labels, "metadata": meta} def _get_artifacts(self): + """ + Extract artifacts that will be used by the evaluator. + + The method also extract name info from the available artifacts. + """ artifacts = {} save_keys = { "model": "model_name", @@ -140,6 +168,9 @@ def _init_artifacts(self, artifacts): @abstractmethod def _setup(self): + """ + Contains any extra steps necessary to initialize the evaluator + """ pass @abstractmethod diff --git a/credoai/evaluators/identity_verification.py b/credoai/evaluators/identity_verification.py index 7d45692d..35d9bbd2 100644 --- a/credoai/evaluators/identity_verification.py +++ b/credoai/evaluators/identity_verification.py @@ -26,10 +26,11 @@ class IdentityVerification(Evaluator): - """Pair-wise-comparison-based identity verification evaluator for Credo AI + """ + Pair-wise-comparison-based identity verification evaluator for Credo AI This evaluator takes in identity verification data and - provides functionality to perform performance and fairness assessment + provides functionality to perform performance and fairness assessment Parameters ---------- @@ -65,50 +66,47 @@ class IdentityVerification(Evaluator): Example -------- - import pandas as pd - from credoai.lens import Lens - from credoai.artifacts import ComparisonData, ComparisonModel - from credoai.evaluators import IdentityVerification - - evaluator = IdentityVerification(similarity_thresholds=[60, 99]) - - pairs = pd.DataFrame({ - 'source-subject-id': ['s0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's2'], - 'source-subject-data-sample': ['s00', 's00', 's00', 's00', 's10', 's10', 's10', 's11', 's11', 's20'], - 'target-subject-id': ['s1', 's1', 's2', 's3', 's1', 's2', 's3', 's2', 's3', 's3'], - 'target-subject-data-sample': ['s10', 's11', 's20', 's30', 's11', 's20', 's30', 's20', 's30', 's30'] - }) - - subjects_sensitive_features = pd.DataFrame({ - 'subject-id': ['s0', 's1', 's2', 's3'], - 'gender': ['female', 'male', 'female', 'female'] - }) - - class FaceCompare: - # a dummy selfie comparison model - def compare(self, pairs): - 
similarity_scores = [31.5, 16.7, 20.8, 84.4, 12.0, 15.2, 45.8, 23.5, 28.5, 44.5] - return similarity_scores - - face_compare = FaceCompare() - - credo_data = ComparisonData( - name="face-data", - pairs=pairs, - subjects_sensitive_features=subjects_sensitive_features - ) - - credo_model = ComparisonModel( - name="face-compare", - model_like=face_compare - ) - - pipeline = Lens(model=credo_model, assessment_data=credo_data) - pipeline.add(evaluator) + >>> import pandas as pd + >>> from credoai.lens import Lens + >>> from credoai.artifacts import ComparisonData, ComparisonModel + >>> from credoai.evaluators import IdentityVerification + >>> evaluator = IdentityVerification(similarity_thresholds=[60, 99]) + >>> import doctest + >>> doctest.ELLIPSIS_MARKER = '-etc-' + >>> pairs = pd.DataFrame({ + ... 'source-subject-id': ['s0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's2'], + ... 'source-subject-data-sample': ['s00', 's00', 's00', 's00', 's10', 's10', 's10', 's11', 's11', 's20'], + ... 'target-subject-id': ['s1', 's1', 's2', 's3', 's1', 's2', 's3', 's2', 's3', 's3'], + ... 'target-subject-data-sample': ['s10', 's11', 's20', 's30', 's11', 's20', 's30', 's20', 's30', 's30'] + ... }) + >>> subjects_sensitive_features = pd.DataFrame({ + ... 'subject-id': ['s0', 's1', 's2', 's3'], + ... 'gender': ['female', 'male', 'female', 'female'] + ... }) + >>> class FaceCompare: + ... # a dummy selfie comparison model + ... def compare(self, pairs): + ... similarity_scores = [31.5, 16.7, 20.8, 84.4, 12.0, 15.2, 45.8, 23.5, 28.5, 44.5] + ... return similarity_scores + >>> face_compare = FaceCompare() + >>> credo_data = ComparisonData( + ... name="face-data", + ... pairs=pairs, + ... subjects_sensitive_features=subjects_sensitive_features + ... ) + >>> credo_model = ComparisonModel( + ... name="face-compare", + ... model_like=face_compare + ... ) + >>> pipeline = Lens(model=credo_model, assessment_data=credo_data) + >>> pipeline.add(evaluator) # doctest: +ELLIPSIS + -etc- + >>> pipeline.run() # doctest: +ELLIPSIS + -etc- + >>> pipeline.get_results() # doctest: +ELLIPSIS + -etc- - pipeline.run() - pipeline.get_results() """ def __init__( diff --git a/credoai/evaluators/privacy.py b/credoai/evaluators/privacy.py index 085b673f..3ca7af4f 100644 --- a/credoai/evaluators/privacy.py +++ b/credoai/evaluators/privacy.py @@ -72,7 +72,8 @@ class Privacy(Evaluator): - """Privacy module for Credo AI. + """ + Privacy module for Credo AI. This module takes in in classification model and data and provides functionality to perform privacy assessment diff --git a/credoai/evaluators/ranking_fairness.py b/credoai/evaluators/ranking_fairness.py index ca32f993..3f81232b 100644 --- a/credoai/evaluators/ranking_fairness.py +++ b/credoai/evaluators/ranking_fairness.py @@ -34,7 +34,8 @@ class RankingFairness(Evaluator): - """Ranking fairness evaluator for Credo AI + """ + Ranking fairness evaluator for Credo AI This module takes in ranking results and provides functionality to perform fairness assessment The results should include rankings, sensitive features, and optionally, scores. diff --git a/credoai/evaluators/security.py b/credoai/evaluators/security.py index 8e9f0e18..126c1abc 100644 --- a/credoai/evaluators/security.py +++ b/credoai/evaluators/security.py @@ -30,7 +30,8 @@ class Security(Evaluator): - """Security module for Credo AI. + """ + Security module for Credo AI. 
This module takes in classification model and data and provides functionality to perform security assessment diff --git a/credoai/evaluators/shap.py b/credoai/evaluators/shap.py index 2b45ff02..6a6985d0 100644 --- a/credoai/evaluators/shap.py +++ b/credoai/evaluators/shap.py @@ -17,6 +17,7 @@ class ShapExplainer(Evaluator): leveraging the SHAP package. It supports 2 types of assessments: + 1. Overall statistics of the shap values across all samples: mean and mean(|x|) 2. Individual shapley values for a list of samples @@ -28,11 +29,12 @@ class ShapExplainer(Evaluator): value, the calculation of the shap values is still performed on the full dataset. Two strategies for down sampling are provided: - 1. Random sampling (the default strategy): the amount of samples can be specified - by the user. - 2. Kmeans: summarizes a dataset with k mean centroids, weighted by the number of - data points they each represent. The amount of centroids can also be specified - by the user. + + 1. Random sampling (the default strategy): the amount of samples can be specified + by the user. + 2. Kmeans: summarizes a dataset with k mean centroids, weighted by the number of + data points they each represent. The amount of centroids can also be specified + by the user. There is no consensus on the optimal down sampling approach. For reference, see this conversation: https://github.com/slundberg/shap/issues/1018 @@ -64,6 +66,8 @@ class ShapExplainer(Evaluator): If True, use SHAP kmeans to create a data summary to serve as background data for the SHAP explainer using 50 centroids by default. If an int is provided, that will be used as the number of centroids. If False, random sampling will take place. + + """ def __init__( diff --git a/credoai/evaluators/survival_fairness.py b/credoai/evaluators/survival_fairness.py index 8c71147e..57934c93 100644 --- a/credoai/evaluators/survival_fairness.py +++ b/credoai/evaluators/survival_fairness.py @@ -12,6 +12,10 @@ class SurvivalFairness(Evaluator): + """ + Calculate Survival fairness + """ + def __init__(self, CoxPh_kwargs=None, confounds=None): if CoxPh_kwargs is None: CoxPh_kwargs = {"duration_col": "duration", "event_col": "event"} diff --git a/credoai/governance/__init__.py b/credoai/governance/__init__.py index ddb574cb..a07d0c1d 100644 --- a/credoai/governance/__init__.py +++ b/credoai/governance/__init__.py @@ -1 +1,7 @@ +""" +Direct connection to the package credoai-connect. + +The credoai_connect package handles all protocols linking a local Lens run +to CredoAI Platform. +""" from connect.governance import Governance diff --git a/docs/Makefile b/docs/Makefile index b0a3036b..f0243608 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,7 +15,7 @@ help: .PHONY: help Makefile html: - ls autogeneration/pages/*.py|xargs -n 1 -P 3 python +# ls autogeneration/pages/*.py|xargs -n 1 -P 3 python @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) # Catch-all target: route all unknown targets to Sphinx using the new diff --git a/docs/README.md b/docs/README.md index 81f2a351..7f6d184c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,9 +1,21 @@ # Documentation & Website Generation -To build the documentation locally, run `make html` from the `/docs` directory and the docs site will build to: `docs/_build/html/index.html`, which can be opened in the browser. 
+To build the documentation:, from the `/docs` folder: + +Run: +* `python autogeneration/pages/evaluators.py` -> creates evaluator pages +* `python autogeneration/pages/metrics.py` -> creates metrics page + +> Note: credoai-lens and credoai-connect needs to be installed for this to work correctly. + +After pushing to github, Read the Docs will build the documentation if the +branch is currently activated. + +For local build, on top of the previous commands, run `make html`directory and the docs site will build to: `docs/_build/html/index.html`, which can be opened in the browser. + > Make sure you have [Sphinx installed](https://www.sphinx-doc.org/en/master/usage/installation.html) if you are building the docs site locally. -Building the docs requires additional dependencies listed in `docs/requirements.txt`. +> Building the docs requires additional dependencies listed in `docs/requirements.txt`. This directory contains the content relevant to documentation & website generation using `sphinx`. The most important resource is `conf.py` which diff --git a/docs/autogeneration/formatter.py b/docs/autogeneration/formatter.py index df764c6a..8957b17e 100644 --- a/docs/autogeneration/formatter.py +++ b/docs/autogeneration/formatter.py @@ -7,6 +7,7 @@ from typing import List, Literal, Optional from pandas import DataFrame, Series, concat +from re import finditer def create_title( @@ -20,7 +21,7 @@ def create_title( raise ValueError("Unknown level.") ttl_length = len(title) - title_string = f"\n{title.capitalize()}\n{sep*ttl_length}\n" + title_string = f"\n{title.title()}\n{sep*ttl_length}\n" if hyperlink: title_string = f"\n.. _{title}:\n" + title_string @@ -80,3 +81,25 @@ def convert_df_to_table( df["sphinx"] = "\t* - " + df.sphinx return "\n".join(list(df.sphinx)) + + +def camel_case_split(identifier): + matches = finditer( + ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier + ) + return [m.group(0) for m in matches] + + +def extract_docstring_info_from_evaluator(object): + text = object.__doc__ + text = text.replace(" {4}", "") + mod_txt = [] + for line in text.splitlines(True): + if line != "\n": + mod_txt.append(line[4:]) + else: + mod_txt.append(line) + + title = " ".join(camel_case_split(object.name)) + page = create_page_area([create_title(title), "".join(mod_txt)]) + return page diff --git a/docs/autogeneration/pages/evaluators.py b/docs/autogeneration/pages/evaluators.py new file mode 100644 index 00000000..b28ce5c4 --- /dev/null +++ b/docs/autogeneration/pages/evaluators.py @@ -0,0 +1,22 @@ +from docs.autogeneration.formatter import ( + create_page_area, + create_title, + extract_docstring_info_from_evaluator, +) +from credoai.lens.pipeline_creator import build_list_of_evaluators + + +def create_all_evaluator_pages(): + all_ev = build_list_of_evaluators() + for evaluator in all_ev: + try: + doc = extract_docstring_info_from_evaluator(evaluator) + page_name = evaluator.name.lower() + with open(f"./pages/evaluators/{page_name}.rst", "w") as text_file: + text_file.write(doc) + except: + print(f"{evaluator.name} docstring not found") + + +if __name__ == "__main__": + create_all_evaluator_pages() diff --git a/docs/autogeneration/pages/metrics.py b/docs/autogeneration/pages/metrics.py index 435f6772..1defbbbe 100644 --- a/docs/autogeneration/pages/metrics.py +++ b/docs/autogeneration/pages/metrics.py @@ -60,7 +60,7 @@ def create_metrics_page(): page = create_page_area([INTRO, metrics_table, metric_info]) # Create the page - with open("./metrics.rst", "w") as text_file: + with 
open("./pages/metrics.rst", "w") as text_file: text_file.write(page) diff --git a/docs/conf.py b/docs/conf.py index 8a098521..25f05dfc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -124,6 +124,9 @@ "transformers", ] +# copy button properties +copybutton_prompt_text = r">>> |\.\.\. " +copybutton_prompt_is_regexp = True # -- Options for HTML output ------------------------------------------------- @@ -149,12 +152,14 @@ # -- Generating pages --------------------------------------------------- -from docs.autogeneration.pages.metrics import create_metrics_page +# from docs.autogeneration.pages.metrics import create_metrics_page +# from docs.autogeneration.pages.evaluators import create_all_evaluator_pages -def create_custom_pages(app): - create_metrics_page() +# def create_custom_pages(app): +# create_metrics_page() +# create_all_evaluator_pages() -def setup(app): - app.connect("builder-inited", create_custom_pages) +# def setup(app): +# app.connect("builder-inited", create_custom_pages) diff --git a/docs/index.rst b/docs/index.rst index b1724898..84b99f92 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,11 +3,11 @@ :hidden: Home page - Setup - Jupyter tutorials - Developer Guide - Evaluators - Metrics + Setup + Jupyter tutorials + Developer Guide + Evaluators + Metrics API reference <_autosummary/credoai> @@ -33,7 +33,7 @@ Check out the :ref:`quickstart tutorial ` to get started. Check out the :ref:`FAQ ` for answers to common questions. -To connect with the Credo AI Governance Platform, see the :ref:`governance integration tutorial `. +To connect with the Credo AI Governance Platform, see the :ref:`platform integration tutorial `. Overview diff --git a/docs/developer_guide.rst b/docs/pages/developer_guide.rst similarity index 95% rename from docs/developer_guide.rst rename to docs/pages/developer_guide.rst index 0f1ac3fa..4a318086 100644 --- a/docs/developer_guide.rst +++ b/docs/pages/developer_guide.rst @@ -2,6 +2,12 @@ Developer Guide =============== +.. toctree:: + :maxdepth: 1 + :hidden: + + ./make_your_own + This document will go into more detail about the different components of Lens so that you can extend it to fit your needs. @@ -37,7 +43,8 @@ Evaluator Classes inheriting ``Evaluator`` will perform any kind of assessment or function needed to understand an AI Artifacts behavior or characteristics. There are clearly a wide range of possible evaluators! Everything from assessing performance metrics to levying adversarial attacks - can be instantiated in an evaluator. + can be instantiated in an evaluator. For more information on how to create an evaluator, please visit + :ref:`How to create Evaluators` EvidenceContainers The ``EvidenceContainer`` abstract class is used to hold results of evaluators. An EvidenceContainer diff --git a/docs/evaluators.rst b/docs/pages/evaluators.rst similarity index 82% rename from docs/evaluators.rst rename to docs/pages/evaluators.rst index cad335ca..48847249 100644 --- a/docs/evaluators.rst +++ b/docs/pages/evaluators.rst @@ -1,6 +1,27 @@ Evaluators ========== +.. 
toctree:: + :maxdepth: 1 + :hidden: + + + ./evaluators/dataequity + ./evaluators/datafairness + ./evaluators/dataprofiler + ./evaluators/deepchecks + ./evaluators/featuredrift + ./evaluators/identityverification + ./evaluators/modelequity + ./evaluators/modelfairness + ./evaluators/modelprofiler + ./evaluators/performance + ./evaluators/privacy + ./evaluators/rankingfairness + ./evaluators/security + ./evaluators/shapexplainer + ./evaluators/survivalfairness + Evaluators are the classes that perform specific functions on a model and/or data. These can include assessing the model for fairness, or profiling a data. Evaluators are constantly being added to the framework, which creates Lens's standard @@ -9,7 +30,7 @@ library. Library of Evaluators --------------------- -DataEquity +:ref:`DataEquity` This evaluator assesses whether outcomes are distributed equally across a sensitive feature. Depending on the kind of outcome, different tests will be performed. @@ -19,7 +40,7 @@ DataEquity - Proportion (Bounded [0-1] continuous outcome): outcome is transformed to logits, then proceed as normal for continuous -DataFairness +:ref:`DataFairness` This evaluator performs a fairness evaluation on the dataset. Given a sensitive feature, it calculates a number of assessments: @@ -28,18 +49,18 @@ DataFairness - whether the entire dataset can be seen as a proxy for the sensitive feature (i.e., the sensitive feature is "redundantly encoded") -DataProfiling +:ref:`DataProfiling` This evaluator runs the pandas profiler on a data. Pandas profiler calculates a number of descriptive statistics about the data. -ModelFairness +:ref:`ModelFairness` This evaluator calculates performance metrics disaggregated by a sensitive feature, as well as evaluating the parity of those metrics. Handles any metric that can be calculated on a set of ground truth labels and predictions, e.g., binary classification, multiclass classification, regression. -ModelEquity +:ref:`ModelEquity` This evaluator assesses whether model outcomes (i.e., predictions) are distributed equally across a sensitive feature. Depending on the kind of outcome, different tests will be performed. @@ -49,23 +70,23 @@ ModelEquity - Proportion (Bounded [0-1] continuous outcome): outcome is transformed to logits, then proceed as normal for continuous -Performance +:ref:`Performance` This evaluator calculates overall performance metrics. Handles any metric that can be calculated on a set of ground truth labels and predictions, e.g., binary classification, multiclass classification, regression. -RankingFairness +:ref:`RankingFairness` This evaluator calculates group fairness metrics for ranking systems. This works on ranked items. If items scores data are also available and provided, it outputs a wider range of metrics. -IdentityVerification +:ref:`IdentityVerification` This evaluator performs performance and fairness assessments for identity verification systems. The identity verification system here refers to a pair-wise-comparison-based system that inputs samples of a biometric attribute (face, fingerprint, voice, etc.) and their demographics and then outputs the degree to which they represent the same person to verify their Identity. 
-Privacy +:ref:`Privacy` This evaluator calculates privacy metrics based on two adversarial attacks: - Membership inference attack: when an attacker with black-box access to a model attempts @@ -73,7 +94,7 @@ Privacy - Attribute inference attack: when an attacker attempts to learn the attacked feature from the rest of the features. -Security +:ref:`Security` This evaluator calculates security metrics based on two adversarial attacks: - Model extraction attack: when an attacker with black-box access to a model attempts to diff --git a/docs/pages/evaluators/dataequity.rst b/docs/pages/evaluators/dataequity.rst new file mode 100644 index 00000000..bc3005fa --- /dev/null +++ b/docs/pages/evaluators/dataequity.rst @@ -0,0 +1,25 @@ + +Data Equity +=========== + + +Data Equity module for Credo AI. + +This evaluator assesses whether outcomes are distributed equally across a sensitive +feature. Depending on the kind of outcome, different tests will be performed. + +- Discrete: chi-squared contingency tests, + followed by bonferronni corrected posthoc chi-sq tests +- Continuous: One-way ANOVA, followed by Tukey HSD posthoc tests +- Proportion (Bounded [0-1] continuous outcome): outcome is transformed to logits, then + proceed as normal for continuous + +Parameters +---------- +sensitive_features : pandas.Series + The segmentation feature which should be used to create subgroups to analyze. +y : (List, pandas.Series, numpy.ndarray) + Outcomes (e.g., labels for classification or target values for regression), + either ground-truth or model generated. +p_value : float + The significance value to evaluate statistical tests diff --git a/docs/pages/evaluators/datafairness.rst b/docs/pages/evaluators/datafairness.rst new file mode 100644 index 00000000..22a33299 --- /dev/null +++ b/docs/pages/evaluators/datafairness.rst @@ -0,0 +1,28 @@ + +Data Fairness +============= + + +Data Fairness for Credo AI. + +This evaluator performs a fairness evaluation on the dataset. Given a sensitive feature, +it calculates a number of assessments: + +- group differences of features +- evaluates whether features in the dataset are proxies for the sensitive feature +- whether the entire dataset can be seen as a proxy for the sensitive feature + (i.e., the sensitive feature is "redundantly encoded") + +Parameters +---------- +X : pandas.DataFrame + The features +y : pandas.Series + The outcome labels +sensitive_features : pandas.Series + A series of the sensitive feature labels (e.g., "male", "female") which should be used to create subgroups +categorical_features_keys : list[str], optional + Names of the categorical features +categorical_threshold : float + Parameter for automatically identifying categorical columns. See + `credoai.utils.common.is_categorical` diff --git a/docs/pages/evaluators/dataprofiler.rst b/docs/pages/evaluators/dataprofiler.rst new file mode 100644 index 00000000..eb9ad8c8 --- /dev/null +++ b/docs/pages/evaluators/dataprofiler.rst @@ -0,0 +1,18 @@ + +Data Profiler +============= + + +Data profiling module for Credo AI. + +This evaluator runs the pandas profiler on a data. Pandas profiler calculates a number +of descriptive statistics about the data. 
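A minimal usage sketch is shown below. It is illustrative only: the import paths mirror the pattern used by the other evaluators in this changeset, and the exact ``TabularData`` constructor arguments (as well as whether ``Lens`` can be built from a data artifact alone) are assumptions rather than confirmed API.

.. code-block:: python

    # Sketch only: TabularData's signature and a model-free Lens are assumptions;
    # see the quickstart tutorial for the canonical way to build artifacts.
    import pandas as pd

    from credoai.artifacts import TabularData
    from credoai.evaluators import DataProfiler
    from credoai.lens import Lens

    df = pd.DataFrame({"age": [25, 32, 47, 51], "income": [40, 52, 61, 58]})
    credo_data = TabularData(name="demo-data", X=df)

    pipeline = Lens(assessment_data=credo_data)
    pipeline.add(DataProfiler())  # optional profile_kwargs are passed to ProfileReport
    pipeline.run()
    profile_results = pipeline.get_results()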
+ +Parameters +---------- +X : pandas.DataFrame + The features +y : pandas.Series + The outcome labels +profile_kwargs + Passed to pandas_profiling.ProfileReport diff --git a/docs/pages/evaluators/deepchecks.rst b/docs/pages/evaluators/deepchecks.rst new file mode 100644 index 00000000..98c74f98 --- /dev/null +++ b/docs/pages/evaluators/deepchecks.rst @@ -0,0 +1,29 @@ + +Deepchecks +========== + + +Deepchecks evaluator + +This evaluator enables running of deepchecks `checks` and passing the results to +the Governance platform in the form of a deepchecks SuiteResult, cast to JSON format. +See https://docs.deepchecks.com/stable/api/generated/deepchecks.tabular.checks.model_evaluation.html +and https://docs.deepchecks.com/stable/api/generated/deepchecks.core.SuiteResult.html +and https://docs.deepchecks.com/stable/user-guide/general/customizations/examples/plot_create_a_custom_suite.html +for more details. + +This evaluator provides some redundant functionality. For instance, metrics which can be +calculated using the Performance evaluator can potentially be calculated by deepchecks +(and thus this evaluator) as well. The same applies to the FeatureDrift evaluator. +When a choice exists, the best practice dictates that the "Lens native" evaluator should +be used in preference to deepchecks, since output formats of other evaluators is generally +consistent, while this deepchecks evaluator outputs results in a highly structured JSON format. + + +Parameters +---------- +name : str, optional + Name of the supplied deepchecks suite +checks : List-like, optional + A list of instantiated deepchecks checks objects (e.g. BoostingOverfit, CalibrationScore) + #TODO allow list of strings? diff --git a/docs/pages/evaluators/featuredrift.rst b/docs/pages/evaluators/featuredrift.rst new file mode 100644 index 00000000..d6b165f8 --- /dev/null +++ b/docs/pages/evaluators/featuredrift.rst @@ -0,0 +1,34 @@ + +Feature Drift +============= + + +Measure Feature Drift using population stability index. + +This evaluator measures feature drift in: + +1. Model prediction: the prediction for the assessment dataset is compared + to the prediction for the training dataset. + In the case of classifiers, the prediction is performed with predict proba if available. + If it is not available, the prediction is treated like a categorical variable, see the + processing of categorical variables in the item below. + +2. Dataset features: 1 to 1 comparison across all features for the datasets. This is also +referred to as "characteristic stability index" (CSI). + - Numerical features are directly fed into the population_stability_index metric, and + binned according to the parameters specified at init time. + - Categorical features percentage distribution is manually calculated. The % amount of + samples per each class is calculated and then fed into the population_stability_index metric. + The percentage flag in the metric is set to True, to bypass the internal binning process. 
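The sketch below shows how this evaluator might be wired into a pipeline. It is illustrative only: the ``FeatureDrift`` import path follows the pattern of the other evaluators in this changeset, and ``credo_model``, ``credo_train`` and ``credo_test`` stand in for already-built Lens artifacts (model, training data, assessment data) such as those constructed in the quickstart tutorial.

.. code-block:: python

    # Sketch only: artifact construction is omitted; credo_model, credo_train
    # and credo_test are assumed to be Lens artifacts built as in the quickstart.
    from credoai.evaluators import FeatureDrift
    from credoai.lens import Lens

    pipeline = Lens(
        model=credo_model,
        assessment_data=credo_test,
        training_data=credo_train,
    )
    # PSI on the predictions, plus CSI across all features (csi_calculation=True)
    pipeline.add(FeatureDrift(buckets=10, buckettype="bins", csi_calculation=True))
    pipeline.run()
    drift_results = pipeline.get_results()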
+ + +Parameters +---------- +buckets : int, optional + Number of buckets to consider to bin the predictions, by default 10 +buckettype : Literal["bins", "quantiles"] + Type of strategy for creating buckets, bins splits into even splits, + quantiles splits into quantiles buckets, by default "bins" +csi_calculation : bool, optional + Calculate characteristic stability index, i.e., PSI for all features in the datasets, + by default False diff --git a/docs/pages/evaluators/identityverification.rst b/docs/pages/evaluators/identityverification.rst new file mode 100644 index 00000000..7850d481 --- /dev/null +++ b/docs/pages/evaluators/identityverification.rst @@ -0,0 +1,85 @@ + +Identity Verification +===================== + + +Pair-wise-comparison-based identity verification evaluator for Credo AI + +This evaluator takes in identity verification data and +provides functionality to perform performance and fairness assessment + +Parameters +---------- +pairs : pd.DataFrame of shape (n_pairs, 4) + Dataframe where each row represents a data sample pair and associated subjects + Type of data sample is decided by the ComparisonModel's `compare` function, which takes + data sample pairs and returns their similarity scores. Examples are selfies, fingerprint scans, + or voices of a person. + Required columns: + source-subject-id: unique identifier of the source subject + source-subject-data-sample: data sample from the source subject + target-subject-id: unique identifier of the target subject + target-subject-data-sample: data sample from the target subject +subjects_sensitive_features : pd.DataFrame of shape (n_subjects, n_sensitive_feature_names), optional + Sensitive features of all subjects present in pairs dataframe + If provided, disaggregated performance assessment is also performed. + This can be the columns you want to perform segmentation analysis on, or + a feature related to fairness like 'race' or 'gender' + Required columns: + subject-id: id of subjects. Must cover all the subjects inlcluded in `pairs` dataframe + other columns with arbitrary names for sensitive features +similarity_thresholds : list + list of similarity score thresholds + Similarity equal or greater than a similarity score threshold means match +comparison_levels : list + list of comparison levels. Options: + sample: it means a match is observed for every sample pair. Sample-level comparison represent + a use case where only two samples (such as a real time selfie and stored ID image) are + used to confirm an identity. + subject: it means if any pairs of samples for the same subject are a match, the subject pair + is marked as a match. Some identity verification use cases improve overall accuracy by storing + multiple samples per identity. Subject-level comparison mirrors this behavior. + +Example +-------- + +>>> import pandas as pd +>>> from credoai.lens import Lens +>>> from credoai.artifacts import ComparisonData, ComparisonModel +>>> from credoai.evaluators import IdentityVerification +>>> evaluator = IdentityVerification(similarity_thresholds=[60, 99]) +>>> import doctest +>>> doctest.ELLIPSIS_MARKER = '-etc-' +>>> pairs = pd.DataFrame({ +... 'source-subject-id': ['s0', 's0', 's0', 's0', 's1', 's1', 's1', 's1', 's1', 's2'], +... 'source-subject-data-sample': ['s00', 's00', 's00', 's00', 's10', 's10', 's10', 's11', 's11', 's20'], +... 'target-subject-id': ['s1', 's1', 's2', 's3', 's1', 's2', 's3', 's2', 's3', 's3'], +... 
'target-subject-data-sample': ['s10', 's11', 's20', 's30', 's11', 's20', 's30', 's20', 's30', 's30'] +... }) +>>> subjects_sensitive_features = pd.DataFrame({ +... 'subject-id': ['s0', 's1', 's2', 's3'], +... 'gender': ['female', 'male', 'female', 'female'] +... }) +>>> class FaceCompare: +... # a dummy selfie comparison model +... def compare(self, pairs): +... similarity_scores = [31.5, 16.7, 20.8, 84.4, 12.0, 15.2, 45.8, 23.5, 28.5, 44.5] +... return similarity_scores +>>> face_compare = FaceCompare() +>>> credo_data = ComparisonData( +... name="face-data", +... pairs=pairs, +... subjects_sensitive_features=subjects_sensitive_features +... ) +>>> credo_model = ComparisonModel( +... name="face-compare", +... model_like=face_compare +... ) +>>> pipeline = Lens(model=credo_model, assessment_data=credo_data) +>>> pipeline.add(evaluator) # doctest: +ELLIPSIS +-etc- +>>> pipeline.run() # doctest: +ELLIPSIS +-etc- +>>> pipeline.get_results() # doctest: +ELLIPSIS +-etc- + diff --git a/docs/pages/evaluators/modelequity.rst b/docs/pages/evaluators/modelequity.rst new file mode 100644 index 00000000..1fdba839 --- /dev/null +++ b/docs/pages/evaluators/modelequity.rst @@ -0,0 +1,6 @@ + +Model Equity +============ + + +Calculates Equity for model predictions. diff --git a/docs/pages/evaluators/modelfairness.rst b/docs/pages/evaluators/modelfairness.rst new file mode 100644 index 00000000..01094921 --- /dev/null +++ b/docs/pages/evaluators/modelfairness.rst @@ -0,0 +1,34 @@ + +Model Fairness +============== + + +Model Fairness evaluator for Credo AI. + +This evaluator calculates performance metrics disaggregated by a sensitive feature, as +well as evaluating the parity of those metrics. + +Handles any metric that can be calculated on a set of ground truth labels and predictions, +e.g., binary classification, multiclass classification, regression. + + +Parameters +---------- +metrics : List-like + list of metric names as string or list of Metrics (credoai.metrics.Metric). + Metric strings should in list returned by credoai.modules.list_metrics. + Note for performance parity metrics like + "false negative rate parity" just list "false negative rate". Parity metrics + are calculated automatically if the performance metric is supplied +sensitive_features : pandas.DataFrame + The segmentation feature(s) which should be used to create subgroups to analyze. +y_true : (List, pandas.Series, numpy.ndarray) + The ground-truth labels (for classification) or target values (for regression). +y_pred : (List, pandas.Series, numpy.ndarray) + The predicted labels for classification +y_prob : (List, pandas.Series, numpy.ndarray), optional + The unthresholded predictions, confidence values or probabilities. +method : str, optional + How to compute the differences: "between_groups" or "to_overall". + See fairlearn.metrics.MetricFrame.difference + for details, by default 'between_groups' diff --git a/docs/pages/evaluators/modelprofiler.rst b/docs/pages/evaluators/modelprofiler.rst new file mode 100644 index 00000000..7932d143 --- /dev/null +++ b/docs/pages/evaluators/modelprofiler.rst @@ -0,0 +1,29 @@ + +Model Profiler +============== + + +Model profiling evaluator. + +This evaluator builds a model card the purpose of which is to characterize +a fitted model. + +The overall strategy is: + 1. Extract all potentially useful info from the model itself in an + automatic fashion. + 2. Allow the user to personalize the model card freely. 
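As a hedged illustration of step 2, user-supplied information is passed through the ``model_info`` argument (the ``ModelProfiler`` class name and the example keys below are assumptions, not confirmed API):

.. code-block:: python

    # Sketch only: the keys below are hypothetical placeholders;
    # values must be basic Python types.
    from credoai.evaluators import ModelProfiler

    my_model_info = {
        "intended_use": "pre-screening of loan applications",
        "limitations": "not validated on data collected after 2021",
    }
    profiler = ModelProfiler(model_info=my_model_info)
    # profiler is then added to a Lens pipeline like any other evaluator.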
+ +The method generate_template() provides a dictionary with several entries the +user could be interested in filling up. + +Parameters +---------- +model_info : Optional[dict] + Information provided by the user that cannot be inferred by + the model itself. The dictionary con contain any number of elements, + a template can be provided by running the generate_template() method. + + The only restrictions are checked in a validation step: + 1. Some keys are protected because they are used internally + 2. Only basic python types are accepted as values + diff --git a/docs/pages/evaluators/performance.rst b/docs/pages/evaluators/performance.rst new file mode 100644 index 00000000..674b3e62 --- /dev/null +++ b/docs/pages/evaluators/performance.rst @@ -0,0 +1,29 @@ + +Performance +=========== + + +Performance evaluator for Credo AI. + +This evaluator calculates overall performance metrics. +Handles any metric that can be calculated on a set of ground truth labels and predictions, +e.g., binary classification, multiclass classification, regression. + +This module takes in a set of metrics and provides functionality to: +- calculate the metrics +- create disaggregated metrics + +Parameters +---------- +metrics : List-like + list of metric names as strings or list of Metric objects (credoai.modules.metrics.Metric). + Metric strings should in list returned by credoai.modules.metric_utils.list_metrics(). + Note for performance parity metrics like + "false negative rate parity" just list "false negative rate". Parity metrics + are calculated automatically if the performance metric is supplied +y_true : (List, pandas.Series, numpy.ndarray) + The ground-truth labels (for classification) or target values (for regression). +y_pred : (List, pandas.Series, numpy.ndarray) + The predicted labels for classification +y_prob : (List, pandas.Series, numpy.ndarray), optional + The unthresholded predictions, confidence values or probabilities. diff --git a/docs/pages/evaluators/privacy.rst b/docs/pages/evaluators/privacy.rst new file mode 100644 index 00000000..53793f7d --- /dev/null +++ b/docs/pages/evaluators/privacy.rst @@ -0,0 +1,9 @@ + +Privacy +======= + + +Privacy module for Credo AI. + +This module takes in in classification model and data and provides functionality + to perform privacy assessment diff --git a/docs/pages/evaluators/rankingfairness.rst b/docs/pages/evaluators/rankingfairness.rst new file mode 100644 index 00000000..85c583b4 --- /dev/null +++ b/docs/pages/evaluators/rankingfairness.rst @@ -0,0 +1,83 @@ + +Ranking Fairness +================ + + +Ranking fairness evaluator for Credo AI + +This module takes in ranking results and provides functionality to perform fairness assessment + The results should include rankings, sensitive features, and optionally, scores. + +skew_parity_difference: max_skew - min_skew, where skew is the proportion of the selected + items from a group over the desired proportion for that group. + It ranges from 0 to inf and the ideal value is 0. +ndkl: is a metric that accounts for increasing ranks. It is non-negative, with larger values + indicating a greater divergence between the desired and actual distributions of + sensitive attribute labels. + It ranges from 0 to inf and the ideal value is 0. +demographic_parity_ratio: min_selection_rate / max_selection_rate, where selection rate + is the proportion of the selected items from a group over the number of items for + that group in the pool. + It ranges from 0 to 1 and ideal value is 1. 
+balance_ratio: min_presence / max_presence, where presence is the number of the selected items + from a group. + It ranges from 0 to 1 and ideal value is 1. +qualified_demographic_parity_ratio: demographic_parity_ratio but with a qualified (i.e., score + greater than or equal to q) filter applied to the items. + It ranges from 0 to 1 and ideal value is 1. +qualified_balance_ratio: balance_ratio but with a qualified (i.e., score greater than or equal + to q) filter applied to the items. + It ranges from 0 to 1 and ideal value is 1. +calibrated_demographic_parity_ratio: demographic_parity_ratio but with the selected set from + specified score bins. This is to audit if items with similiar scores are are treated similarly + (via proportional presence) regardless of group membership. + It ranges from 0 to 1 and ideal value is 1. +calibrated_balance_ratio: balance_ratio but with the selected set from + specified score bins. This is to audit if items with similiar scores are are treated similarly + (via equal presence) regardless of group membership. + It ranges from 0 to 1 and ideal value is 1. +relevance_parity_ratio: to audit if groups are represented proportional to their average score + (i.e., score-based relevance) + It ranges from 0 to 1 and ideal value is 1. +score_parity_ratio: min_average_Score / max_average_Score, where average score + is the average score of the selected items from a group. + It ranges from 0 to 1 and ideal value is 1. +score_balance_ratio: min_total_Score / max_total_Score, where total score + is the total score of the selected items from a group. + It ranges from 0 to 1 and ideal value is 1. +score_empirical_distribution: score empirical distributions for each demographic group as tables + The x axis is scores and the y axis is cumulative probabilities (ranges from 0 to 1) + It is useful for a visual examination of the distribution of scores for the different groups. + +Parameters +---------- +sensitive_features : pandas.Series + A series of the sensitive feature labels (e.g., "male", "female") which should + be used to create subgroups +rankings : pandas.Series of type int + The computed ranks + It should be passed to TabularData's y argument with the column name `rankings` +scores : pandas.Series of type int or float, Optional + A series of the scores + It should be passed to TabularData's y argument with the column name `scores` +k: int, Optional + The top k items are considered as the selected subset + If not provided, the top 50% of the items are considered as selected +q: float, Optional + The relevance score for which items in the pool that have score >= q are "relevant". + These two metrics require this to be provided: `qualified_demographic_parity_ratio` + and `qualified_balance_ratio` +lb_bin: numpy array of shape = (n_bins), Optional + The lower bound scores for each bin (bin is greater than or equal to lower bound). + These two metrics require this to be provided: `calibrated_demographic_parity_ratio` + and `calibrated_balance_ratio` +ub_bin: numpy array of shape = (n_bins), Optional + The upper bound scores for each bin (bin is less than upper bound). 
+ These two metrics require this to be provided: `calibrated_demographic_parity_ratio` + and `calibrated_balance_ratio` +desired_proportions: dict, Optional + The desired proportion for each subgroups (e.g., {"male":0.4, "female":0.6}) + If not provided, equal proportions are used for calculation of `skew` score +down_sampling_step : int, optional + down-sampling step for scores empirical distribution curve + If not provided, down-sampling is done such that the curve length be nearly 100 diff --git a/docs/pages/evaluators/security.rst b/docs/pages/evaluators/security.rst new file mode 100644 index 00000000..74ee600b --- /dev/null +++ b/docs/pages/evaluators/security.rst @@ -0,0 +1,24 @@ + +Security +======== + + +Security module for Credo AI. + +This module takes in classification model and data and + provides functionality to perform security assessment + +Parameters +---------- +model : model + A trained binary or multi-class classification model + The only requirement for the model is to have a `predict` function that returns + predicted classes for a given feature vectors as a one-dimensional array. +x_train : pandas.DataFrame + The training features +y_train : pandas.Series + The training outcome labels +x_test : pandas.DataFrame + The test features +y_test : pandas.Series + The test outcome labels diff --git a/docs/pages/evaluators/shapexplainer.rst b/docs/pages/evaluators/shapexplainer.rst new file mode 100644 index 00000000..5afc12e7 --- /dev/null +++ b/docs/pages/evaluators/shapexplainer.rst @@ -0,0 +1,60 @@ + +Shap Explainer +============== + + +This evaluator perform the calculation of shapley values for a dataset/model, +leveraging the SHAP package. + +It supports 2 types of assessments: + +1. Overall statistics of the shap values across all samples: mean and mean(|x|) +2. Individual shapley values for a list of samples + +Sampling +-------- +In order to speed up computation time, at the stage in which the SHAP explainer is +initialized, a down sampled version of the dataset is passed to the `Explainer` +object as background data. This is only affecting the calculation of the reference +value, the calculation of the shap values is still performed on the full dataset. + +Two strategies for down sampling are provided: + +1. Random sampling (the default strategy): the amount of samples can be specified + by the user. +2. Kmeans: summarizes a dataset with k mean centroids, weighted by the number of + data points they each represent. The amount of centroids can also be specified + by the user. + +There is no consensus on the optimal down sampling approach. For reference, see this +conversation: https://github.com/slundberg/shap/issues/1018 + + +Categorical variables +--------------------- +The interpretation of the results for categorical variables can be more challenging, and +dependent on the type of encoding utilized. Ordinal or one/hot encoding can be hard to +interpret. + +There is no agreement as to what is the best strategy as far as categorical variables are +concerned. A good discussion on this can be found here: https://github.com/slundberg/shap/issues/451 + +No restriction on feature type is imposed by the evaluator, so user discretion in the +interpretation of shap values for categorical variables is advised. + + +Parameters +---------- +samples_ind : Optional[List[int]], optional + List of row numbers representing the samples for which to extract individual + shapley values. This must be a list of integer indices. 
The underlying SHAP + library does not support non-integer indexing. +background_samples: int, + Amount of samples to be taken from the dataset in order to build the reference values. + See documentation about sampling above. Unused if background_kmeans is not False. +background_kmeans : Union[bool, int], optional + If True, use SHAP kmeans to create a data summary to serve as background data for the + SHAP explainer using 50 centroids by default. If an int is provided, + that will be used as the number of centroids. If False, random sampling will take place. + + diff --git a/docs/pages/evaluators/survivalfairness.rst b/docs/pages/evaluators/survivalfairness.rst new file mode 100644 index 00000000..f5c7bd91 --- /dev/null +++ b/docs/pages/evaluators/survivalfairness.rst @@ -0,0 +1,6 @@ + +Survival Fairness +================= + + +Calculate Survival fairness diff --git a/docs/pages/make_your_own.rst b/docs/pages/make_your_own.rst new file mode 100644 index 00000000..321ae947 --- /dev/null +++ b/docs/pages/make_your_own.rst @@ -0,0 +1,256 @@ +######################## +How to create Evaluators +######################## + +Evaluators are the core components of the Lens framework. Each evaluator performs a specific type +of assessment such as performance, fairness, privacy, etc... You can find the list of all evaluators +in the page :ref:`Library of Evaluators`. + +The following documents goes through: + +1. :ref:`Core methods for the class` +2. :ref:`Evaluator class organization` +3. :ref:`Docstring style guide` +4. :ref:`TLTR: A condensed summary` + +In order to understand the structure of evaluators, it is important to understand how they +are used within :class:`~.credoai.lens.lens.Lens`. A typical example of how Lens consumes evaluator +can be the following: + +.. code-block:: + + lens = Lens(model=credo_model, assessment_data=credo_data) + lens.add(Performance(metrics = 'false positive rate')) + +In this snippet of code (an extract of the notebook :ref:`quickstart`) , Lens is initiated with a model and assessment_data, +and a :class:`~.credoai.evaluators.performance.Performance` evaluator is added to the lens pipeline. +The Performance evaluator is also initiated, in this case the parameter `metrics` is provided. +The objects credo_data and credo_model are example of :mod:`~.credoai.artifacts`. + +Evaluators consume artifacts, they act on models/data in order to perform assessments. Multiple evaluators can be +added to a single Lens run. Each of them is an independent object, the assessment performed by one evaluator do not have +any effect on the results of the others. + +The life cycle of an evaluator is the following: + +#. The evaluator gets initialized with any necessary (and/or optional) parameter required for the evaluation. + In the snippet above the parameter `metrics` is necessary for the Performance evaluator to function. + +#. The initialized evaluator is added to a Lens instance. + +#. Based on the evaluator property :attr:`~.credoai.evaluators.evaluator.Evaluator.required_artifacts`, + Lens provides to the evaluator the artifacts necessary for its working. A Lens instance can be initialized + with only three artifacts: a model, training data and assessment data. An evaluator can potentially require + any combination of artifacts. + +#. A validation step is performed, by the evaluator, to make sure that the artifacts are structured + in the correct way in order for the assessment to take place correctly. 
+   If the validation fails, Lens communicates to the user the reason behind the failure, and no
+   assessment from that specific evaluator takes place.
+
+#. If the validation is successful, the evaluator performs the assessment.
+
+
+The steps above are performed by specific methods shared by all evaluators. In the next section we will explore the way
+these methods are built and organized.
+
+**********************
+Core evaluator methods
+**********************
+
+The full API reference for the ``Evaluator`` class can be found :ref:`on this page`.
+The abstract class defines the main methods necessary for the functioning of an evaluator
+within the Lens framework.
+
+Mirroring the steps listed in the previous section, the methods and properties the user will have to define
+are the following:
+
+#. :meth:`~.credoai.evaluators.evaluator.Evaluator.__init__` -> Standard class initialization; the particularity
+   is that there is no need to specify any of the Lens artifacts (model, training data, assessment data). Lens
+   handles the passing of the artifacts to the evaluator.
+
+#. :attr:`~.credoai.evaluators.evaluator.Evaluator.required_artifacts` -> The strings contained in this set
+   establish which artifacts Lens will try to pass to the evaluator; these artifacts will be made available
+   in the `self` object of the evaluator.
+   The accepted string values are:
+
+   * ``"model"``: This means the evaluator requires a model. Accessible as ``self.model``
+   * ``"assessment_data"``: This represents any dataset used to perform the assessment. Accessible as ``self.assessment_data``
+   * ``"training_data"``: This represents a dataset used to perform model training during assessment time.
+     Accessible as ``self.training_data``
+   * ``"data"``: This means that the evaluator can work on any generic dataset. If both training and assessment
+     data are available, Lens will run the evaluator on each separately. Accessible as ``self.data``.
+   * ``"sensitive_feature"``: This is a special value; it represents a dependence of the evaluator on sensitive
+     features, as intended in the context of Responsible AI. In case multiple sensitive features
+     are available, Lens will run the evaluator on each separately. Accessible as ``self.sensitive_feature``
+
+#. :meth:`~.credoai.evaluators.evaluator.Evaluator._validate_arguments` -> Any validation on the format and content
+   of the required artifacts will be performed in this method. The module :mod:`~.credoai.evaluators.utils.validation`
+   contains several pre-made utility functions that can aid the user in creating their validity checks.
+
+#. :meth:`~.credoai.evaluators.evaluator.Evaluator._setup` -> This method is supposed to contain any extra steps necessary
+   to complete the initialization. It is introduced because the required artifacts are made available at a later
+   time than when the evaluator class is initialized.
+
+.. important::
+
+   The methods ``_validate_arguments()`` and ``_setup()``, together with the passing of the artifacts, are handled
+   programmatically by Lens. The user must not explicitly call them from within the evaluator. For the interested
+   reader, this part of the automation is handled by Lens via the method :meth:`~.credoai.evaluators.evaluator.Evaluator.__call__`.
+
+5. :meth:`~.credoai.evaluators.evaluator.Evaluator.evaluate()` -> This is the method that effectively runs
+   the whole evaluation procedure.
+   The user is free to structure the running of the evaluation as preferred; there are no restrictions
+   on the number of methods. However, the method ``evaluate`` needs to be used to **run** the whole
+   procedure. This is the method that Lens references internally.
+
+   The ``evaluate()`` method populates the property :attr:`~.credoai.evaluators.evaluator.Evaluator.results`; this property
+   can only accept a list of `evidence containers `_.
+
+****************
+Evaluator schema
+****************
+
+This section deals with best practices in the organization of an evaluator class. This is a list
+of principles that aim to make the structure of evaluators consistent, easy to interpret, and easy to debug.
+
+In general we strive to follow `Python PEP8 `_ guidelines.
+Specific to evaluators, these are the main directives:
+
+#. All evaluators inherit from the class :class:`~.credoai.evaluators.evaluator.Evaluator`.
+#. Evaluator class naming is in CamelCase, consistent with Python best practices for classes.
+#. Immediately after the class name, a docstring describing the purpose of the evaluator and any
+   parameters necessary at ``init`` time is inserted. For more info on the docstring structure, please
+   refer to the next section.
+#. Immediately after the docstring, these methods/properties (enforced by the abstract class) follow in this order:
+
+   #. ``__init__``: The last line of this method is ``super().__init__()``. The invocation of the abstract
+      class init method is necessary to initialize some class properties.
+   #. ``required_artifacts``: this is defined as a property of the class, outside of ``__init__``
+   #. ``_validate_arguments``
+   #. ``_setup``
+   #. ``evaluate``
+
+#. The ``evaluate`` method is meant to be as simple as possible. Ideally the majority of the complexity is organized
+   into a suitable number of private methods.
+#. By the end of the ``evaluate`` method, the property ``self.results`` needs to be populated with the results
+   of the assessment.
+#. Any other method created by the user to structure the code can come after ``evaluate``. The only other recommendation
+   is for static methods to be put after any non-static method.
+
+*********************
+Docstring style guide
+*********************
+
+This is a general style guide for any method docstring. In particular, the evaluator class docstring will be used to
+create an evaluator-specific page in :ref:`evaluators`, so following the guidelines will ensure that the page
+is displayed correctly.
+
+The following settings are generally applied to any docstring. Modern IDEs generally allow you to configure
+how docstrings are populated.
+
+- Format: **numpy**
+- Quote style: *"""* (3 x double quotes)
+- Start on new line: True -> This forces the docstring to not start in line with the first
+  3 double quotes. This setting is necessary for the docs page to be visualized correctly.
+
+The default format for the text content within the docstring follows the `sphinx restructured text `_
+conventions. Below is an example of what a typical docstring could look like:
+
+.. code-block::
+
+   class MyEvaluator:
+      """
+      Evaluator purpose, no more than one line.
+
+      Notice how the first line above starts below the triple quotes.
+
+      Custom section
+      --------------
+      Any form of custom section is supported, and will
+      be formatted according to rst rules. These sections can be used to further
+      break down a lengthy description. Notice that the header level is defined by a row
+      of "-" the same length as the section title.
+
+      Any section can contain numbered/bullet lists.
+      There needs to be an empty line between the text and the start of the list. No indentation
+      is required to start the list.
+
+      1. Numbered item
+      2. Another numbered item
+
+      * Bullet point
+        To extend to multiple lines, simply align to the first letter
+      * Sub bullet
+
+      Parameters
+      ----------
+      param1 : type
+         Description
+
+      Examples
+      --------
+
+      This is expected to be the last section.
+
+      Code syntax uses doctest conventions:
+
+      1. Prefix lines with >>>; multiline code uses ... from the second line onward
+      2. The line after a code line is interpreted as the expected output
+      3. Any output produced by the code needs to be matched for the test to succeed.
+         See the strategy below to bypass matching a specific output.
+
+      >>> a = 2
+      >>> def my_func(): # multiline example
+      ...     pass
+      >>> print('123') # This output needs to be matched
+      123
+
+      To skip having to match a specific output:
+
+      >>> import doctest
+      >>> doctest.ELLIPSIS_MARKER = '-etc-'
+      >>> # the next line produces output we will ignore
+      >>> print(42) # doctest: +ELLIPSIS
+      -etc-
+
+
+      **WARNING!!!** Code prefixed with >>> will be tested during package testing, leveraging doctest
+      capabilities.
+
+      Pseudo code can be inserted using indentation; this will not be tested:
+
+         my_pseudo_code = something_generic
+
+      """
+
+.. warning::
+
+   It is necessary for any code prefixed with >>> to be syntactically correct and to conform to `doctests `_.
+   You can find an example of a complex code section in :ref:`this docstring `.
+
+********************
+Summary of the steps
+********************
+Here's a very practical and condensed approach to making an evaluator:
+
+* Create a class that inherits from :class:`~.credoai.evaluators.evaluator.Evaluator`.
+* Create the ``__init__``; remember that Lens artifacts are not meant to be here.
+* Define the ``required_artifacts`` for the evaluator.
+* Define ``_validate_arguments`` and ``_setup``. These tend to be updated as the understanding
+  of the evaluator scope and desired outcome increases.
+* Break down the logic of the evaluation into a suitable number of methods.
+* Finish by creating the ``evaluate`` method, which runs the full logic and populates ``self.results`` with
+  a list of *evidence containers*.
+
+During the building/testing phase you can run the evaluator outside of Lens. In order to make the artifacts
+available to the evaluator you can use the ``__call__`` method. An example of what a test would look like
+would be:
+
+.. code::
+
+   test = MyEvaluator(param1 = value1)
+   # If MyEvaluator requires model and assessment data, call the evaluator
+   # to pass the artifacts. This mimics what happens internally in Lens.
+ test(model = my_model_artifact, assessment_data = my_assessment_data) + test.evaluate() + # To check the results + test.results + diff --git a/docs/metrics.rst b/docs/pages/metrics.rst similarity index 97% rename from docs/metrics.rst rename to docs/pages/metrics.rst index 73d47e63..34fc7d12 100644 --- a/docs/metrics.rst +++ b/docs/pages/metrics.rst @@ -59,7 +59,7 @@ Custom metrics are supported by using the `Metric` class, which can be used to w - fdr * - :ref:`false_negative_rate` - performance - - fnr, miss_rate + - miss_rate, fnr * - :ref:`false_omission_rate` - performance - @@ -68,7 +68,7 @@ Custom metrics are supported by using the `Metric` class, which can be used to w - fallout_rate, fpr * - :ref:`gini_coefficient` - - - discriminatory_gini_index, gini_index, discriminatory_gini + - discriminatory_gini, discriminatory_gini_index, gini_index * - :ref:`matthews_correlation_coefficient` - performance - @@ -95,7 +95,7 @@ Custom metrics are supported by using the `Metric` class, which can be used to w - * - :ref:`mean_squared_error` - performance - - MSD, mean_squared_deviation, MSE + - MSE, mean_squared_deviation, MSD * - :ref:`mean_squared_log_error` - performance - @@ -119,7 +119,7 @@ Custom metrics are supported by using the `Metric` class, which can be used to w - precision * - :ref:`r2_score` - performance - - r_squared, r2 + - r2, r_squared * - :ref:`roc_auc_score` - performance - @@ -146,7 +146,7 @@ Custom metrics are supported by using the `Metric` class, which can be used to w - tnr, specificity * - :ref:`true_positive_rate` - performance - - recall, hit_rate, tpr, sensitivity, recall_score + - sensitivity, recall_score, tpr, hit_rate, recall * - :ref:`underprediction` - performance - @@ -302,7 +302,7 @@ False negative rate is defined as follows: **Source**: `click here `__ -**Other known names**: fnr, miss_rate +**Other known names**: miss_rate, fnr False_omission_rate ------------------- @@ -335,7 +335,7 @@ Gini_coefficient -**Other known names**: discriminatory_gini_index, gini_index, discriminatory_gini +**Other known names**: discriminatory_gini, discriminatory_gini_index, gini_index Matthews_correlation_coefficient -------------------------------- @@ -404,7 +404,7 @@ Mean square error is the expected value of the squared (quadratic) error or loss **Source**: `click here `__ -**Other known names**: MSD, mean_squared_deviation, MSE +**Other known names**: MSE, mean_squared_deviation, MSD Mean_squared_log_error ---------------------- @@ -466,7 +466,7 @@ Best possible score is 1.0 and it can be negative (because the model can be arbi **Source**: `click here `__ -**Other known names**: r_squared, r2 +**Other known names**: r2, r_squared Roc_auc_score ------------- @@ -547,7 +547,7 @@ True Positive Rate (also called sensitivity, recall, or hit rate) refers to the **Source**: `click here `__ -**Other known names**: recall, hit_rate, tpr, sensitivity, recall_score +**Other known names**: sensitivity, recall_score, tpr, hit_rate, recall Underprediction --------------- diff --git a/docs/setup.rst b/docs/pages/setup.rst similarity index 100% rename from docs/setup.rst rename to docs/pages/setup.rst diff --git a/docs/tutorials.rst b/docs/pages/tutorials.rst similarity index 88% rename from docs/tutorials.rst rename to docs/pages/tutorials.rst index cfb612c6..6fce7587 100644 --- a/docs/tutorials.rst +++ b/docs/pages/tutorials.rst @@ -8,9 +8,9 @@ Start here! .. 
toctree:: :maxdepth: 1 - notebooks/quickstart - notebooks/lens_faq - notebooks/governance_integration + ../notebooks/quickstart + ../notebooks/lens_faq + ../notebooks/platform_integration **Quickstart Demo** diff --git a/docs/requirements.txt b/docs/requirements.txt index dc41da16..39fe6081 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -16,4 +16,4 @@ scipy==1.9.1 scikit-learn==1.1.2 pandas==1.4.4 fairlearn==0.7.0 -lifelines>=0.27.3 \ No newline at end of file +lifelines>=0.27.3 diff --git a/pytest.ini b/pytest.ini index 128c9260..646ee4fd 100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,4 +2,5 @@ filterwarnings = ignore:.* is deprecated:DeprecationWarning ignore:Call to deprecated create *.* - ignore:.*distutils Version *.* \ No newline at end of file + ignore:.*distutils Version *.* +addopts = --doctest-modules credoai diff --git a/requirements.txt b/requirements.txt index 9ff6f82a..2de043e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ absl-py>=1.0.0 cloudpickle>=2.0.0 -credoai-connect >= 0.0.2 +credoai-connect>=0.0.2 fairlearn>=0.7.0 json-api-doc>=0.15.0 matplotlib>=3.0
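To tie the how-to guide added above together, below is a minimal, hypothetical sketch of a custom evaluator that follows the structure described in ``make_your_own.rst``. The base-class hooks (``required_artifacts``, ``_validate_arguments``, ``_setup``, ``evaluate``, ``self.results``, ``super().__init__()``) come from that guide; the class name ``ExampleAccuracy``, the ``.X``/``.y`` accessors on the assessment data, the ``predict`` call on the model wrapper, and the result-wrapping helper are illustrative assumptions, and the evidence-container classes from the credoai-connect package are not shown.

.. code-block::

   # Hypothetical sketch only: ExampleAccuracy, the .X/.y accessors and the
   # result-wrapping step are illustrative assumptions, not the library's API.
   import pandas as pd

   from credoai.evaluators.evaluator import Evaluator


   class ExampleAccuracy(Evaluator):
       """
       Toy evaluator computing a single accuracy-style score.
       """

       required_artifacts = {"model", "assessment_data"}

       def __init__(self):
           # Evaluator-specific parameters would go here; Lens artifacts do not.
           super().__init__()

       def _validate_arguments(self):
           # Real evaluators can use the pre-made helpers in
           # credoai.evaluators.utils.validation; a manual check is shown here.
           if self.model is None or self.assessment_data is None:
               raise ValueError("ExampleAccuracy requires a model and assessment data")

       def _setup(self):
           # Artifacts are available only at this point, not in __init__.
           self.y_true = pd.Series(self.assessment_data.y).reset_index(drop=True)
           self.y_pred = pd.Series(self.model.predict(self.assessment_data.X))

       def evaluate(self):
           # Keep evaluate() thin; push the heavy lifting into private methods.
           score = self._compute_score()
           # self.results must be a list of evidence containers; the container
           # classes ship with the credoai-connect package, so the wrapping is
           # left as a placeholder here.
           self.results = self._wrap_results(score)
           return self

       def _compute_score(self):
           accuracy = (self.y_true.values == self.y_pred.values).mean()
           return pd.DataFrame({"type": ["accuracy"], "value": [accuracy]})

       def _wrap_results(self, score_df):
           # Placeholder: wrap score_df in the appropriate evidence container.
           return [score_df]

During development, such a sketch could be exercised exactly as the guide shows: call the instance with the artifacts (``ExampleAccuracy()(model=..., assessment_data=...)``) and then invoke ``evaluate()``.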