From c9aadef900e96fc9e6b4a923183a3a5aa56de844 Mon Sep 17 00:00:00 2001 From: matanper Date: Thu, 9 Dec 2021 10:55:18 +0200 Subject: [PATCH] add docs (#233) * add docs * update docs * update docs * remove links * update docs * update docs * update docs * update docs * update docs Co-authored-by: Matan Perlmutter --- deepchecks/base/check.py | 5 +- .../methodology/datasets_size_comparison.py | 39 +- notebooks/examples/add_a_custom_check.ipynb | 343 +++++++++++ .../examples/configure_check_conditions.ipynb | 531 ++++++++++++++++++ 4 files changed, 896 insertions(+), 22 deletions(-) create mode 100644 notebooks/examples/add_a_custom_check.ipynb create mode 100644 notebooks/examples/configure_check_conditions.ipynb diff --git a/deepchecks/base/check.py b/deepchecks/base/check.py index ec1f0fd631..681354d934 100644 --- a/deepchecks/base/check.py +++ b/deepchecks/base/check.py @@ -41,7 +41,7 @@ class Condition: def __init__(self, name: str, function: Callable, params: Dict): if not isinstance(function, Callable): raise DeepchecksValueError(f'Condition must be a function `(Any) -> Union[ConditionResult, bool]`, ' - f'but got: {type(function).__name__}') + f'but got: {type(function).__name__}') if not isinstance(name, str): raise DeepchecksValueError(f'Condition name must be of type str but got: {type(name).__name__}') self.name = name @@ -279,7 +279,8 @@ def __repr__(self, tabs=0, prefix=''): def params(self) -> Dict: """Return parameters to show when printing the check.""" - return {k: v for k, v in vars(self).items() if not k.startswith('_') and v is not None} + return {k: v for k, v in vars(self).items() + if not k.startswith('_') and v is not None and not callable(v)} def clean_conditions(self): """Remove all conditions from this check instance.""" diff --git a/deepchecks/checks/methodology/datasets_size_comparison.py b/deepchecks/checks/methodology/datasets_size_comparison.py index 94f1cacf39..ce1e32c0ed 100644 --- a/deepchecks/checks/methodology/datasets_size_comparison.py +++ b/deepchecks/checks/methodology/datasets_size_comparison.py @@ -14,14 +14,14 @@ from deepchecks import Dataset, CheckResult, ConditionResult, TrainTestBaseCheck -__all__ = ['DatasetsSizeComparison',] +__all__ = ['DatasetsSizeComparison'] T = t.TypeVar('T', bound='DatasetsSizeComparison') class DatasetsSizeComparison(TrainTestBaseCheck): - """Verify Test dataset size comparing it to the Train dataset size.""" + """Verify test dataset size comparing it to the train dataset size.""" def run(self, train_dataset: Dataset, test_dataset: Dataset, model: object = None) -> CheckResult: """Run check instance. @@ -44,13 +44,11 @@ def run(self, train_dataset: Dataset, test_dataset: Dataset, model: object = Non check_name = type(self).__name__ Dataset.validate_dataset(train_dataset, check_name) Dataset.validate_dataset(test_dataset, check_name) - result = pd.DataFrame.from_dict({ - 'train': {'size': train_dataset.n_samples}, - 'test': {'size': test_dataset.n_samples}, - }) + sizes = {'Train': train_dataset.n_samples, 'Test': test_dataset.n_samples} + display = pd.DataFrame(sizes, index=['Size']) return CheckResult( - value=result, - display=result + value=sizes, + display=display ) def add_condition_test_size_not_smaller_than(self: T, value: int = 100) -> T: @@ -62,15 +60,15 @@ def add_condition_test_size_not_smaller_than(self: T, value: int = 100) -> T: Returns: Self: current instance of the DatasetsSizeComparison check. 
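
        Example:
            A minimal usage sketch:

            >>> check = DatasetsSizeComparison().add_condition_test_size_not_smaller_than(100)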
""" - def condition(check_result: pd.DataFrame) -> ConditionResult: + def condition(check_result: dict) -> ConditionResult: return ( - ConditionResult(False, f'Test dataset is smaller than {value}.') - if check_result['test']['size'] <= value # type: ignore + ConditionResult(False, f'Test dataset is {check_result["Test"]}') + if check_result['Test'] <= value else ConditionResult(True) ) return self.add_condition( - name=f'Test dataset size is not smaller than {value}.', + name=f'Test dataset size is not smaller than {value}', condition_func=condition ) @@ -84,14 +82,15 @@ def add_condition_test_train_size_ratio_not_smaller_than(self: T, ratio: float = Self: current instance of the DatasetsSizeComparison check. """ - def condition(check_result: pd.DataFrame) -> ConditionResult: - if (check_result['test']['size'] / check_result['train']['size']) <= ratio: # type: ignore - return ConditionResult(False, f'Test-Train size ratio is smaller than {ratio}.') + def condition(check_result: dict) -> ConditionResult: + test_train_ratio = check_result['Test'] / check_result['Train'] + if test_train_ratio <= ratio: + return ConditionResult(False, f'Test-Train size ratio is {test_train_ratio}') else: return ConditionResult(True) return self.add_condition( - name=f'Test-Train size ratio is not smaller than {ratio}.', + name=f'Test-Train size ratio is not smaller than {ratio}', condition_func=condition ) @@ -102,13 +101,13 @@ def add_condition_train_dataset_not_smaller_than_test(self: T) -> T: Self: current instance of the DatasetsSizeComparison check. """ - def condition(check_result: pd.DataFrame) -> ConditionResult: - if check_result['train']['size'] < check_result['test']['size']: # type: ignore - return ConditionResult(False, 'Train dataset is smaller than test dataset.') + def condition(check_result: dict) -> ConditionResult: + if check_result['Train'] < check_result['Test']: + return ConditionResult(False, 'Train dataset is smaller than test dataset') else: return ConditionResult(True) return self.add_condition( - name='Train dataset is not smaller than test dataset.', + name='Train dataset is not smaller than test dataset', condition_func=condition ) diff --git a/notebooks/examples/add_a_custom_check.ipynb b/notebooks/examples/add_a_custom_check.ipynb new file mode 100644 index 0000000000..6baf96485c --- /dev/null +++ b/notebooks/examples/add_a_custom_check.ipynb @@ -0,0 +1,343 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ef39dc68", + "metadata": {}, + "source": [ + "# Add a Custom Check" + ] + }, + { + "cell_type": "markdown", + "id": "da46d90d", + "metadata": {}, + "source": [ + "It is possible to extend deepchecks by implementing custom checks. 
This enables you to have your own logic of metrics or validation, or even just to display your own graph using deepchecks' suite.\n",
    "\n",
    "- [Check Structure](#Check-Structure)\n",
    "- [Write a Basic Check](#Write-a-Basic-Check)\n",
    "- [Check Display](#Check-Display)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c335dcd0",
   "metadata": {},
   "source": [
    "## Check Structure\n",
    "Each check consists of 3 main parts:\n",
    "- Return value\n",
    "- Display\n",
    "- Conditions\n",
    "\n",
    "This guide will demonstrate how to implement a Check with a return value and display. For adding a condition, see working with conditions (LINK)."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c1bb0b9d",
   "metadata": {},
   "source": [
    "## Write a Basic Check"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74e3cf95",
   "metadata": {},
   "source": [
    "Let's implement a check for comparing the sizes of the test and the train datasets.\n",
    "\n",
    "The first step is to create a check class, which inherits from a base check class. Each base check differs in its `run` method signature; read more about all types (LINK). In this case we will use `TrainTestBaseCheck`, which is used to compare the test and the train datasets. After creating the basic class with the `run` function, we will write our check logic inside it.\n",
    "\n",
    "*Good to know: the return value of a check can be any object: a number, dictionary, string, etc...*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "dcfb8006",
   "metadata": {},
   "outputs": [],
   "source": [
    "from deepchecks import TrainTestBaseCheck, CheckResult, Dataset\n",
    "\n",
    "\n",
    "class DatasetSizeComparison(TrainTestBaseCheck):\n",
    "    \"\"\"Check which compares the sizes of train and test datasets.\"\"\"\n",
    "    \n",
    "    def run(self, train_dataset: Dataset, test_dataset: Dataset, model=None) -> CheckResult:\n",
    "        ## Check logic\n",
    "        train_size = train_dataset.n_samples\n",
    "        test_size = test_dataset.n_samples\n",
    "        \n",
    "        ## Return value as check result\n",
    "        return_value = {'train_size': train_size, 'test_size': test_size}\n",
    "        return CheckResult(return_value)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d464d1d2",
   "metadata": {},
   "source": [
    "Hooray! We just implemented a custom check. Now let's create two Datasets and try to run it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b1f1c5da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
Dataset Size Comparison
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Check which compares the sizes of train and test datasets.
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Nothing found
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import pandas as pd\n", + "\n", + "# We'll use dummy data for the purpose of this demonstration\n", + "train_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3,4,5,6,7,8,9]}), label=None)\n", + "test_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3]}), label=None)\n", + "\n", + "result = DatasetSizeComparison().run(train_dataset, test_dataset)\n", + "result" + ] + }, + { + "cell_type": "markdown", + "id": "b8bcb3fe", + "metadata": {}, + "source": [ + "Our check ran successfully but we got the print \"Nothing found\". This is because we haven't defined to the check anything to display, so the default behavior is to print \"Nothing found\". In order to access the value that we have defined earlier we can use the \"value\" property on the result." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "3c2b0360", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'train_size': 9, 'test_size': 3}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result.value" + ] + }, + { + "cell_type": "markdown", + "id": "144b593f", + "metadata": {}, + "source": [ + "To see code references for more complex checks (that can receive parameters etc.), check out any of your favorite checks from our API Reference (LINK)." + ] + }, + { + "cell_type": "markdown", + "id": "185e8b08", + "metadata": {}, + "source": [ + "## Check Display" + ] + }, + { + "cell_type": "markdown", + "id": "220171db", + "metadata": {}, + "source": [ + "Most of the times we will want our checks to have a visual display that will quickly summarize the check result. We can pass objects for display to the CheckResult. Objects for display should be of type: html string, dataframe or a function that plots a graph. Let's define a graph that will be displayed using `matplotlib`. 
In order to use `matplotlib` we have to implement the code inside a function and not call it directly in the check; this is due to architectural limitations of `matplotlib`.\n",
    "\n",
    "*Good to know: `display` can receive a single object to display or a list of objects*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "4399049b",
   "metadata": {},
   "outputs": [],
   "source": [
    "from deepchecks import TrainTestBaseCheck, CheckResult, Dataset\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "class DatasetSizeComparison(TrainTestBaseCheck):\n",
    "    \"\"\"Check which compares the sizes of train and test datasets.\"\"\"\n",
    "    \n",
    "    def run(self, train_dataset: Dataset, test_dataset: Dataset, model=None) -> CheckResult:\n",
    "        ## Check logic\n",
    "        train_size = train_dataset.n_samples\n",
    "        test_size = test_dataset.n_samples\n",
    "        \n",
    "        ## Create the check result value\n",
    "        sizes = {'Train': train_size, 'Test': test_size}\n",
    "        sizes_df_for_display = pd.DataFrame(sizes, index=['Size'])\n",
    "        \n",
    "        ## Display function of matplotlib graph:\n",
    "        def graph_display():\n",
    "            plt.bar(sizes.keys(), sizes.values(), color='green')\n",
    "            plt.xlabel(\"Dataset\")\n",
    "            plt.ylabel(\"Size\")\n",
    "            plt.title(\"Datasets Size Comparison\")\n",
    "        \n",
    "        return CheckResult(sizes, display=[sizes_df_for_display, graph_display])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a16d1954",
   "metadata": {},
   "source": [
    "Let's check it out:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "be731fa3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
Dataset Size Comparison
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Check which compares the sizes of train and test datasets.
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Train Test
Size93
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXgAAAEWCAYAAABsY4yMAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/Il7ecAAAACXBIWXMAAAsTAAALEwEAmpwYAAATjklEQVR4nO3deZQlZX3G8e8DwyKCoNIiMyCDohCiSSQjoojgkhhcgjnBgEYjGEXNiSxioh4XcEGNMQYTk5CJCyYQMC4haoxiFAE1EIdFFBGjiAsgNAgIys4vf1T14U5zu6d1prpn3v5+zrlnblW9Ve+vbt3z3Jr31q1OVSFJas9GC12AJGkYBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeC06SU5I8oaFrmN9kWSfJJcudB1a9wz4RSbJ5UluSXJTkhuSfCXJy5LM6b2QZHmSSrJk4DrXqp8kf5zkW/1+Xp3k00m2Aqiql1XVW9ZtxZDkaUnO6vucTHJmkt9d1/2sa1V1dlXtutB1aN0z4BenZ1XVVsBOwDuAVwPvX9iS1p0k+wJvA57b7+evAB8euM8DgY8A/wzsAGwHvBF41pD9rq2hP6i1wKrKxyJ6AJcDT502b0/gbuCR/fQzgAuAnwI/BI4dafsDoICb+8fjgIcBXwCuA64FTga2GVnn1cAVwE3ApcBT+vkbAa8Bvtuv+2/AA2bpZxfgTODGvp8Pz7CPrwJOm+U1OBF4a//8kyN93Ny/Dof0y3YDPgf8pK/7D2bYXvp6/2yWPjcCXg98H7iG7oNg637Z8n5fD+1f7+uBlwGPAS4CbgDeO7KtQ4AvA+/tX4tvTb2m/fJDgUv61/sy4KUjy/YDftQfkx8D/zI1bw7HazPgeODK/nE8sNm07R7d799VwKEL/X5f7I8FL8DHPB/wMQHfz/8B8PL++X7Ao/pQ+jXgauDZ/bKpMFoysu4uwG/1ATABnAUc3y/btQ+tpSPrP6x/fgRwDt0Z72bAPwKnzNLPKcDr+ro2B54wwz7uA9wCvAnYeyqERpafSB/w0+bv3wfXjsB9+7oPBZYAj6b7UNl9zHq79bXuPMvr/iLgO8BDgS2BjwP/Mm1fT+j367eBW4HTgAcBy/rQ3LdvfwhwJ3AUsAlwEF3QT304PoPuQzfAvsDPgT1Gju2dwF/0r/l9GAn4NRyvN/fH60H9cf4K8JZp231zX9PT+37vv9Dv+cX8WPACfMzzAZ854M8BXjfDOscDf90/v1fwjmn/bOCC/vkufTg9FdhkWrtLWP3Mc3vgjj5Q79UP3VnvSmCHOezn/nRn5zfQnZm/G9i4X3Yi0wIeeERf5xP66YOAs6e1+UfgmDF97d3Xuvks9Xwe+JOR6V3H7OuykeXXAQeNTH8MOLJ/fgjdB1FGlv8v8IIZ+j4NOKJ/vh9w+2it0wJ+tuP1XeDpI9NPAy4f2cYt047XNcBeC/2eX8wPx+A1ZRndUARJHpvkjP6Lwhvphgu2nWnFJNslOTXJFUl+Cpw01b6qvgMcCRwLXNO3W9qvuhPw7/2XvTfQBf5ddOPX4/w53Vnp/ya5OMmLZqqpqv6rqp4FPAA4gC4UXzxD/VsD/wG8vqq+NFLbY6dq6+v7Q+DBYzZxXf/v9jPVAyylG56Z8n26cB/d16tHnt8yZnrLkekrqk/Rke0t7fdn/yTnJPlJX/fTWf34TVbVreOKXMPxGrcPS0emr6uqO0emfz6tZs0zA14keQxdwE+F278CnwB2rKqt6YYO0i8bd/vRt/XzH1VV9wOeP9KeqvrXqnoCXWgW3fAAdEMB+1fVNiOPzavqinH9VNWPq+olVbUUeCnw90l2mW3fquruqvo83XcEjxyz7xv1+3tGVa0cWfRD4MxptW1ZVS8f082lffvfn6WUK/v9n/IQuiGNq8c3X6NlSTIy/RDgyiSb0Z3tvwvYrqq2AT7NyPFg/DG8Z+HMx2vcPlz5S9aveWDAL2JJ7pfkmcCpwElV9fV+0VbAT6rq1iR7As8bWW2S7ovIh47M24puGOTGJMuAPxvpY9ckT+6D51a6M9G7+8UnAMcl2alvO5HkgJn6SfKcJDv0k9fThc/Utkb364AkBye5fzp70o1FnzPmZTiObrz9iGnzPwU8IskLkmzSPx6T5Femb6A/k34l8IYkh/av60ZJnpBk6kPjFOCoJDsn2ZLuQ/HD0854fxEPAg7v63oO3ZVCnwY2pRtbnwTuTLI/3Zj+nKzheJ0CvL4/TtvSXSV00i9Zv+aBAb84fTLJTXRnna+jG58+dGT5nwBv7tu8ke7qFgCq6ud0ofjlfuhiL7ovM/eg+6LvP+m+QJyyGd2lmNfSXbXxIOC1/bL30P1P4fS+r3OAx87Sz2OAc5Pc3K93RFVdNmb/rgdeAvwf3ZVAJwF/WVUnj2n7XGAv4PokN/ePP6yqm+iC8WC6s9Qfc88Xk/dSVR+lG7d/Ud/+auCtdEM/AB+gu2LlLOB7dOH5inHbmqNzgYfTva7HAQdW1XV93YfTHbPr6T6cP/ELbHe24/VWYBXdlT1fB87v52k9ldWH8SSt75IcAry4H0aRZuQZvCQ1yoCXpEY5RCNJjfIMXpIatV7daGjbbbet5cuXL3QZkrTBOO+8866tqolxy9argF++fDmrVq1a6DIkaYOR5PszLXOIRpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGrVe/ZJ1beRNWXMjLUp1jDfU0+LkGbwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUqEEDPslRSS5O8o0kpyTZfMj+JEn3GCzgkywDDgdWVNUjgY2Bg4fqT5K0uqGHaJYA90myBNgCuHLg/iRJvcECvqquAN4F/AC4Crixqk6f3i7JYUlWJVk1OTk5VDmStOgMOURzf+AAYGdgKXDfJM+f3q6qVlbViqpaMTExMVQ5krToDDlE81Tge1U1WVV3AB8HHj9gf5KkEUMG/A+AvZJskSTAU4BLBuxPkjRiyDH4c4GPAucDX+/7WjlUf5Kk1S0ZcuNVdQxwzJB9SJLG85esktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjRo04JNsk+SjSb6V5JIkjxuyP0nSPZYMvP33AJ+pqgOTbApsMXB/kqTeYAGfZGvgicAhAFV1O3D7UP1JklY35BDNzsAk8MEkFyR5X5L7T
m+U5LAkq5KsmpycHLAcSVpchgz4JcAewD9U1aOBnwGvmd6oqlZW1YqqWjExMTFgOZK0uAwZ8D8CflRV5/bTH6ULfEnSPBgs4Kvqx8APk+zaz3oK8M2h+pMkrW7oq2heAZzcX0FzGXDowP1JknqDBnxVXQisGLIPSdJ4/pJVkhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktSoOQV8ku2SvD/Jf/XTuyf542FLkyStjbmewZ8IfBZY2k9/GzhygHokSevIXAN+26r6N+BugKq6E7hrsKokSWttrgH/syQPBAogyV7AjYNVJUlaa3P9o9tHA58AHpbky8AE8JzBqpIkrbU5BXxVnZdkX2BXIMClVXXHoJVJktbKXK+i+S7w4qq6uKq+UVV3JPnUwLVJktbCXMfg7wCelOSDSTbt5y0bqCZJ0jow14D/eVUdBFwCnJ3kIfRfuEqS1k9z/ZI1AFX1ziTnA6cDDxisKknSWptrwL9x6klV/XeSpwEvHKYkSdK6MGvAJ9mtqr4FXJFkj2mL/ZJVktZjazqDfyVwGPBX/fT0cfcnr/OKJEnrxJq+ZH1fkgdX1ZOq6knAh4CbgW8ABw5enSTpl7amgD8BuB0gyROBt9OF/I3AymFLkyStjTUN0WxcVT/pnx8ErKyqjwEfS3LhoJVJktbKms7gN04y9SHwFOALI8vmegWOJGkBrCmkTwHOTHItcAtwNkCSXfBukpK0Xps14KvquCSfB7YHTq+qqatoNgJeMXRxkqRf3hqHWarqnDHzvj1MOZKkdcU/ui1JjRo84JNsnOQCby8sSfNrPs7gj6C7C6UkaR4NGvBJdgCeAbxvyH4kSfc29Bn88cCfA3fP1CDJYUlWJVk1OTk5cDmStHgMFvBJnglcU1XnzdauqlZW1YqqWjExMTFUOZK06Ax5Br838LtJLgdOBZ6c5KQB+5MkjRgs4KvqtVW1Q1UtBw4GvlBVzx+qP0nS6rwOXpIaNS83DKuqLwJfnI++JEkdz+AlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUfPyF50kQd6UhS5B66k6pgbZrmfwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMGC/gkOyY5I8k3k1yc5Iih+pIk3duQf3T7TuDoqjo/yVbAeUk+V1XfHLBPSVJvsDP4qrqqqs7vn98EXAIsG6o/SdLq5mUMPsly4NHAuWOWHZZkVZJVk5OT81GOJC0Kgwd8ki2BjwFHVtVPpy+vqpVVtaKqVkxMTAxdjiQtGoMGfJJN6ML95Kr6+JB9SZJWN+RVNAHeD1xSVe8eqh9J0nhDnsHvDbwAeHKSC/vH0wfsT5I0YrDLJKvqS0CG2r4kaXb+klWSGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRBrwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRhnwktQoA16SGmXAS1KjDHhJapQBL0mNMuAlqVEGvCQ1yoCXpEYZ8JLUKANekhplwEtSowx4SWqUAS9JjTLgJalRgwZ8kt9JcmmS7yR5zZB9SZJWN1jAJ9kY+Dtgf2B34LlJdh+qP0nS6oY8g98T+E5VXVZVtwOnAgcM2J8kacSSAbe9DPjhyPSPgMdOb5TkMOCwfvLmJJcOWNNisS1w7UIXsb7IsVnoEjSe79PeWr5Hd5ppwZABPydVtRJYudB1tCTJqqpasdB1SLPxfTq8IYdorgB2HJneoZ8nSZoHQwb8V4GHJ9k5yabAwcAnBuxPkjRisCGaqrozyZ8CnwU2Bj5QVRcP1Z9W45CXNgS+TweWqlroGiRJA/CXrJLUKANekhq14JdJas2SPBD4fD/5YOAuYLKf3rP/IdlM664A/qiqDh+2SqmzNu/Xfv39gNur6itD1bhYOAa/gUlyLHBzVb1rZN6Sqrpz4aqSxhv3fh1iHY3nEM0GKsmJSU5Ici7wziR7JvmfJBck+UqSXft2+yX5VP/82CQfSPLFJJcl8axe8yLJbyY5M8l5ST6bZPt+/uFJvpnkoiSnJlkOvAw4KsmFSfZZ0MI3cA7RbNh2AB5fVXcluR+wT3956lOBtwG/P2ad3YAnAVsBlyb5h6q6Y/5K1iIU4G+BA6pqMslBwHHAi4DXADtX1W1JtqmqG5KcgGfw64QBv2H7SFXd1T/fGvhQkocDBWwywzr/WVW3AbcluQbYju4+QdJQNgMeCXwuCXS/i7mqX3YRcHKS04DTFqK4lhnwG7afjTx/C3BGVf1e/9/cL86wzm0jz+/C94CGF+DiqnrcmGXPAJ4IPAt4XZJHzWtljXMMvh1bc8+9fg5ZwDqk6W4DJpI8DiDJJkl+NclGwI5VdQbwarr38JbATXRDiFpLBnw73gm8PckFeFau9cvdwIHAXyT5GnAh8Hi6oZqTknwduAD4m6q6Afgk8Ht+ybr2vExSkhrlGbwkNcqAl6RGGfCS1CgDXpIaZcBLUqMMeDUryV39pXYXJ/lakqP7a69nW2d5kucNUMuRSbZY19uVZmPAq2W3VNVvVNWvAr8F7A8cs4Z1lgPrPOCBIwEDXvPKgNeiUFXXAIcBf5rO8iRnJzm/fzy+b/oOYJ/+zP+omdol2T7JWX27b0z9ICfJb/d39Tw/yUeSbNnftXMpcEaSMxZi/7U4+UMnNSvJzVW15bR5NwC70v0c/u6qurW/QdspVbWi/2MTr6qqZ/btt5ih3dHA5lV1XJKN6c7ONwM+DuxfVT9L8mpgs6p6c5LLgRVVde187LsE/qRdi9cmwHuT/AbdTdce8Qu2+yrwgSSbAKdV1YVJ9gV2B77c3zVxU+B/BtsDaQ0MeC0aSR5KF9LX0I3FXw38Ot1Q5a0zrHbUuHZVdVaSJ9LdDfHEJO8Grgc+V1XPHXI/pLlyDF6LQpIJ4ATgvdWNS24NXFVVdwMvoLvxFdz7ToZj2yXZCbi6qv4JeB+wB3AOsHeSXfo2903yiBm2Kw3OgFfL7jN1mSTw38DpwJv6ZX8PvLC/u+Fu3HNv/YuAu/rLKo+apd1+wNf6u3ceBLynqibpbtV8SpKL6IZnduvbrwQ+45esmk9+ySpJjfIMXpIaZcBLUqMMeElqlAEvSY0y4CWpUQa8JDXKgJekRv0/y8+M0mEJmrEAAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "DatasetSizeComparison().run(train_dataset, test_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "407909d1", + "metadata": {}, + "source": [ + "## Voila!\n", + "\n", + "Now we have a check that prints a graph and has a value. We can add this check to any Suite and it will run within it.\n", + "\n", + "The next possible step is to implement a condition, which will allow us to give the check result a pass / fail mark. To do so, check out the following guide (LINK)" + ] + }, + { + "cell_type": "markdown", + "id": "002708b0", + "metadata": {}, + "source": [ + "## Base Checks Types" + ] + }, + { + "cell_type": "markdown", + "id": "9ec65b44", + "metadata": {}, + "source": [ + "There are a number of different `BaseCheck` Classes to inherit from. Each base check is differed by the objects it requires in order to run, and their sole difference is the `run` method's signature.\n", + "\n", + "|Check|`run` Signature|Notes|\n", + "|-----|---------------|-----|\n", + "|`SingleDatasetBaseCheck`|`run(self, dataset, model=None)`|When used in a suite you can choose whether to run on the test dataset, the train dataset or on both|\n", + "|`TrainTestBaseCheck`|`run(self, train_dataset, test_dataset, model=None)`||\n", + "|`ModelOnlyBaseCheck`|`run(self, model)`||" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/examples/configure_check_conditions.ipynb b/notebooks/examples/configure_check_conditions.ipynb new file mode 100644 index 0000000000..5f8d34c7ad --- /dev/null +++ b/notebooks/examples/configure_check_conditions.ipynb @@ -0,0 +1,531 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "470c3f6b", + "metadata": {}, + "source": [ + "# Configure Check Conditions" + ] + }, + { + "cell_type": "markdown", + "id": "8eec4ed8", + "metadata": {}, + "source": [ + "The following guide includes different options for configuring a check's condition(s):\n", + "\n", + "- [Add Condition](#Add-Condition)\n", + "- [Remove / Edit a Condition](#Remove-/-Edit-a-Condition)\n", + "- [Add a Custom Condition](#Add-a-Custom-Condition)\n", + "- [Set Custom Condition Category](#Set-Custom-Condition-Category)" + ] + }, + { + "cell_type": "markdown", + "id": "1026760f", + "metadata": {}, + "source": [ + "## Add Condition" + ] + }, + { + "cell_type": "markdown", + "id": "0c850e76", + "metadata": {}, + "source": [ + "In order to add a condition to an existing check, we can use any of the pre-defined conditions for that check. The naming convention for the methods that add the condition is `add_condition_...`.\n", + "\n", + "If you want to create and add your custom condition logic for parsing the check's result value, see [Add a Custom Condition](#Add-a-Custom-Condition)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "8bc3394b", + "metadata": {}, + "source": [ + "### Add a condition to a new check" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b3862684", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "DatasetsSizeComparison\n", + "\tConditions:\n", + "\t\t0: Test dataset size is not smaller than 1000" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from deepchecks.checks import DatasetsSizeComparison\n", + "\n", + "check = DatasetsSizeComparison().add_condition_test_size_not_smaller_than(1000)\n", + "check" + ] + }, + { + "cell_type": "markdown", + "id": "1c3ec933", + "metadata": {}, + "source": [ + "Conditions are used mainly in the context of a Suite, and displayed in the Conditions Summary table. For example how to run in a suite you can look at [Add a Custom Condition](#Add-a-Custom-Condition) or if you would like to run the conditions outside of suite you can execute:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "db25f8ed", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'is_pass': False, 'details': 'Test dataset is 3', 'category': , 'name': 'Test dataset size is not smaller than 1000'}]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from deepchecks import Dataset\n", + "import pandas as pd\n", + "# Dummy data\n", + "train_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3,4,5,6,7,8,9]}))\n", + "test_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3]}))\n", + "\n", + "condition_results = check.conditions_decision(check.run(train_dataset, test_dataset))\n", + "condition_results" + ] + }, + { + "cell_type": "markdown", + "id": "6d6be7c3", + "metadata": {}, + "source": [ + "### Add a condition to a check in a suite" + ] + }, + { + "cell_type": "markdown", + "id": "01af72d0", + "metadata": {}, + "source": [ + "If we want to add a conditon to a check within an existing suite, we should first find the Check's ID within the suite, and then add the condition to it, by running the relevant `add_condition_` method on that check's instance. See the next section to understand how to do so.\n", + "\n", + "The condition will then be appended to the list of conditions on that check (or be the first one if no conditions are defined), and each condition will be evaluated separately when running the suite.\n" + ] + }, + { + "cell_type": "markdown", + "id": "977fb8ec", + "metadata": {}, + "source": [ + "## Remove / Edit a Condition" + ] + }, + { + "cell_type": "markdown", + "id": "4be40708", + "metadata": {}, + "source": [ + "Deepchecks provides different kinds of default suites, which come with pre-defined conditions. 
You may want to remove a condition in case it isn't needed for you, or you may want to change the condition's parameters (since condition functions are immutable).\n",
    "\n",
    "To remove a condition, start by printing the Suite and identifying the Check's ID and the Condition's ID:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "af7bd116",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Overfit Suite: [\n",
       "\t0: TrainTestDifferenceOverfit\n",
       "\t\tConditions:\n",
       "\t\t\t0: Train-Test metrics degradation ratio is not greater than 0.1\n",
       "\t1: BoostingOverfit(num_steps=20)\n",
       "\t\tConditions:\n",
       "\t\t\t0: Test score decline is not greater than 5.00%\n",
       "]"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from deepchecks.suites import overfit_suite\n",
    "\n",
    "suite = overfit_suite()\n",
    "suite"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "44e1da74",
   "metadata": {},
   "source": [
    "After we have found the IDs, we can remove the Condition:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "faf589a1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Overfit Suite: [\n",
       "\t0: TrainTestDifferenceOverfit\n",
       "\t\tConditions:\n",
       "\t\t\t0: Train-Test metrics degradation ratio is not greater than 0.1\n",
       "\t1: BoostingOverfit(num_steps=20)\n",
       "]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Access check by id\n",
    "check = suite[1]\n",
    "# Remove condition by id\n",
    "check.remove_condition(0)\n",
    "\n",
    "suite"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ae06fd53",
   "metadata": {},
   "source": [
    "If we want, we can now re-add the Condition using the check's built-in methods, with a different parameter:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9488c030",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Overfit Suite: [\n",
       "\t0: TrainTestDifferenceOverfit\n",
       "\t\tConditions:\n",
       "\t\t\t0: Train-Test metrics degradation ratio is not greater than 0.1\n",
       "\t1: BoostingOverfit(num_steps=20)\n",
       "\t\tConditions:\n",
       "\t\t\t1: Test score decline is not greater than 20.00%\n",
       "]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Re-add the condition with new parameter\n",
    "check.add_condition_test_score_percent_decline_not_greater_than(0.2)\n",
    "\n",
    "suite"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "09b434c6",
   "metadata": {},
   "source": [
    "## Add a Custom Condition"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "73168e2e",
   "metadata": {},
   "source": [
    "In order to write conditions we first have to know what value a given check produces.\n",
    "Let's look at the check `DatasetsSizeComparison` and see its return value in order to write a condition for it."
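,
    "\n",
    "As a reminder of the required shape: a condition is any function `(Any) -> Union[ConditionResult, bool]` that receives the check result's value (this is what the `Condition` class validates). A minimal sketch, assuming the dict value that `DatasetsSizeComparison` produces:\n",
    "\n",
    "```python\n",
    "from deepchecks import ConditionResult\n",
    "\n",
    "def my_condition(value: dict) -> ConditionResult:\n",
    "    # `value` is the CheckResult.value produced by the check\n",
    "    return ConditionResult(value['Test'] > 0)\n",
    "```"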
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2a1821bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Train': 9, 'Test': 3}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from deepchecks.checks import DatasetsSizeComparison\n", + "from deepchecks import Dataset\n", + "import pandas as pd\n", + "\n", + "# We'll use dummy data for the purpose of this demonstration\n", + "train_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3,4,5,6,7,8,9]}))\n", + "test_dataset = Dataset(pd.DataFrame(data={'x': [1,2,3]}))\n", + "\n", + "result = DatasetsSizeComparison().run(train_dataset, test_dataset)\n", + "result.value" + ] + }, + { + "cell_type": "markdown", + "id": "08113761", + "metadata": {}, + "source": [ + "Now we know what the return value looks like. Let's add a new condition that validates that the ratio between the train and test datasets size is inside a given range.\n", + "To create condition we need to use the `add_condition` method of the check which accepts a condition name and a function. This function receives the value of the `CheckResult` that we saw above and should return either a boolean or a `ConditionResult` containing a boolean and optional extra info that will be displayed in the Conditions Summary table.\n", + "\n", + "*Note: When implementing a condition in a custom check, you may want to add a method `add_condition_x()` to allow any consumer of your check to apply the condition (possibly with given parameters). For examples look at implemented Checks' source code*" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "06ef9c99", + "metadata": {}, + "outputs": [], + "source": [ + "from deepchecks import ConditionResult\n", + "\n", + "# Our parameters for the condition\n", + "low_threshold = 0.4\n", + "high_threshold = 0.6\n", + "\n", + "# Create the condition function\n", + "def custom_condition(value: dict): \n", + " ratio = value['Test'] / value['Train']\n", + " if low_threshold <= ratio <= high_threshold:\n", + " return ConditionResult(True)\n", + " else:\n", + " # Note: if you doesn't care about the extra info, you can return directly a boolean\n", + " return ConditionResult(False, f'Test-Train ratio is {ratio:.2}')\n", + "\n", + "# Create the condition name\n", + "condition_name = f'Test-Train ratio is between {low_threshold} to {high_threshold}'\n", + "\n", + "# Create check instance with the condition \n", + "check = DatasetsSizeComparison().add_condition(condition_name, custom_condition)" + ] + }, + { + "cell_type": "markdown", + "id": "6c194fdd", + "metadata": {}, + "source": [ + "Now we will use a Suite to demonstrate the action of the condition, since the suite runs the condition for us automatically and prints out a Conditions Summary table (for all the conditions defined on the checks within the suite):" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "39b8606b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Suite for Condition: 0%| | 0/1 [00:00Suite for Condition\n", + "
The suite is composed of various checks such as: Datasets Size Comparison, etc...
Each check may contain conditions (which result in ✓ / ✖ / !), as well as other outputs such as plots or tables.
Suites, checks and conditions can all be modified (see tutorial [link]).
"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Conditions Summary"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "
Status | Check                    | Condition                                | More Info
✖      | Datasets Size Comparison | Test-Train ratio is between 0.4 and 0.6  | Test-Train ratio is 0.33
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Additional Outputs
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Datasets Size Comparison
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Verify test dataset size comparing it to the train dataset size.
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Train Test
Size93
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from deepchecks import Suite\n", + "\n", + "# Using suite to run check & condition\n", + "suite = Suite('Suite for Condition',\n", + " check\n", + ")\n", + "\n", + "suite.run(train_dataset, test_dataset)" + ] + }, + { + "cell_type": "markdown", + "id": "8d0eba67", + "metadata": {}, + "source": [ + "----------------\n", + "## Set Custom Condition Category" + ] + }, + { + "cell_type": "markdown", + "id": "5afdc396", + "metadata": {}, + "source": [ + "When writing your own condition logic, you can decide to mark a condition result as either fail or warn, by passing the category to the `ConditionResult` object.\n", + "For example we can even write condition which sets the category based on severity of the result:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b6ce24a3", + "metadata": {}, + "outputs": [], + "source": [ + "from deepchecks import ConditionResult, ConditionCategory\n", + "\n", + "# Our parameters for the condition\n", + "low_threshold = 0.3\n", + "high_threshold = 0.7\n", + "\n", + "# Create the condition function for check `DatasetsSizeComparison`\n", + "def custom_condition(value: dict): \n", + " ratio = value['Test'] / value['Train']\n", + " if low_threshold <= ratio <= high_threshold:\n", + " return ConditionResult(True)\n", + " elif ratio < low_threshold:\n", + " return ConditionResult(False, f'Test-Train ratio is {ratio:.2}', ConditionCategory.FAIL)\n", + " else:\n", + " return ConditionResult(False, f'Test-Train ratio is {ratio:.2}', ConditionCategory.WARN)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}