Quick start model evaluation + new dataset (#1726)

* Part 1 - converting tree leaves into filters * Part 1 - improvements * Apply suggestions from code review Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> * Part 1 - pr comments * Apply suggestions from code review Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> * Part 1 - pr comments v2 * Weak segment performance check - without display and docs * merge with master * Improve run be removing unnecessary operations * Edge cases - small or empty datasets * Display for check * Pr comments * display categorical features * Massages + import fix * import fix ver 2 * example page * example page v2 * pr comments * fixed CheckResultJson * pylint * Nir comments * plot name * docstring * bla * pr comments * Apply suggestions from code review Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com> * more pr comments * isort * fix tutorial bug * added a dataset * link to API reference * model eval quick start * shir comments * quick start * changes * comments fixed * text * condition to check * bressler's pr comments * Apply suggestions from code review Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com> Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com> Co-authored-by: Itay Gabbay <itay@deepchecks.com> Co-authored-by: Yurii Romanyshyn <yurii.romanyshyn@starnavi.io> Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com>
deepchecks · Jul 7, 2022 · eef382e · eef382e
1 parent 6a33c73
commit eef382e
Show file tree

Hide file tree

Showing 14 changed files with 355 additions and 65 deletions.
diff --git a/deepchecks/tabular/checks/model_evaluation/model_error_analysis.py b/deepchecks/tabular/checks/model_evaluation/model_error_analysis.py
@@ -14,8 +14,7 @@
 
 from sklearn import preprocessing
 
-from deepchecks import CheckFailure
-from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
+from deepchecks.core import CheckFailure, CheckResult, ConditionCategory, ConditionResult
 from deepchecks.core.errors import DeepchecksProcessError
 from deepchecks.tabular import Context, Dataset, TrainTestCheck
 from deepchecks.tabular.utils.task_type import TaskType

diff --git a/deepchecks/tabular/checks/model_evaluation/train_test_prediction_drift.py b/deepchecks/tabular/checks/model_evaluation/train_test_prediction_drift.py
@@ -16,8 +16,7 @@
 import numpy as np
 import pandas as pd
 
-from deepchecks import ConditionCategory
-from deepchecks.core import CheckResult, ConditionResult
+from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
 from deepchecks.tabular import Context, TrainTestCheck
 from deepchecks.tabular.utils.task_type import TaskType
 from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,

diff --git a/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py b/deepchecks/tabular/checks/model_evaluation/weak_segments_performance.py
@@ -21,11 +21,10 @@
 from sklearn.model_selection import GridSearchCV
 from sklearn.tree import DecisionTreeRegressor
 
-from deepchecks import ConditionCategory, ConditionResult, Dataset
-from deepchecks.core import CheckResult
+from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
 from deepchecks.core.check_result import DisplayMap
 from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
-from deepchecks.tabular import Context, SingleDatasetCheck
+from deepchecks.tabular import Context, Dataset, SingleDatasetCheck
 from deepchecks.tabular.context import _DummyModel
 from deepchecks.tabular.utils.task_type import TaskType
 from deepchecks.utils.dataframes import default_fill_na_per_column_type

diff --git a/deepchecks/tabular/context.py b/deepchecks/tabular/context.py
@@ -14,8 +14,7 @@
 import numpy as np
 import pandas as pd
 
-from deepchecks import CheckFailure, CheckResult
-from deepchecks.core import DatasetKind
+from deepchecks.core import CheckFailure, CheckResult, DatasetKind
 from deepchecks.core.errors import (DatasetValidationError, DeepchecksNotSupportedError, DeepchecksValueError,
                                     ModelValidationError)
 from deepchecks.tabular._shared_docs import docstrings

diff --git a/deepchecks/tabular/datasets/regression/__init__.py b/deepchecks/tabular/datasets/regression/__init__.py
@@ -9,6 +9,6 @@
 # ----------------------------------------------------------------------------
 #
 """Module for working with pre-built regression datasets."""
-from . import avocado
+from . import avocado, wine_quality
 
-__all__ = ['avocado']
+__all__ = ['avocado', 'wine_quality']
diff --git a/deepchecks/tabular/datasets/regression/wine_quality.py b/deepchecks/tabular/datasets/regression/wine_quality.py
@@ -0,0 +1,126 @@
+# ----------------------------------------------------------------------------
+# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
+#
+# This file is part of Deepchecks.
+# Deepchecks is distributed under the terms of the GNU Affero General
+# Public License (version 3 or later).
+# You should have received a copy of the GNU Affero General Public License
+# along with Deepchecks.  If not, see <http://www.gnu.org/licenses/>.
+# ----------------------------------------------------------------------------
+#
+"""The wine quality dataset contains data on different wines and their overall quality."""
+import typing as t
+from urllib.request import urlopen
+
+import joblib
+import pandas as pd
+import sklearn
+from category_encoders import OneHotEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+
+from deepchecks.tabular.dataset import Dataset
+
+# Public API of this module.
+__all__ = ['load_data', 'load_fitted_model']
+# Remote locations of the pre-trained model and of the full / train / test CSV files.
+_MODEL_URL = 'https://ndownloader.figshare.com/files/36146916'
+_FULL_DATA_URL = 'https://ndownloader.figshare.com/files/36146853'
+_TRAIN_DATA_URL = 'https://ndownloader.figshare.com/files/36146856'
+_TEST_DATA_URL = 'https://ndownloader.figshare.com/files/36146859'
+# load_fitted_model() only downloads the pre-trained model when the installed
+# sklearn version string equals this value; otherwise it fits a new model.
+_MODEL_VERSION = '1.0.2'
+# Name of the label column passed to Dataset(label=...).
+_target = 'quality'
+# No categorical features are declared for this dataset; all features are numeric.
+_CAT_FEATURES = []
+_NUM_FEATURES = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
+                 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
+                 'pH', 'sulphates', 'alcohol']
+
+
+def load_data(data_format: str = 'Dataset', as_train_test: bool = True) -> \
+        t.Union[t.Tuple, t.Union[Dataset, pd.DataFrame]]:
+    """Load and return the Wine Quality dataset (regression).
+
+    The data has 1599 records with 11 features and one ordinal target column, referring to the overall quality
+    of a specific wine. See https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009
+    for additional information.
+
+    The typical ML task in this dataset is to build a model that predicts the overall quality of a wine.
+
+    This dataset is licensed under the Open Data Commons Open Database License (ODbL) v1.0
+    (https://opendatacommons.org/licenses/odbl/1-0/).
+    Rights reserved to P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
+    Modeling wine preferences by data mining from physicochemical properties.
+    In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
+
+    The data is read with ``pd.read_csv`` from the module-level data URLs on every call.
+
+    Parameters
+    ----------
+    data_format : str , default: Dataset
+        Represent the format of the returned value. Can be 'Dataset'|'Dataframe'
+        'Dataset' will return the data as a Dataset object
+        'Dataframe' will return the data as a pandas Dataframe object
+        Note: only the exact value 'Dataset' is checked; any other value returns a pandas DataFrame.
+    as_train_test : bool , default: True
+        If True, the returned data is split into train and test exactly like the toy model
+        was trained. The first return value is the train data and the second is the test data.
+        In order to get this model, call the load_fitted_model() function.
+        Otherwise, returns a single object.
+
+    Returns
+    -------
+    dataset : Union[deepchecks.Dataset, pd.DataFrame]
+        the data object, corresponding to the data_format attribute.
+    train_data, test_data : Tuple[Union[deepchecks.Dataset, pd.DataFrame], Union[deepchecks.Dataset, pd.DataFrame]]
+        tuple if as_train_test = True. Tuple of two objects representing the dataset split into train and test sets.
+    """
+    if not as_train_test:
+        # Single, unsplit table.
+        dataset = pd.read_csv(_FULL_DATA_URL)
+
+        if data_format == 'Dataset':
+            dataset = Dataset(dataset, label=_target, cat_features=_CAT_FEATURES)
+
+        return dataset
+    else:
+        # Pre-split train / test tables, each read from its own URL.
+        train = pd.read_csv(_TRAIN_DATA_URL)
+        test = pd.read_csv(_TEST_DATA_URL)
+
+        if data_format == 'Dataset':
+            train = Dataset(train, label=_target, cat_features=_CAT_FEATURES)
+            test = Dataset(test, label=_target, cat_features=_CAT_FEATURES)
+
+        return train, test
+
+
+def load_fitted_model(pretrained=True):
+    """Load and return a fitted regression model to predict the quality in the Wine Quality dataset.
+
+    Parameters
+    ----------
+    pretrained : bool , default: True
+        If True and the installed sklearn version equals ``_MODEL_VERSION``, the model is
+        downloaded from ``_MODEL_URL`` and loaded with joblib. In every other case a new
+        model is built with ``_build_model()`` and fitted on the train split returned by
+        ``load_data()``.
+
+    Returns
+    -------
+    model
+        the model/pipeline that was trained on the Wine Quality dataset: either the object
+        loaded from the downloaded file, or the freshly fitted pipeline.
+
+    Notes
+    -----
+    When a new model is fitted it is also written to ``'wine_quality_model.sav'``
+    (a relative path) using ``joblib.dump``.
+    """
+    if sklearn.__version__ == _MODEL_VERSION and pretrained:
+        # NOTE(review): joblib.load unpickles the downloaded bytes, which can execute
+        # arbitrary code - _MODEL_URL must remain a trusted source.
+        with urlopen(_MODEL_URL) as f:
+            model = joblib.load(f)
+    else:
+        model = _build_model()
+        train, _ = load_data()
+        model.fit(train.data[train.features], train.data[train.label_name])
+        # NOTE(review): this writes a file next to wherever the caller is running; it looks
+        # like a leftover from producing the hosted model - confirm it is intended.
+        joblib.dump(model, 'wine_quality_model.sav')
+    return model
+
+
+def _build_model():
+    """Build the (unfitted) model pipeline to fit.
+
+    The pipeline has two steps:
+
+    * ``'preprocessor'`` - a ColumnTransformer that applies a median SimpleImputer followed by
+      a StandardScaler to ``_NUM_FEATURES``, and a OneHotEncoder to ``_CAT_FEATURES``.
+    * ``'classifier'`` - a RandomForestRegressor with ``random_state=0``, ``max_depth=7`` and
+      ``n_estimators=30``.
+
+    Returns
+    -------
+    sklearn.pipeline.Pipeline
+        the pipeline, not yet fitted.
+    """
+    # Note: the final step is named 'classifier' although it holds a regressor.
+    return Pipeline(steps=[
+        ('preprocessor',
+         ColumnTransformer(transformers=[('num',
+                                          Pipeline(steps=[('imputer',
+                                                           SimpleImputer(strategy='median')),
+                                                          ('scaler',
+                                                           StandardScaler())]),
+                                          _NUM_FEATURES),
+                                         ('cat', OneHotEncoder(),
+                                          _CAT_FEATURES)])),
+        ('classifier', RandomForestRegressor(random_state=0, max_depth=7, n_estimators=30))
+    ])
diff --git a/deepchecks/utils/single_sample_metrics.py b/deepchecks/utils/single_sample_metrics.py
@@ -16,8 +16,8 @@
 from sklearn import metrics
 from sklearn.preprocessing import LabelBinarizer
 
-from deepchecks import Dataset
 from deepchecks.core.errors import DeepchecksNotImplementedError
+from deepchecks.tabular import Dataset
 from deepchecks.tabular.utils.task_type import TaskType
 
 

diff --git a/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py b/docs/source/checks/tabular/model_evaluation/plot_weak_segments_performance.py
@@ -1,38 +1,41 @@
 # -*- coding: utf-8 -*-
 """
 Weak Segments Performance
-********************************
+*************************
 
 This notebooks provides an overview for using and understanding the weak segment performance check.
 
 **Structure:**
 
 * `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__
+* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
 * `Generate data & model <#generate-data-model>`__
 * `Run the check <#run-the-check>`__
 * `Define a condition <#define-a-condition>`__
 
 What is the purpose of the check?
-=================================
+==================================
 
 The check is designed to help you easily identify the model's weakest segments in the data provided. In addition,
 it enables to provide a sublist of the Dataset's features, thus limiting the check to search in
 interesting subspaces.
 
-How Deepchecks automatically detects weak segments
-------------------------------------
+Automatically detecting weak segments
+=====================================
 
 The check contains several steps:
 
-#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE.
+#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according
+   to the task type.
 
-#. Select a subset of features for the the weak segment search. This is done by selecting the features with the highest feature importance to the model provided (within the features selected for check, if limited).
+#. Select a subset of features for the weak segment search. This is done by selecting the features with the
+   highest feature importance to the model provided (within the features selected for check, if limited).
 
-#. We train multiple simple tree based models, each one is trained using exactly two features (out of the ones selected above) to predict the per sample error calculated before.
+#. We train multiple simple tree-based models, each trained using exactly two
+   features (out of the ones selected above) to predict the per sample error calculated before.
 
-#. We convert each of the leafs in each of the trees into a segment and calculate the segment's performance.
-
-#. For the model's weakest segments detected we calculate bins for the remaining of the data and calculate the model's
+#. We convert each of the leaves in each of the trees into a segment and calculate the segment's performance. For the
+   weakest segments detected we also calculate the model's performance on data segments surrounding them.
 """
 #%%
 # Generate data & model
@@ -65,8 +68,7 @@
 # ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called
 # "Other". This parameter determines the frequency threshold for categories to be mapped into to the "other" category.
 #
-# for additional information on the check's parameters, please refer to the API reference of the check
-# :class:`deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance`.
+# See :class:`deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance` for more details.
 
 from deepchecks.tabular.datasets.classification import phishing
 from deepchecks.tabular.checks import WeakSegmentsPerformance
@@ -75,7 +77,7 @@
 scorer = {'f1': make_scorer(f1_score, average='micro')}
 _, test_ds = phishing.load_data()
 model = phishing.load_fitted_model()
-check = WeakSegmentsPerformance(columns= ['urlLength', 'numTitles', 'ext', 'entropy'],
+check = WeakSegmentsPerformance(columns=['urlLength', 'numTitles', 'ext', 'entropy'],
                                 alternative_scorer=scorer,
                                 segment_minimum_size_ratio=0.03,
                                 categorical_aggregation_threshold=0.05)

diff --git a/docs/source/getting-started/welcome.rst b/docs/source/getting-started/welcome.rst
@@ -52,11 +52,11 @@ Tabular Data
 
 Head over to one of our following quickstart tutorials, and have deepchecks running on your environment in less than 5 min:
 
-- :doc:`Train-Test Validation Quickstart (loans data) </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`
+- :doc:`Data Integrity Quickstart </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`
 
-- :doc:`Data Integrity Quickstart (avocado sales data) </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`
+- :doc:`Train-Test Validation Quickstart </user-guide/tabular/auto_tutorials/plot_quick_train_test_validation>`
 
-- :doc:`Full Suite (many checks) Quickstart (iris data) </user-guide/tabular/auto_tutorials/plot_quickstart_in_5_minutes>`
+- :doc:`Model Evaluation Quickstart </user-guide/tabular/auto_tutorials/plot_quick_model_evaluation>`
 
  **Recommended - download the code and run it locally** on the built-in dataset and (optional) model, or **replace them with your own**.
 

diff --git a/docs/source/user-guide/tabular/tutorials/plot_quick_data_integrity.py b/docs/source/user-guide/tabular/tutorials/plot_quick_data_integrity.py
@@ -2,8 +2,8 @@
 """
 .. _quick_data_integrity:
 
-Quickstart - Data Integrity Suite (Avocado Sales Data)
-*******************************************************
+Quickstart - Data Integrity Suite
+*********************************
 
 The deepchecks integrity suite is relevant any time you have data that you wish to validate:
 whether it's on a fresh batch of data, or right before splitting it or using it for training. 
@@ -13,9 +13,11 @@
 
 .. code-block:: bash
 
-    # Before we start, if you don't have deepchecks installed yet,
-    # make sure to run:
-    pip install deepchecks -U --quiet #--user
+    # Before we start, if you don't have deepchecks installed yet, run:
+    import sys
+    !{sys.executable} -m pip install deepchecks -U --quiet
+
+    # or install using pip from your python environment
 """
 
 #%%
@@ -49,7 +51,7 @@ def add_dirty_data(df):
 # Run Deepchecks for Data Integrity
 # ====================================
 #
-# Define a Dataset Object
+# Create a Dataset Object
 # ------------------------
 #
 # Create a deepchecks Dataset, including the relevant metadata (label, date, index, etc.).
@@ -58,12 +60,12 @@ def add_dirty_data(df):
 
 from deepchecks.tabular import Dataset
 
-# We state the categorical features, otherwise they will be automatically inferred,
-# which may be less accurate, therefore stating them explicitly is recommended.
+# Categorical features can be heuristically inferred; however, we
+# recommend stating them explicitly to avoid misclassification.
 
-# The label can be passed as a column name or a separate pd.Series / pd.DataFrame
+# Metadata attributes are optional. Some checks will run only if specific attributes are declared.
 
-ds = Dataset(dirty_df, cat_features = ['type'], datetime_name='Date', label = 'AveragePrice')
+ds = Dataset(dirty_df, cat_features= ['type'], datetime_name='Date', label= 'AveragePrice')
 
 #%%
 # Run the Deepchecks Suite
@@ -83,7 +85,7 @@ def add_dirty_data(df):
 suite_result = integ_suite.run(ds)
 # Note: the result can be saved as html using suite_result.save_as_html()
 # or exported to json using suite_result.to_json()
-suite_result
+suite_result.show()
 
 #%%
 # We can inspect the suite outputs and see that there are a few problems we'd like to fix.
@@ -105,7 +107,7 @@ def add_dirty_data(df):
 # we can also add a condition:
 single_value_with_condition = IsSingleValue().add_condition_not_single_value()
 result = single_value_with_condition.run(ds)
-result
+result.show()
 
 #%%
 
@@ -118,7 +120,7 @@ def add_dirty_data(df):
 
 ds.data.drop('Is Ripe', axis=1, inplace=True)
 result = single_value_with_condition.run(ds)
-result
+result.show()
 
 #%%
 
@@ -128,7 +130,7 @@ def add_dirty_data(df):
 dirty_df.drop('Is Ripe', axis=1, inplace=True)
 ds = Dataset(dirty_df, cat_features=['type'], datetime_name='Date', label='AveragePrice')
 result = DataDuplicates().add_condition_ratio_less_or_equal(0).run(ds)
-result
+result.show()
 
 #%%
 # Rerun Suite on the Fixed Dataset
@@ -159,4 +161,4 @@ def add_dirty_data(df):
 # Additional Outputs section*
 #
 # For more info about working with conditions, see the detailed
-# :doc:`/user-guide/general/customizations/examples/plot_configure_checks_conditions` guide.
+# :doc:`/user-guide/general/customizations/examples/plot_configure_check_conditions` guide.