Skip to content

Commit

Permalink
Quick start model evaluation + new dataset (#1726)
Browse files Browse the repository at this point in the history
* Part 1 - converting tree leaves into filters

* Part 1 - improvements

* Apply suggestions from code review

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>

* Part 1 - pr comments

* Apply suggestions from code review

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>

* Part 1 - pr comments v2

* Weak segment performance check - without display and docs

* merge with master

* Improve run by removing unnecessary operations

* Edge cases - small or empty datasets

* Display for check

* Pr comments

* display categorical features

* Messages + import fix

* import fix ver 2

* example page

* example page v2

* pr comments

* fixed CheckResultJson

* pylint

* Nir comments

* plot name

* docstring

* bla

* pr comments

* Apply suggestions from code review

Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com>

* more pr comments

* isort

* fix tutorial bug

* added a dataset

* link to API reference

* model eval quick start

* shir comments

* quick start

* changes

* comments fixed

* text

* condition to check

* bressler's pr comments

* Apply suggestions from code review

Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com>

Co-authored-by: Nir Hutnik <92314933+nirhutnik@users.noreply.github.com>
Co-authored-by: Itay Gabbay <itay@deepchecks.com>
Co-authored-by: Yurii Romanyshyn <yurii.romanyshyn@starnavi.io>
Co-authored-by: shir22 <33841818+shir22@users.noreply.github.com>
  • Loading branch information
5 people committed Jul 7, 2022
1 parent 6a33c73 commit eef382e
Show file tree
Hide file tree
Showing 14 changed files with 355 additions and 65 deletions.
Expand Up @@ -14,8 +14,7 @@

from sklearn import preprocessing

from deepchecks import CheckFailure
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core import CheckFailure, CheckResult, ConditionCategory, ConditionResult
from deepchecks.core.errors import DeepchecksProcessError
from deepchecks.tabular import Context, Dataset, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
Expand Down
Expand Up @@ -16,8 +16,7 @@
import numpy as np
import pandas as pd

from deepchecks import ConditionCategory
from deepchecks.core import CheckResult, ConditionResult
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.tabular import Context, TrainTestCheck
from deepchecks.tabular.utils.task_type import TaskType
from deepchecks.utils.distribution.drift import (SUPPORTED_CATEGORICAL_METHODS, SUPPORTED_NUMERIC_METHODS,
Expand Down
Expand Up @@ -21,11 +21,10 @@
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

from deepchecks import ConditionCategory, ConditionResult, Dataset
from deepchecks.core import CheckResult
from deepchecks.core import CheckResult, ConditionCategory, ConditionResult
from deepchecks.core.check_result import DisplayMap
from deepchecks.core.errors import DeepchecksNotSupportedError, DeepchecksProcessError
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.tabular import Context, Dataset, SingleDatasetCheck
from deepchecks.tabular.context import _DummyModel
from deepchecks.tabular.utils.task_type import TaskType
from deepchecks.utils.dataframes import default_fill_na_per_column_type
Expand Down
3 changes: 1 addition & 2 deletions deepchecks/tabular/context.py
Expand Up @@ -14,8 +14,7 @@
import numpy as np
import pandas as pd

from deepchecks import CheckFailure, CheckResult
from deepchecks.core import DatasetKind
from deepchecks.core import CheckFailure, CheckResult, DatasetKind
from deepchecks.core.errors import (DatasetValidationError, DeepchecksNotSupportedError, DeepchecksValueError,
ModelValidationError)
from deepchecks.tabular._shared_docs import docstrings
Expand Down
4 changes: 2 additions & 2 deletions deepchecks/tabular/datasets/regression/__init__.py
Expand Up @@ -9,6 +9,6 @@
# ----------------------------------------------------------------------------
#
"""Module for working with pre-built regression datasets."""
from . import avocado
from . import avocado, wine_quality

__all__ = ['avocado']
__all__ = ['avocado', 'wine_quality']
126 changes: 126 additions & 0 deletions deepchecks/tabular/datasets/regression/wine_quality.py
@@ -0,0 +1,126 @@
# ----------------------------------------------------------------------------
# Copyright (C) 2021-2022 Deepchecks (https://www.deepchecks.com)
#
# This file is part of Deepchecks.
# Deepchecks is distributed under the terms of the GNU Affero General
# Public License (version 3 or later).
# You should have received a copy of the GNU Affero General Public License
# along with Deepchecks. If not, see <http://www.gnu.org/licenses/>.
# ----------------------------------------------------------------------------
#
"""The wine quality dataset contains data on different wines and their overall quality."""
import typing as t
from urllib.request import urlopen

import joblib
import pandas as pd
import sklearn
from category_encoders import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from deepchecks.tabular.dataset import Dataset

# Public API of this module.
__all__ = ['load_data', 'load_fitted_model']
# Download locations (figshare) for the pickled model and for the CSV files read by load_data().
_MODEL_URL = 'https://ndownloader.figshare.com/files/36146916'
_FULL_DATA_URL = 'https://ndownloader.figshare.com/files/36146853'
_TRAIN_DATA_URL = 'https://ndownloader.figshare.com/files/36146856'
_TEST_DATA_URL = 'https://ndownloader.figshare.com/files/36146859'
# load_fitted_model() downloads the hosted model only when the installed
# scikit-learn version equals this string exactly; otherwise it retrains.
_MODEL_VERSION = '1.0.2'
# Name of the label column passed to Dataset(label=...).
_target = 'quality'
# Columns passed as cat_features / to the 'cat' transformer; none are declared for this dataset.
_CAT_FEATURES = []
# Columns routed to the numeric ('num') branch of the preprocessing pipeline in _build_model().
_NUM_FEATURES = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
'pH', 'sulphates', 'alcohol']


def load_data(data_format: str = 'Dataset', as_train_test: bool = True) -> \
        t.Union[t.Tuple, t.Union[Dataset, pd.DataFrame]]:
    """Load and return the Wine Quality dataset (regression).

    The data has 1599 records with 11 features and one ordinal target column, referring to the overall quality
    of a specific wine. See https://www.kaggle.com/datasets/uciml/red-wine-quality-cortez-et-al-2009
    for additional information.

    The typical ML task in this dataset is to build a model that predicts the overall quality of a wine.

    This dataset is licensed under the Open Data Commons Open Database License (ODbL) v1.0
    (https://opendatacommons.org/licenses/odbl/1-0/).

    Rights reserved to P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
    Modeling wine preferences by data mining from physicochemical properties.
    In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

    Parameters
    ----------
    data_format : str , default: Dataset
        Represent the format of the returned value. Can be 'Dataset'|'Dataframe' (case-insensitive)
        'Dataset' will return the data as a Dataset object
        'Dataframe' will return the data as a pandas Dataframe object
    as_train_test : bool , default: True
        If True, the returned data is split into train and test exactly like the toy model
        was trained. The first return value is the train data and the second is the test data.
        In order to get this model, call the load_fitted_model() function.
        Otherwise, returns a single object.

    Returns
    -------
    dataset : Union[deepchecks.Dataset, pd.DataFrame]
        the data object, corresponding to the data_format attribute.
    train_data, test_data : Tuple[Union[deepchecks.Dataset, pd.DataFrame], Union[deepchecks.Dataset, pd.DataFrame]]
        tuple if as_train_test = True. Tuple of two objects representing the dataset split into
        train and test sets.

    Raises
    ------
    ValueError
        If data_format is neither 'Dataset' nor 'Dataframe'.
    """
    # Validate before any download so a misspelled format fails fast instead of
    # silently falling back to a raw DataFrame.
    normalized_format = data_format.lower()
    if normalized_format not in ('dataset', 'dataframe'):
        raise ValueError(
            f"data_format must be either 'Dataset' or 'Dataframe', got {data_format!r}"
        )
    as_dataset = normalized_format == 'dataset'

    if not as_train_test:
        full = pd.read_csv(_FULL_DATA_URL)
        if as_dataset:
            return Dataset(full, label=_target, cat_features=_CAT_FEATURES)
        return full

    train = pd.read_csv(_TRAIN_DATA_URL)
    test = pd.read_csv(_TEST_DATA_URL)
    if as_dataset:
        train = Dataset(train, label=_target, cat_features=_CAT_FEATURES)
        test = Dataset(test, label=_target, cat_features=_CAT_FEATURES)
    return train, test


def load_fitted_model(pretrained=True):
    """Load and return a fitted regression model to predict the quality in the Wine Quality dataset.

    Parameters
    ----------
    pretrained : bool , default: True
        If True and the installed scikit-learn version equals ``_MODEL_VERSION`` exactly, the
        hosted pre-trained model is downloaded. In every other case a fresh pipeline is built and
        fitted on the train split returned by load_data().

    Returns
    -------
    model : sklearn.pipeline.Pipeline
        the model/pipeline that was trained on the Wine Quality dataset. (The downloaded object is
        presumably the same kind of pipeline as the one built locally - not verifiable from here.)
    """
    if pretrained and sklearn.__version__ == _MODEL_VERSION:
        # NOTE(review): joblib.load unpickles whatever the URL serves, which can execute
        # arbitrary code; this is only acceptable while _MODEL_URL stays a trusted location.
        with urlopen(_MODEL_URL) as f:
            return joblib.load(f)

    # Version mismatch (or pretrained=False): retrain locally rather than risk loading a
    # pickle with a different scikit-learn version.
    model = _build_model()
    train, _ = load_data()
    model.fit(train.data[train.features], train.data[train.label_name])
    # The fitted model is intentionally NOT dumped to disk: a loader should not leave
    # artifacts in (or fail because of) the caller's working directory.
    return model


def _build_model():
    """Build the unfitted pipeline: column-wise preprocessing followed by a random forest regressor."""
    # Numeric columns: fill missing values with the median, then standardize.
    numeric_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    column_preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_pipeline, _NUM_FEATURES),
        ('cat', OneHotEncoder(), _CAT_FEATURES),
    ])
    forest = RandomForestRegressor(random_state=0, max_depth=7, n_estimators=30)
    # The final step keeps the name 'classifier' even though the estimator is a regressor:
    # step names are part of the pipeline's parameter keys.
    return Pipeline(steps=[
        ('preprocessor', column_preprocessor),
        ('classifier', forest),
    ])
2 changes: 1 addition & 1 deletion deepchecks/utils/single_sample_metrics.py
Expand Up @@ -16,8 +16,8 @@
from sklearn import metrics
from sklearn.preprocessing import LabelBinarizer

from deepchecks import Dataset
from deepchecks.core.errors import DeepchecksNotImplementedError
from deepchecks.tabular import Dataset
from deepchecks.tabular.utils.task_type import TaskType


Expand Down
@@ -1,38 +1,41 @@
# -*- coding: utf-8 -*-
"""
Weak Segments Performance
********************************
*************************
This notebook provides an overview for using and understanding the weak segment performance check.
**Structure:**
* `What is the purpose of the check? <#what-is-the-purpose-of-the-check>`__
* `Automatically detecting weak segments <#automatically-detecting-weak-segments>`__
* `Generate data & model <#generate-data-model>`__
* `Run the check <#run-the-check>`__
* `Define a condition <#define-a-condition>`__
What is the purpose of the check?
=================================
==================================
The check is designed to help you easily identify the model's weakest segments in the data provided. In addition,
it lets you provide a sublist of the Dataset's features, thus limiting the check to searching in
interesting subspaces.
How Deepchecks automatically detects weak segments
------------------------------------
Automatically detecting weak segments
=====================================
The check contains several steps:
#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE.
#. We calculate loss for each sample in the dataset using the provided model via either log-loss or MSE according
to the task type.
#. Select a subset of features for the the weak segment search. This is done by selecting the features with the highest feature importance to the model provided (within the features selected for check, if limited).
#. Select a subset of features for the weak segment search. This is done by selecting the features with the
highest feature importance to the model provided (within the features selected for check, if limited).
#. We train multiple simple tree based models, each one is trained using exactly two features (out of the ones selected above) to predict the per sample error calculated before.
#. We train multiple simple tree based models, each one is trained using exactly two
features (out of the ones selected above) to predict the per sample error calculated before.
#. We convert each of the leafs in each of the trees into a segment and calculate the segment's performance.
#. For the model's weakest segments detected we calculate bins for the remaining of the data and calculate the model's
#. We convert each of the leaves in each of the trees into a segment and calculate the segment's performance. For the
weakest segments detected we also calculate the model's performance on data segments surrounding them.
"""
#%%
# Generate data & model
Expand Down Expand Up @@ -65,8 +68,7 @@
# ``categorical_aggregation_threshold``: By default the check will combine rare categories into a single category called
# "Other". This parameter determines the frequency threshold for categories to be mapped into the "Other" category.
#
# for additional information on the check's parameters, please refer to the API reference of the check
# :class:`deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance`.
# see :class:`deepchecks.tabular.checks.model_evaluation.WeakSegmentsPerformance` for more details.

from deepchecks.tabular.datasets.classification import phishing
from deepchecks.tabular.checks import WeakSegmentsPerformance
Expand All @@ -75,7 +77,7 @@
scorer = {'f1': make_scorer(f1_score, average='micro')}
_, test_ds = phishing.load_data()
model = phishing.load_fitted_model()
check = WeakSegmentsPerformance(columns= ['urlLength', 'numTitles', 'ext', 'entropy'],
check = WeakSegmentsPerformance(columns=['urlLength', 'numTitles', 'ext', 'entropy'],
alternative_scorer=scorer,
segment_minimum_size_ratio=0.03,
categorical_aggregation_threshold=0.05)
Expand Down
6 changes: 3 additions & 3 deletions docs/source/getting-started/welcome.rst
Expand Up @@ -52,11 +52,11 @@ Tabular Data

Head over to one of the following quickstart tutorials and have deepchecks running in your environment in less than 5 minutes:

- :doc:`Train-Test Validation Quickstart (loans data) </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`
- :doc:`Data Integrity Quickstart </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`

- :doc:`Data Integrity Quickstart (avocado sales data) </user-guide/tabular/auto_tutorials/plot_quick_data_integrity>`
- :doc:`Train-Test Validation Quickstart </user-guide/tabular/auto_tutorials/plot_quick_train_test_validation>`

- :doc:`Full Suite (many checks) Quickstart (iris data) </user-guide/tabular/auto_tutorials/plot_quickstart_in_5_minutes>`
- :doc:`Model Evaluation Quickstart </user-guide/tabular/auto_tutorials/plot_quick_model_evaluation>`

**Recommended - download the code and run it locally** on the built-in dataset and (optional) model, or **replace them with your own**.

Expand Down
Expand Up @@ -2,8 +2,8 @@
"""
.. _quick_data_integrity:
Quickstart - Data Integrity Suite (Avocado Sales Data)
*******************************************************
Quickstart - Data Integrity Suite
*********************************
The deepchecks integrity suite is relevant any time you have data that you wish to validate:
whether it's on a fresh batch of data, or right before splitting it or using it for training.
Expand All @@ -13,9 +13,11 @@
.. code-block:: bash
# Before we start, if you don't have deepchecks installed yet,
# make sure to run:
pip install deepchecks -U --quiet #--user
# Before we start, if you don't have deepchecks installed yet, run:
import sys
!{sys.executable} -m pip install deepchecks -U --quiet
# or install using pip from your python environment
"""

#%%
Expand Down Expand Up @@ -49,7 +51,7 @@ def add_dirty_data(df):
# Run Deepchecks for Data Integrity
# ====================================
#
# Define a Dataset Object
# Create a Dataset Object
# ------------------------
#
# Create a deepchecks Dataset, including the relevant metadata (label, date, index, etc.).
Expand All @@ -58,12 +60,12 @@ def add_dirty_data(df):

from deepchecks.tabular import Dataset

# We state the categorical features, otherwise they will be automatically inferred,
# which may be less accurate, therefore stating them explicitly is recommended.
# Categorical features can be heuristically inferred; however, we
# recommend stating them explicitly to avoid misclassification.

# The label can be passed as a column name or a separate pd.Series / pd.DataFrame
# Metadata attributes are optional. Some checks will run only if specific attributes are declared.

ds = Dataset(dirty_df, cat_features = ['type'], datetime_name='Date', label = 'AveragePrice')
ds = Dataset(dirty_df, cat_features= ['type'], datetime_name='Date', label= 'AveragePrice')

#%%
# Run the Deepchecks Suite
Expand All @@ -83,7 +85,7 @@ def add_dirty_data(df):
suite_result = integ_suite.run(ds)
# Note: the result can be saved as html using suite_result.save_as_html()
# or exported to json using suite_result.to_json()
suite_result
suite_result.show()

#%%
# We can inspect the suite outputs and see that there are a few problems we'd like to fix.
Expand All @@ -105,7 +107,7 @@ def add_dirty_data(df):
# we can also add a condition:
single_value_with_condition = IsSingleValue().add_condition_not_single_value()
result = single_value_with_condition.run(ds)
result
result.show()

#%%

Expand All @@ -118,7 +120,7 @@ def add_dirty_data(df):

ds.data.drop('Is Ripe', axis=1, inplace=True)
result = single_value_with_condition.run(ds)
result
result.show()

#%%

Expand All @@ -128,7 +130,7 @@ def add_dirty_data(df):
dirty_df.drop('Is Ripe', axis=1, inplace=True)
ds = Dataset(dirty_df, cat_features=['type'], datetime_name='Date', label='AveragePrice')
result = DataDuplicates().add_condition_ratio_less_or_equal(0).run(ds)
result
result.show()

#%%
# Rerun Suite on the Fixed Dataset
Expand Down Expand Up @@ -159,4 +161,4 @@ def add_dirty_data(df):
# Additional Outputs section*
#
# For more info about working with conditions, see the detailed
# :doc:`/user-guide/general/customizations/examples/plot_configure_checks_conditions` guide.
# :doc:`/user-guide/general/customizations/examples/plot_configure_check_conditions` guide.

0 comments on commit eef382e

Please sign in to comment.