Commit

Merge branch 'main' of https://github.com/deepchecks/MLChecks into 0.6.x

ItayGabbay committed Apr 28, 2022
2 parents 86b67f8 + e679b25 commit df80eb1
Showing 189 changed files with 2,516 additions and 954 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/docs.yaml
@@ -45,7 +45,8 @@ jobs:
sudo apt-get install pandoc dvisvgm texlive texlive-latex-extra
- name: Build documentation
run: make docs

- name: Validate examples
run: make validate-examples
- name: Upload documentation
if: github.event_name != 'pull_request'
uses: actions/upload-artifact@v2
19 changes: 3 additions & 16 deletions .gitignore
@@ -96,24 +96,11 @@ docs.error.log
*MNIST*

# build folders of sphinx gallery
docs/source/examples/vision/guides/examples/
docs/source/examples/general/examples/
docs/source/examples/vision/checks/distribution/examples/
docs/source/examples/vision/checks/performance/examples/
docs/source/examples/tabular/guides/examples/
docs/source/examples/tabular/checks/distribution/examples/
docs/source/examples/tabular/checks/overview/examples/
docs/source/examples/tabular/checks/integrity/examples/
docs/source/examples/tabular/checks/methodology/examples/
docs/source/examples/tabular/checks/performance/examples/
docs/source/examples/tabular/use-cases/examples/
docs/source/examples/tabular/checks/methodology/examples/
docs/source/tutorials/tabular/examples
docs/source/tutorials/vision/examples
docs/source/examples/vision/checks/methodology/examples/
docs/source/user-guide/general/customizations/examples/
docs/source/user-guide/general/exporting_results/examples/
docs/source/tutorials/tabular/examples/
docs/source/checks_gallery/
docs/source/auto_tutorials/tabular
docs/source/auto_tutorials/vision

# build artifacts from running docs (vision and wandb export)
docs/source/tutorials/vision/*.html
1 change: 1 addition & 0 deletions deepchecks/checks.py
@@ -39,6 +39,7 @@
'CategoryMismatchTrainTest',
'NewLabelTrainTest',
'LabelAmbiguity',
'OutlierSampleDetection',

# methodology checks
'BoostingOverfit',
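For orientation, the newly listed check becomes importable from the top-level checks module. A minimal usage sketch, assuming an existing deepchecks tabular Dataset named train_dataset (the dataset and call below are illustrative, not part of this commit):

from deepchecks.checks import OutlierSampleDetection

result = OutlierSampleDetection().run(train_dataset)  # single-dataset integrity check
result.show()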
75 changes: 58 additions & 17 deletions deepchecks/core/check_result.py
@@ -23,9 +23,9 @@
import numpy as np
import ipywidgets as widgets
import plotly.graph_objects as go
import plotly
from ipywidgets.embed import embed_minimal_html, dependency_state
from plotly.basedatatypes import BaseFigure
import plotly.io as pio
import plotly
from matplotlib import pyplot as plt
from IPython.display import display_html
from pandas.io.formats.style import Styler
@@ -34,7 +34,7 @@
from deepchecks.core.display_pandas import dataframe_to_html, get_conditions_table
from deepchecks.core.errors import DeepchecksValueError
from deepchecks.utils.dataframes import un_numpy
from deepchecks.utils.strings import get_docs_summary
from deepchecks.utils.strings import create_new_file_name, get_docs_summary, widget_to_html
from deepchecks.utils.ipython import is_notebook
from deepchecks.utils.wandb_utils import set_wandb_run_state

@@ -130,6 +130,7 @@ def display_check(self, unique_id: str = None, as_widget: bool = False,
"""
if as_widget:
box = widgets.VBox()
box.add_class('rendered_html')
box_children = []
check_html = ''
if unique_id:
@@ -185,14 +186,34 @@ def display_check(self, unique_id: str = None, as_widget: bool = False,
return box
display_html(check_html, raw=True)

def _repr_html_(self):
def _repr_html_(self, unique_id=None,
show_additional_outputs=True, requirejs: bool = False):
"""Return html representation of check result."""
html_out = io.StringIO()
widgeted_output = self.display_check(as_widget=True)
embed_minimal_html(html_out, views=[widgeted_output], requirejs=False,
embed_url=None, state=dependency_state(widgeted_output))
self.save_as_html(html_out, unique_id=unique_id,
show_additional_outputs=show_additional_outputs, requirejs=requirejs)
return html_out.getvalue()

def save_as_html(self, file=None, unique_id=None,
show_additional_outputs=True, requirejs: bool = True):
"""Save output as html file.
Parameters
----------
file : filename or file-like object
The file to write the HTML output to. If None writes to output.html
requirejs: bool , default: True
If to save with all javascript dependencies
"""
if file is None:
file = 'output.html'
widgeted_output = self.display_check(unique_id=unique_id,
show_additional_outputs=show_additional_outputs,
as_widget=True)
if isinstance(file, str):
file = create_new_file_name(file, 'html')
widget_to_html(widgeted_output, html_out=file, title=self.get_header(), requirejs=requirejs)

def _display_to_json(self) -> List[Tuple[str, str]]:
displays = []
old_backend = matplotlib.get_backend()
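A minimal usage sketch of the new save_as_html method, assuming result is a CheckResult obtained from running a check (the check class and file name below are illustrative):

from deepchecks.checks import DataDuplicates

result = DataDuplicates().run(train_dataset)        # train_dataset: an existing deepchecks Dataset
result.save_as_html('data_duplicates_report.html')  # requirejs=True bundles the JavaScript dependencies
result.save_as_html()                                # file=None falls back to output.html

For string paths, create_new_file_name is applied first, presumably to derive a non-clashing name rather than overwrite an existing file.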
@@ -362,7 +383,7 @@ def _get_metadata(self, with_doc_link: bool = False):
def _ipython_display_(self, unique_id=None, as_widget=False,
show_additional_outputs=True):
check_widget = self.display_check(unique_id=unique_id, as_widget=as_widget,
show_additional_outputs=show_additional_outputs,)
show_additional_outputs=show_additional_outputs)
if as_widget:
display_html(check_widget)

@@ -423,11 +444,27 @@ def priority(self) -> int:

return 4

def show(self, unique_id=None, show_additional_outputs=True):
"""Display check result."""
def show(self, show_additional_outputs=True, unique_id=None):
"""Display the check result.
Parameters
----------
show_additional_outputs : bool
Boolean that controls if to show additional outputs.
unique_id : str
The unique id given by the suite that displays the check.
"""
if is_notebook():
self._ipython_display_(unique_id=unique_id,
show_additional_outputs=show_additional_outputs)
self.display_check(unique_id=unique_id,
show_additional_outputs=show_additional_outputs)
elif 'sphinx_gallery' in pio.renderers.default:
html = self._repr_html_(unique_id=unique_id,
show_additional_outputs=show_additional_outputs)

class TempSphinx:
def _repr_html_(self):
return html
return TempSphinx()
else:
warnings.warn('You are running in a non-interactive python shell. In order to show the result you must use '
'an IPython shell (e.g. Jupyter)')
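For reference, a short sketch of the updated call with the reordered parameters; result is assumed to be a CheckResult from a previous run:

result.show()                               # rich widget output inside Jupyter/IPython
result.show(show_additional_outputs=False)  # the first positional argument is now show_additional_outputs

Under sphinx-gallery, show() now returns a small wrapper object whose _repr_html_ embeds the rendered widget, so gallery builds capture the same HTML output.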
@@ -464,7 +501,7 @@ def to_json(self, with_display: bool = True):
"""
result_json = self._get_metadata()
if with_display:
result_json['display'] = [('str', str(self.exception))]
result_json['display'] = [('html', f'<p style="color:red">{self.exception}</p>')]
return jsonpickle.dumps(result_json, unpicklable=False)

def to_wandb(self, dedicated_run: bool = True, **kwargs: Any):
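A small sketch of the effect on the serialized failure; the exception text is illustrative and failure is assumed to be a CheckFailure whose check raised ValueError('bad input'):

import jsonpickle

payload = jsonpickle.loads(failure.to_json())
payload['display']  # now [['html', '<p style="color:red">bad input</p>']] instead of a plain 'str' entry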
@@ -501,15 +538,19 @@ def _get_metadata(self, with_doc_link: bool = False):

def __repr__(self):
"""Return string representation."""
tb_str = traceback.format_exception(etype=type(self.exception), value=self.exception,
tb=self.exception.__traceback__)
return ''.join(tb_str)
return self.header + ': ' + str(self.exception)

def _ipython_display_(self):
"""Display the check failure."""
check_html = f'<h4>{self.header}</h4>'
if hasattr(self.check.__class__, '__doc__'):
summary = get_docs_summary(self.check)
check_html += f'<p>{summary}</p>'
check_html += f'<p style="color:red"> {self.exception}</p>'
check_html += f'<p style="color:red">{self.exception}</p>'
display_html(check_html, raw=True)

def print_traceback(self):
"""Print the traceback of the failure."""
tb_str = traceback.format_exception(etype=type(self.exception), value=self.exception,
tb=self.exception.__traceback__)
print(''.join(tb_str))
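With the shorter __repr__, the full traceback is now requested explicitly. A usage sketch, assuming suite_result comes from running a suite and that failed checks surface as CheckFailure objects in its results list:

from deepchecks.core.check_result import CheckFailure

for res in suite_result.results:
    if isinstance(res, CheckFailure):
        print(res)              # concise form: '<check header>: <exception message>'
        res.print_traceback()   # full traceback, which repr() used to return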
100 changes: 51 additions & 49 deletions deepchecks/core/check_utils/single_feature_contribution_utils.py
@@ -19,8 +19,41 @@
import plotly.graph_objects as go


def get_pps_figure(per_class: bool):
"""If per_class is True, then no title is defined on the figure."""
fig = go.Figure()
fig.update_layout(
yaxis_title='Predictive Power Score (PPS)',
yaxis_range=[0, 1.05],
legend=dict(x=1.0, y=1.0),
barmode='group',
width=800, height=500
)
if per_class:
fig.update_layout(xaxis_title='Class')
else:
fig.update_layout(
title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
xaxis_title='Column',
)
return fig


def pps_df_to_trace(s_pps: pd.Series, name: str):
"""If name is train/test use our defined colors, else will use plotly defaults."""
name = name.capitalize() if name else None
return go.Bar(x=s_pps.index,
y=s_pps,
name=name,
marker_color=colors.get(name),
text=s_pps.round(2),
textposition='outside'
)
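A short standalone sketch of how the two new helpers compose; the PPS values are made up for illustration and the import path follows this file's location:

import pandas as pd
from deepchecks.core.check_utils.single_feature_contribution_utils import get_pps_figure, pps_df_to_trace

s_pps_train = pd.Series({'age': 0.42, 'income': 0.13})
s_pps_test = pd.Series({'age': 0.40, 'income': 0.22})

fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(s_pps_train, 'train'))  # 'train'/'test' pick the shared deepchecks colors
fig.add_trace(pps_df_to_trace(s_pps_test, 'test'))
fig.show()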


def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Optional[Hashable], test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict, n_show_top: int):
test_label_name: Optional[Hashable], ppscore_params: dict, n_show_top: int,
random_state: int = None):
"""
Calculate the PPS for train, test and difference for single feature contribution checks.
@@ -42,18 +75,20 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
dictionary of additional parameters for the ppscore predictor function
n_show_top: int
Number of features to show, sorted by the magnitude of difference in PPS
random_state: int, default None
Random state for the ppscore.predictors function
Returns:
CheckResult
value: dictionaries of PPS values for train, test and train-test difference.
display: bar graph of the PPS of each feature.
"""
df_pps_train = pps.predictors(df=train_df, y=train_label_name,
random_seed=42,
random_seed=random_state,
**ppscore_params)
df_pps_test = pps.predictors(df=test_df,
y=test_label_name,
random_seed=42, **ppscore_params)
random_seed=random_state, **ppscore_params)

s_pps_train = df_pps_train.set_index('x', drop=True)['ppscore']
s_pps_test = df_pps_test.set_index('x', drop=True)['ppscore']
@@ -65,34 +100,16 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
s_pps_train_to_display = s_pps_train[s_difference_to_display.index]
s_pps_test_to_display = s_pps_test[s_difference_to_display.index]

fig = go.Figure()
fig.add_trace(go.Bar(x=s_pps_train_to_display.index,
y=s_pps_train_to_display,
name='Train',
marker_color=colors['Train'], text=s_pps_train_to_display.round(2), textposition='outside'
))
fig.add_trace(go.Bar(x=s_pps_test_to_display.index,
y=s_pps_test_to_display,
name='Test',
marker_color=colors['Test'], text=s_pps_test_to_display.round(2), textposition='outside'
))
fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(s_pps_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_pps_test_to_display, 'test'))
fig.add_trace(go.Scatter(x=s_difference_to_display.index,
y=s_difference_to_display,
name='Train-Test Difference (abs)',
marker=dict(symbol='circle', size=15),
line=dict(color='#aa57b5', width=5)
))

fig.update_layout(
title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
xaxis_title='Column',
yaxis_title='Predictive Power Score (PPS)',
yaxis_range=[0, 1.05],
legend=dict(x=1.0, y=1.0),
barmode='group',
width=800, height=500
)

ret_value = {'train': s_pps_train.to_dict(), 'test': s_pps_test.to_dict(),
'train-test difference': s_difference.to_dict()}

@@ -106,7 +123,8 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict,
n_show_top: int,
min_pps_to_show: float = 0.05):
min_pps_to_show: float = 0.05,
random_state: int = None):
"""
Calculate the PPS for train, test and difference for single feature contribution checks per class.
@@ -130,6 +148,8 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
Number of features to show, sorted by the magnitude of difference in PPS
min_pps_to_show: float, default 0.05
Minimum PPS to show a class in the graph
random_state: int, default None
Random state for the ppscore.predictors function
Returns:
CheckResult
@@ -153,11 +173,11 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
lambda x: 1 if x == c else 0) # pylint: disable=cell-var-from-loop

df_pps_train = pps.predictors(df=train_df_all_vs_one, y=train_label_name,
random_seed=42,
random_seed=random_state,
**ppscore_params)
df_pps_test = pps.predictors(df=test_df_all_vs_one,
y=test_label_name,
random_seed=42, **ppscore_params)
random_seed=random_state, **ppscore_params)

s_pps_train = df_pps_train.set_index('x', drop=True)['ppscore']
s_pps_test = df_pps_test.set_index('x', drop=True)['ppscore']
@@ -183,28 +203,10 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
s_train_to_display = s_train[s_difference_to_display.index]
s_test_to_display = s_test[s_difference_to_display.index]

fig = go.Figure()
fig.add_trace(go.Bar(x=s_train_to_display.index.astype(str),
y=s_train_to_display,
name='Train',
marker_color=colors['Train'], text=s_train_to_display.round(2), textposition='outside'
))
fig.add_trace(go.Bar(x=s_test_to_display.index.astype(str),
y=s_test_to_display,
name='Test',
marker_color=colors['Test'], text=s_test_to_display.round(2), textposition='outside'
))

fig.update_layout(
title=f'{feature}: Predictive Power Score (PPS) Per Class',
xaxis_title='Class',
yaxis_title='Predictive Power Score (PPS)',
yaxis_range=[0, 1.05],
legend=dict(x=1.0, y=1.0),
barmode='group',
width=800, height=400
)

fig = get_pps_figure(per_class=True)
fig.update_layout(title=f'{feature}: Predictive Power Score (PPS) Per Class')
fig.add_trace(pps_df_to_trace(s_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_test_to_display, 'test'))
display.append(fig)

return ret_value, display
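A sketch of calling the per-class utility with the new parameter; the dataframes and label column are assumed to exist:

ret_value, display = get_single_feature_contribution_per_class(
    train_df=train_df, train_label_name='target',
    test_df=test_df, test_label_name='target',
    ppscore_params={}, n_show_top=5,
    min_pps_to_show=0.05,
    random_state=42,  # previously hard-coded inside the function as random_seed=42
)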
