Update single feature contribution plot (#1337)

* Update single feature contribution plot * Fix lint * Fix space missing in text
deepchecks · Apr 27, 2022 · c700e58 · c700e58
1 parent 6359c7a
commit c700e58
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 55 deletions.
diff --git a/deepchecks/core/check_utils/single_feature_contribution_utils.py b/deepchecks/core/check_utils/single_feature_contribution_utils.py
@@ -19,6 +19,38 @@
 import plotly.graph_objects as go
 
 
+def get_pps_figure(per_class: bool):
+    """Return an empty plotly figure with the shared PPS layout; if per_class is True, no title is set."""
+    fig = go.Figure()
+    fig.update_layout(  # layout settings common to both variants
+        yaxis_title='Predictive Power Score (PPS)',
+        yaxis_range=[0, 1.05],
+        legend=dict(x=1.0, y=1.0),
+        barmode='group',
+        width=800, height=500
+    )
+    if per_class:
+        fig.update_layout(xaxis_title='Class')  # no title is set in this branch
+    else:
+        fig.update_layout(
+            title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
+            xaxis_title='Column',
+        )
+    return fig
+
+
+def pps_df_to_trace(s_pps: pd.Series, name: str):
+    """Build a bar trace from a PPS series; train/test names use our defined colors, else plotly defaults."""
+    name = name.capitalize() if name else None  # capitalized form is both the legend name and the `colors` key
+    return go.Bar(x=s_pps.index,  # NOTE(review): index passed as-is (no str cast) - confirm numeric labels plot OK
+                  y=s_pps,
+                  name=name,
+                  marker_color=colors.get(name),  # .get() yields None when name is not a key of `colors`
+                  text=s_pps.round(2),  # bar labels show the score rounded to 2 decimals
+                  textposition='outside'
+                  )
+
+
 def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Optional[Hashable], test_df: pd.DataFrame,
                                     test_label_name: Optional[Hashable], ppscore_params: dict, n_show_top: int,
                                     random_state: int = None):
@@ -68,34 +100,16 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
     s_pps_train_to_display = s_pps_train[s_difference_to_display.index]
     s_pps_test_to_display = s_pps_test[s_difference_to_display.index]
 
-    fig = go.Figure()
-    fig.add_trace(go.Bar(x=s_pps_train_to_display.index,
-                         y=s_pps_train_to_display,
-                         name='Train',
-                         marker_color=colors['Train'], text=s_pps_train_to_display.round(2), textposition='outside'
-                         ))
-    fig.add_trace(go.Bar(x=s_pps_test_to_display.index,
-                         y=s_pps_test_to_display,
-                         name='Test',
-                         marker_color=colors['Test'], text=s_pps_test_to_display.round(2), textposition='outside'
-                         ))
+    fig = get_pps_figure(per_class=False)
+    fig.add_trace(pps_df_to_trace(s_pps_train_to_display, 'train'))
+    fig.add_trace(pps_df_to_trace(s_pps_test_to_display, 'test'))
     fig.add_trace(go.Scatter(x=s_difference_to_display.index,
                              y=s_difference_to_display,
                              name='Train-Test Difference (abs)',
                              marker=dict(symbol='circle', size=15),
                              line=dict(color='#aa57b5', width=5)
                              ))
 
-    fig.update_layout(
-        title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
-        xaxis_title='Column',
-        yaxis_title='Predictive Power Score (PPS)',
-        yaxis_range=[0, 1.05],
-        legend=dict(x=1.0, y=1.0),
-        barmode='group',
-        width=800, height=500
-    )
-
     ret_value = {'train': s_pps_train.to_dict(), 'test': s_pps_test.to_dict(),
                  'train-test difference': s_difference.to_dict()}
 
@@ -189,28 +203,10 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
             s_train_to_display = s_train[s_difference_to_display.index]
             s_test_to_display = s_test[s_difference_to_display.index]
 
-            fig = go.Figure()
-            fig.add_trace(go.Bar(x=s_train_to_display.index.astype(str),
-                                 y=s_train_to_display,
-                                 name='Train',
-                                 marker_color=colors['Train'], text=s_train_to_display.round(2), textposition='outside'
-                                 ))
-            fig.add_trace(go.Bar(x=s_test_to_display.index.astype(str),
-                                 y=s_test_to_display,
-                                 name='Test',
-                                 marker_color=colors['Test'], text=s_test_to_display.round(2), textposition='outside'
-                                 ))
-
-            fig.update_layout(
-                title=f'{feature}: Predictive Power Score (PPS) Per Class',
-                xaxis_title='Class',
-                yaxis_title='Predictive Power Score (PPS)',
-                yaxis_range=[0, 1.05],
-                legend=dict(x=1.0, y=1.0),
-                barmode='group',
-                width=800, height=400
-            )
-
+            fig = get_pps_figure(per_class=True)
+            fig.update_layout(title=f'{feature}: Predictive Power Score (PPS) Per Class')
+            fig.add_trace(pps_df_to_trace(s_train_to_display, 'train'))
+            fig.add_trace(pps_df_to_trace(s_test_to_display, 'test'))
             display.append(fig)
 
     return ret_value, display
diff --git a/deepchecks/tabular/checks/methodology/single_feature_contribution.py b/deepchecks/tabular/checks/methodology/single_feature_contribution.py
@@ -13,8 +13,8 @@
 
 import deepchecks.ppscore as pps
 from deepchecks.core import CheckResult, ConditionResult, ConditionCategory
+from deepchecks.core.check_utils.single_feature_contribution_utils import get_pps_figure, pps_df_to_trace
 from deepchecks.tabular import Context, SingleDatasetCheck
-from deepchecks.utils.plot import create_colorbar_barchart_for_check
 from deepchecks.utils.typing import Hashable
 from deepchecks.utils.strings import format_number
 
@@ -87,23 +87,21 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul
 
         df_pps = pps.predictors(df=dataset.data[relevant_columns], y=dataset.label_name, random_seed=self.random_state,
                                 **self.ppscore_params)
-        df_pps = df_pps.set_index('x', drop=True)
-        s_ppscore = df_pps['ppscore']
+        s_ppscore = df_pps.set_index('x', drop=True)['ppscore']
+        top_to_show = s_ppscore.head(self.n_top_features)
 
-        def plot(n_top_features=self.n_top_features):
-            top_to_show = s_ppscore.head(n_top_features)
-            # Create graph:
-            create_colorbar_barchart_for_check(x=top_to_show.index, y=top_to_show.values)
+        fig = get_pps_figure(per_class=False)
+        fig.add_trace(pps_df_to_trace(top_to_show, dataset_type))
 
         text = [
             'The Predictive Power Score (PPS) is used to estimate the ability of a feature to predict the '
-            f'label by itself. (Read more about {pps_html})'
-            'A high PPS (close to 1) can mean that this feature\'s success in predicting the label is'
+            f'label by itself (Read more about {pps_html}).'
+            ' A high PPS (close to 1) can mean that this feature\'s success in predicting the label is'
             ' actually due to data leakage - meaning that the feature holds information that is based on the label '
             'to begin with.']
 
         # display only if not all scores are 0
-        display = [plot, *text] if s_ppscore.sum() else None
+        display = [fig, *text] if s_ppscore.sum() else None
 
         return CheckResult(value=s_ppscore.to_dict(), display=display, header='Single Feature Contribution')
 

diff --git a/docs/source/examples/tabular/checks/methodology/source/plot_single_feature_contribution.py b/docs/source/examples/tabular/checks/methodology/source/plot_single_feature_contribution.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 import pandas as pd
-import matplotlib.pyplot as plt
 
 from deepchecks.tabular.checks.methodology import *
 from deepchecks.tabular import Dataset
@@ -26,7 +25,7 @@
 
 #%%
 
-ds = Dataset(df, label='label')
+ds = Dataset(df, label='label', cat_features=[])
 
 #%%
 # Running single_feature_contribution check