Skip to content

Commit

Permalink
Update single feature contribution plot (#1337)
Browse files Browse the repository at this point in the history
* Update single feature contribution plot

* Fix lint

* Fix space missing in text
  • Loading branch information
matanper committed Apr 27, 2022
1 parent 6359c7a commit c700e58
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 55 deletions.
82 changes: 39 additions & 43 deletions deepchecks/core/check_utils/single_feature_contribution_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,38 @@
import plotly.graph_objects as go


def get_pps_figure(per_class: bool):
    """Create an empty, pre-styled plotly figure for Predictive Power Score bar charts.

    Parameters
    ----------
    per_class : bool
        If True, the x-axis is titled 'Class' and no figure title is defined
        (the caller is expected to set its own title). If False, the x-axis is
        titled 'Column' and the generic PPS title is set.

    Returns
    -------
    go.Figure
        A figure with its layout configured and no traces added.
    """
    fig = go.Figure()
    fig.update_layout(
        yaxis_title='Predictive Power Score (PPS)',
        yaxis_range=[0, 1.05],
        legend=dict(x=1.0, y=1.0),
        barmode='group',
        width=800, height=500,
        # Force a categorical x-axis. Without it, numeric class labels or
        # numeric column names make plotly infer a linear axis and place the
        # bars at their numeric positions instead of evenly side by side.
        xaxis_type='category',
    )
    if per_class:
        # Title is intentionally left unset here; see the docstring.
        fig.update_layout(xaxis_title='Class')
    else:
        fig.update_layout(
            title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
            xaxis_title='Column',
        )
    return fig


def pps_df_to_trace(s_pps: pd.Series, name: str):
    """Build a bar trace from a series of PPS values.

    The series index is used for the x-axis and the values for the bar
    heights; each bar is labelled with its value rounded to 2 decimals,
    placed outside the bar.

    ``name`` is capitalized before being used as the trace name and as the
    lookup key in ``colors``. If name is train/test our defined colors are
    used, else plotly defaults are used. An empty or None name yields an
    unnamed trace.
    """
    trace_name = None
    if name:
        trace_name = name.capitalize()

    bar_labels = s_pps.round(2)
    bar_color = colors.get(trace_name)

    return go.Bar(
        x=s_pps.index,
        y=s_pps,
        name=trace_name,
        marker_color=bar_color,
        text=bar_labels,
        textposition='outside',
    )


def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Optional[Hashable], test_df: pd.DataFrame,
test_label_name: Optional[Hashable], ppscore_params: dict, n_show_top: int,
random_state: int = None):
Expand Down Expand Up @@ -68,34 +100,16 @@ def get_single_feature_contribution(train_df: pd.DataFrame, train_label_name: Op
s_pps_train_to_display = s_pps_train[s_difference_to_display.index]
s_pps_test_to_display = s_pps_test[s_difference_to_display.index]

fig = go.Figure()
fig.add_trace(go.Bar(x=s_pps_train_to_display.index,
y=s_pps_train_to_display,
name='Train',
marker_color=colors['Train'], text=s_pps_train_to_display.round(2), textposition='outside'
))
fig.add_trace(go.Bar(x=s_pps_test_to_display.index,
y=s_pps_test_to_display,
name='Test',
marker_color=colors['Test'], text=s_pps_test_to_display.round(2), textposition='outside'
))
fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(s_pps_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_pps_test_to_display, 'test'))
fig.add_trace(go.Scatter(x=s_difference_to_display.index,
y=s_difference_to_display,
name='Train-Test Difference (abs)',
marker=dict(symbol='circle', size=15),
line=dict(color='#aa57b5', width=5)
))

fig.update_layout(
title='Predictive Power Score (PPS) - Can a feature predict the label by itself?',
xaxis_title='Column',
yaxis_title='Predictive Power Score (PPS)',
yaxis_range=[0, 1.05],
legend=dict(x=1.0, y=1.0),
barmode='group',
width=800, height=500
)

ret_value = {'train': s_pps_train.to_dict(), 'test': s_pps_test.to_dict(),
'train-test difference': s_difference.to_dict()}

Expand Down Expand Up @@ -189,28 +203,10 @@ def get_single_feature_contribution_per_class(train_df: pd.DataFrame, train_labe
s_train_to_display = s_train[s_difference_to_display.index]
s_test_to_display = s_test[s_difference_to_display.index]

fig = go.Figure()
fig.add_trace(go.Bar(x=s_train_to_display.index.astype(str),
y=s_train_to_display,
name='Train',
marker_color=colors['Train'], text=s_train_to_display.round(2), textposition='outside'
))
fig.add_trace(go.Bar(x=s_test_to_display.index.astype(str),
y=s_test_to_display,
name='Test',
marker_color=colors['Test'], text=s_test_to_display.round(2), textposition='outside'
))

fig.update_layout(
title=f'{feature}: Predictive Power Score (PPS) Per Class',
xaxis_title='Class',
yaxis_title='Predictive Power Score (PPS)',
yaxis_range=[0, 1.05],
legend=dict(x=1.0, y=1.0),
barmode='group',
width=800, height=400
)

fig = get_pps_figure(per_class=True)
fig.update_layout(title=f'{feature}: Predictive Power Score (PPS) Per Class')
fig.add_trace(pps_df_to_trace(s_train_to_display, 'train'))
fig.add_trace(pps_df_to_trace(s_test_to_display, 'test'))
display.append(fig)

return ret_value, display
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@

import deepchecks.ppscore as pps
from deepchecks.core import CheckResult, ConditionResult, ConditionCategory
from deepchecks.core.check_utils.single_feature_contribution_utils import get_pps_figure, pps_df_to_trace
from deepchecks.tabular import Context, SingleDatasetCheck
from deepchecks.utils.plot import create_colorbar_barchart_for_check
from deepchecks.utils.typing import Hashable
from deepchecks.utils.strings import format_number

Expand Down Expand Up @@ -87,23 +87,21 @@ def run_logic(self, context: Context, dataset_type: str = 'train') -> CheckResul

df_pps = pps.predictors(df=dataset.data[relevant_columns], y=dataset.label_name, random_seed=self.random_state,
**self.ppscore_params)
df_pps = df_pps.set_index('x', drop=True)
s_ppscore = df_pps['ppscore']
s_ppscore = df_pps.set_index('x', drop=True)['ppscore']
top_to_show = s_ppscore.head(self.n_top_features)

def plot(n_top_features=self.n_top_features):
top_to_show = s_ppscore.head(n_top_features)
# Create graph:
create_colorbar_barchart_for_check(x=top_to_show.index, y=top_to_show.values)
fig = get_pps_figure(per_class=False)
fig.add_trace(pps_df_to_trace(top_to_show, dataset_type))

text = [
'The Predictive Power Score (PPS) is used to estimate the ability of a feature to predict the '
f'label by itself. (Read more about {pps_html})'
'A high PPS (close to 1) can mean that this feature\'s success in predicting the label is'
f'label by itself (Read more about {pps_html}).'
' A high PPS (close to 1) can mean that this feature\'s success in predicting the label is'
' actually due to data leakage - meaning that the feature holds information that is based on the label '
'to begin with.']

# display only if not all scores are 0
display = [plot, *text] if s_ppscore.sum() else None
display = [fig, *text] if s_ppscore.sum() else None

return CheckResult(value=s_ppscore.to_dict(), display=display, header='Single Feature Contribution')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from deepchecks.tabular.checks.methodology import *
from deepchecks.tabular import Dataset
Expand All @@ -26,7 +25,7 @@

#%%

ds = Dataset(df, label='label')
ds = Dataset(df, label='label', cat_features=[])

#%%
# Running single_feature_contribution check
Expand Down

0 comments on commit c700e58

Please sign in to comment.