Skip to content

Commit

Permalink
Fixes to dataset creation logic (#279)
Browse files Browse the repository at this point in the history
Enable using the index levels of the input DataFrame as the index and/or datetime for the Dataset
  • Loading branch information
noamzbr committed Dec 23, 2021
1 parent 625ac3d commit d12202e
Show file tree
Hide file tree
Showing 19 changed files with 1,459 additions and 1,393 deletions.
350 changes: 201 additions & 149 deletions deepchecks/base/dataset.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ def _date_train_test_leakage_duplicates(self, train_dataset: Dataset, test_datas
train_dataset.validate_date()
test_dataset.validate_date()

train_date = train_dataset.date_col
val_date = test_dataset.date_col
train_date = train_dataset.datetime_col
val_date = test_dataset.datetime_col

date_intersection = set(train_date).intersection(val_date)
if len(date_intersection) > 0:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ def _date_train_test_leakage_overlap(self, train_dataset: Dataset, test_dataset:
train_dataset.validate_date()
test_dataset.validate_date()

train_date = train_dataset.date_col
val_date = test_dataset.date_col
train_date = train_dataset.datetime_col
val_date = test_dataset.datetime_col

max_train_date = max(train_date)
dates_leaked = sum(date <= max_train_date for date in val_date)
Expand Down
2 changes: 1 addition & 1 deletion deepchecks/checks/methodology/identifier_leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def _identifier_leakage(self, dataset: Union[pd.DataFrame, Dataset], ppscore_par
dataset.validate_label()
ppscore_params = ppscore_params or {}

relevant_columns = list(filter(None, [dataset.date_name, dataset.index_name, dataset.label_name]))
relevant_columns = list(filter(None, [dataset.datetime_name, dataset.index_name, dataset.label_name]))

if len(relevant_columns) == 1:
raise DeepchecksValueError('Dataset needs to have a date or index column.')
Expand Down
2 changes: 1 addition & 1 deletion deepchecks/utils/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def get_importance(name: str, feature_importances: pd.Series, ds: 'base.Dataset'
"""Return importance based on feature importance or label/date/index first."""
if name in feature_importances.keys():
return feature_importances[name]
if name in [ds.label_name, ds.date_name, ds.index_name]:
if name in [ds.label_name, ds.datetime_name, ds.index_name]:
return 1
return 0

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
"from datetime import datetime\n",
"import pandas as pd\n",
"\n",
"def dataset_from_dict(d: dict, date_name: str = None) -> Dataset:\n",
"def dataset_from_dict(d: dict, datetime_name: str = None) -> Dataset:\n",
" dataframe = pd.DataFrame(data=d)\n",
" return Dataset(dataframe, date_name=date_name)"
" return Dataset(dataframe, datetime_name=datetime_name)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@
"from datetime import datetime\n",
"import pandas as pd\n",
"\n",
"def dataset_from_dict(d: dict, date_name: str = None) -> Dataset:\n",
"def dataset_from_dict(d: dict, datetime_name: str = None) -> Dataset:\n",
" dataframe = pd.DataFrame(data=d)\n",
" return Dataset(dataframe, date_name=date_name)"
" return Dataset(dataframe, datetime_name=datetime_name)"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion examples/checks/methodology/identifier_leakage.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
},
"outputs": [],
"source": [
"dataset = Dataset(df, label_name='label', index_name='x1', date_name='x2')"
"dataset = Dataset(df, label_name='label', index_name='x1', datetime_name='x2')"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion examples/checks/overview/columns_info.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
"data = {'index': index, 'date': date, 'a': cat_fe, 'b': num_fe, 'c': num_col, 'label': cat_fe}\n",
"df = pd.DataFrame.from_dict(data)\n",
"\n",
"dataset = Dataset(df, label_name='label', date_name='date', index_name='index', features=['a', 'b'])"
"dataset = Dataset(df, label_name='label', datetime_name='date', index_name='index', features=['a', 'b'])"
]
},
{
Expand Down
67 changes: 16 additions & 51 deletions examples/howto-guides/add_a_custom_check.ipynb

Large diffs are not rendered by default.

147 changes: 38 additions & 109 deletions examples/howto-guides/configure_check_conditions.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,7 @@
"cell_type": "code",
"execution_count": 1,
"id": "b3862684",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:22.006594Z",
"iopub.status.busy": "2021-12-23T11:43:22.004690Z",
"iopub.status.idle": "2021-12-23T11:43:23.894984Z",
"shell.execute_reply": "2021-12-23T11:43:23.896040Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -92,14 +85,7 @@
"cell_type": "code",
"execution_count": 2,
"id": "db25f8ed",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.909278Z",
"iopub.status.busy": "2021-12-23T11:43:23.908286Z",
"iopub.status.idle": "2021-12-23T11:43:23.912215Z",
"shell.execute_reply": "2021-12-23T11:43:23.911297Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -163,14 +149,7 @@
"cell_type": "code",
"execution_count": 3,
"id": "af7bd116",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.916352Z",
"iopub.status.busy": "2021-12-23T11:43:23.915701Z",
"iopub.status.idle": "2021-12-23T11:43:23.921001Z",
"shell.execute_reply": "2021-12-23T11:43:23.921445Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -224,14 +203,7 @@
"cell_type": "code",
"execution_count": 4,
"id": "faf589a1",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.926435Z",
"iopub.status.busy": "2021-12-23T11:43:23.925901Z",
"iopub.status.idle": "2021-12-23T11:43:23.928026Z",
"shell.execute_reply": "2021-12-23T11:43:23.928481Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -285,14 +257,7 @@
"cell_type": "code",
"execution_count": 5,
"id": "9488c030",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.933198Z",
"iopub.status.busy": "2021-12-23T11:43:23.932559Z",
"iopub.status.idle": "2021-12-23T11:43:23.938104Z",
"shell.execute_reply": "2021-12-23T11:43:23.938521Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -355,14 +320,7 @@
"cell_type": "code",
"execution_count": 6,
"id": "2a1821bf",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.949014Z",
"iopub.status.busy": "2021-12-23T11:43:23.948106Z",
"iopub.status.idle": "2021-12-23T11:43:23.953788Z",
"shell.execute_reply": "2021-12-23T11:43:23.953295Z"
}
},
"metadata": {},
"outputs": [
{
"data": {
Expand Down Expand Up @@ -403,14 +361,7 @@
"cell_type": "code",
"execution_count": 7,
"id": "06ef9c99",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.959657Z",
"iopub.status.busy": "2021-12-23T11:43:23.959106Z",
"iopub.status.idle": "2021-12-23T11:43:23.962580Z",
"shell.execute_reply": "2021-12-23T11:43:23.962112Z"
}
},
"metadata": {},
"outputs": [],
"source": [
"from deepchecks import ConditionResult\n",
Expand Down Expand Up @@ -447,14 +398,7 @@
"cell_type": "code",
"execution_count": 8,
"id": "39b8606b",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.966431Z",
"iopub.status.busy": "2021-12-23T11:43:23.964582Z",
"iopub.status.idle": "2021-12-23T11:43:23.984061Z",
"shell.execute_reply": "2021-12-23T11:43:23.983570Z"
}
},
"metadata": {},
"outputs": [
{
"name": "stdout",
Expand All @@ -467,17 +411,16 @@
"data": {
"text/html": [
"\n",
" <h1 id=\"summary_B679F\">Suite for Condition</h1>\n",
" <p>The suite is composed of the following checks: Datasets Size Comparison.<br>\n",
" Each check may contain conditions (which results in \n",
" <h1 id=\"summary_1D7F7\">Suite for Condition</h1>\n",
" <p>The suite is composed of various checks such as: Datasets Size Comparison, etc...<br>\n",
" Each check may contain conditions (which results in \n",
" <span style=\"color: green;display:inline-block\">✓</span> /\n",
" <span style=\"color: red;display:inline-block\">✖</span> /\n",
" <span style=\"color: orange;font-weight:bold;display:inline-block\">!</span>\n",
" ),\n",
" as well as other outputs such as plots or tables.<br>\n",
" Suites, checks and conditions can all be modified (see tutorial [link]).</p>\n",
" <hr style=\"background-color: black;border: 0 none;color: black;height: 1px;\"><h2>Conditions Summary</h2>\n",
" "
" ), as well as other outputs such as plots or tables.<br>\n",
" Suites, checks and conditions can all be modified (see tutorial [link]).</p>\n",
" <hr style=\"background-color: black;border: 0 none;color: black;height: 1px;\"><h2>Conditions Summary</h2>\n",
" "
]
},
"metadata": {},
Expand All @@ -487,23 +430,23 @@
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_ac9c7_ table {\n",
"#T_f3410_ table {\n",
" text-align: left;\n",
"}\n",
"#T_ac9c7_ thead {\n",
"#T_f3410_ thead {\n",
" text-align: left;\n",
"}\n",
"#T_ac9c7_ tbody {\n",
"#T_f3410_ tbody {\n",
" text-align: left;\n",
"}\n",
"#T_ac9c7_ th {\n",
"#T_f3410_ th {\n",
" text-align: left;\n",
"}\n",
"#T_ac9c7_ td {\n",
"#T_f3410_ td {\n",
" text-align: left;\n",
"}\n",
"</style>\n",
"<table id=\"T_ac9c7_\">\n",
"<table id=\"T_f3410_\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"col_heading level0 col0\" >Status</th>\n",
Expand All @@ -514,10 +457,10 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_ac9c7_row0_col0\" class=\"data row0 col0\" ><div style=\"color: red;text-align: center\">✖</div></td>\n",
" <td id=\"T_ac9c7_row0_col1\" class=\"data row0 col1\" ><a href=#DatasetsSizeComparison_B679F>Datasets Size Comparison</a></td>\n",
" <td id=\"T_ac9c7_row0_col2\" class=\"data row0 col2\" >Test-Train ratio is between 0.4 to 0.6</td>\n",
" <td id=\"T_ac9c7_row0_col3\" class=\"data row0 col3\" >Test-Train ratio is 0.33</td>\n",
" <td id=\"T_f3410_row0_col0\" class=\"data row0 col0\" ><div style=\"color: red;text-align: center\">✖</div></td>\n",
" <td id=\"T_f3410_row0_col1\" class=\"data row0 col1\" ><a href=#DatasetsSizeComparison_1D7F7>Datasets Size Comparison</a></td>\n",
" <td id=\"T_f3410_row0_col2\" class=\"data row0 col2\" >Test-Train ratio is between 0.4 to 0.6</td>\n",
" <td id=\"T_f3410_row0_col3\" class=\"data row0 col3\" >Test-Train ratio is 0.33</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
Expand All @@ -538,7 +481,7 @@
{
"data": {
"text/html": [
"<h4 id=\"DatasetsSizeComparison_B679F\">Datasets Size Comparison</h4>"
"<h4 id=\"DatasetsSizeComparison_1D7F7\">Datasets Size Comparison</h4>"
]
},
"metadata": {},
Expand All @@ -557,23 +500,23 @@
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"#T_c46ba_ table {\n",
"#T_e0fbf_ table {\n",
" text-align: left;\n",
"}\n",
"#T_c46ba_ thead {\n",
"#T_e0fbf_ thead {\n",
" text-align: left;\n",
"}\n",
"#T_c46ba_ tbody {\n",
"#T_e0fbf_ tbody {\n",
" text-align: left;\n",
"}\n",
"#T_c46ba_ th {\n",
"#T_e0fbf_ th {\n",
" text-align: left;\n",
"}\n",
"#T_c46ba_ td {\n",
"#T_e0fbf_ td {\n",
" text-align: left;\n",
"}\n",
"</style>\n",
"<table id=\"T_c46ba_\">\n",
"<table id=\"T_e0fbf_\">\n",
" <thead>\n",
" <tr>\n",
" <th class=\"blank level0\" >&nbsp;</th>\n",
Expand All @@ -583,9 +526,9 @@
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th id=\"T_c46ba_level0_row0\" class=\"row_heading level0 row0\" >Size</th>\n",
" <td id=\"T_c46ba_row0_col0\" class=\"data row0 col0\" >9</td>\n",
" <td id=\"T_c46ba_row0_col1\" class=\"data row0 col1\" >3</td>\n",
" <th id=\"T_e0fbf_level0_row0\" class=\"row_heading level0 row0\" >Size</th>\n",
" <td id=\"T_e0fbf_row0_col0\" class=\"data row0 col0\" >9</td>\n",
" <td id=\"T_e0fbf_row0_col1\" class=\"data row0 col1\" >3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
Expand All @@ -597,16 +540,7 @@
{
"data": {
"text/html": [
"<br><a href=\"#summary_B679F\" style=\"font-size: 14px\">Go to top</a>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<br><a href=\"#summary_B679F\" style=\"font-size: 14px\">Go to top</a>"
"<br><a href=\"#summary_1D7F7\" style=\"font-size: 14px\">Go to top</a>"
]
},
"metadata": {},
Expand Down Expand Up @@ -647,12 +581,7 @@
"execution_count": 9,
"id": "b6ce24a3",
"metadata": {
"execution": {
"iopub.execute_input": "2021-12-23T11:43:23.989828Z",
"iopub.status.busy": "2021-12-23T11:43:23.989110Z",
"iopub.status.idle": "2021-12-23T11:43:23.990464Z",
"shell.execute_reply": "2021-12-23T11:43:23.990967Z"
}
"tags": []
},
"outputs": [],
"source": [
Expand Down Expand Up @@ -690,7 +619,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
"version": "3.8.10"
}
},
"nbformat": 4,
Expand Down

0 comments on commit d12202e

Please sign in to comment.