Fixes to dataset creation logic (#279)

Enable using the index levels of the input DataFrame as the index and/or the datetime for the Dataset
deepchecks · Dec 23, 2021 · d12202e · d12202e
1 parent 625ac3d
commit d12202e
Show file tree

Hide file tree

Showing 19 changed files with 1,459 additions and 1,393 deletions.
diff --git a/deepchecks/base/dataset.py b/deepchecks/base/dataset.py
diff --git a/deepchecks/checks/methodology/date_train_test_leakage_duplicates.py b/deepchecks/checks/methodology/date_train_test_leakage_duplicates.py
@@ -53,8 +53,8 @@ def _date_train_test_leakage_duplicates(self, train_dataset: Dataset, test_datas
         train_dataset.validate_date()
         test_dataset.validate_date()
 
-        train_date = train_dataset.date_col
-        val_date = test_dataset.date_col
+        train_date = train_dataset.datetime_col
+        val_date = test_dataset.datetime_col
 
         date_intersection = set(train_date).intersection(val_date)
         if len(date_intersection) > 0:

diff --git a/deepchecks/checks/methodology/date_train_test_leakage_overlap.py b/deepchecks/checks/methodology/date_train_test_leakage_overlap.py
@@ -43,8 +43,8 @@ def _date_train_test_leakage_overlap(self, train_dataset: Dataset, test_dataset:
         train_dataset.validate_date()
         test_dataset.validate_date()
 
-        train_date = train_dataset.date_col
-        val_date = test_dataset.date_col
+        train_date = train_dataset.datetime_col
+        val_date = test_dataset.datetime_col
 
         max_train_date = max(train_date)
         dates_leaked = sum(date <= max_train_date for date in val_date)

diff --git a/deepchecks/checks/methodology/identifier_leakage.py b/deepchecks/checks/methodology/identifier_leakage.py
@@ -57,7 +57,7 @@ def _identifier_leakage(self, dataset: Union[pd.DataFrame, Dataset], ppscore_par
         dataset.validate_label()
         ppscore_params = ppscore_params or {}
 
-        relevant_columns = list(filter(None, [dataset.date_name, dataset.index_name, dataset.label_name]))
+        relevant_columns = list(filter(None, [dataset.datetime_name, dataset.index_name, dataset.label_name]))
 
         if len(relevant_columns) == 1:
             raise DeepchecksValueError('Dataset needs to have a date or index column.')

diff --git a/deepchecks/utils/features.py b/deepchecks/utils/features.py
@@ -165,7 +165,7 @@ def get_importance(name: str, feature_importances: pd.Series, ds: 'base.Dataset'
     """Return importance based on feature importance or label/date/index first."""
     if name in feature_importances.keys():
         return feature_importances[name]
-    if name in [ds.label_name, ds.date_name, ds.index_name]:
+    if name in [ds.label_name, ds.datetime_name, ds.index_name]:
         return 1
     return 0
 

diff --git a/examples/checks/methodology/date_train_validation_leakage_duplicates.ipynb b/examples/checks/methodology/date_train_validation_leakage_duplicates.ipynb
@@ -27,9 +27,9 @@
     "from datetime import datetime\n",
     "import pandas as pd\n",
     "\n",
-    "def dataset_from_dict(d: dict, date_name: str = None) -> Dataset:\n",
+    "def dataset_from_dict(d: dict, datetime_name: str = None) -> Dataset:\n",
     "    dataframe = pd.DataFrame(data=d)\n",
-    "    return Dataset(dataframe, date_name=date_name)"
+    "    return Dataset(dataframe, datetime_name=datetime_name)"
    ]
   },
   {

diff --git a/examples/checks/methodology/date_train_validation_leakage_overlap.ipynb b/examples/checks/methodology/date_train_validation_leakage_overlap.ipynb
@@ -27,9 +27,9 @@
     "from datetime import datetime\n",
     "import pandas as pd\n",
     "\n",
-    "def dataset_from_dict(d: dict, date_name: str = None) -> Dataset:\n",
+    "def dataset_from_dict(d: dict, datetime_name: str = None) -> Dataset:\n",
     "    dataframe = pd.DataFrame(data=d)\n",
-    "    return Dataset(dataframe, date_name=date_name)"
+    "    return Dataset(dataframe, datetime_name=datetime_name)"
    ]
   },
   {

diff --git a/examples/checks/methodology/identifier_leakage.ipynb b/examples/checks/methodology/identifier_leakage.ipynb
@@ -82,7 +82,7 @@
    },
    "outputs": [],
    "source": [
-    "dataset = Dataset(df, label_name='label', index_name='x1', date_name='x2')"
+    "dataset = Dataset(df, label_name='label', index_name='x1', datetime_name='x2')"
    ]
   },
   {

diff --git a/examples/checks/overview/columns_info.ipynb b/examples/checks/overview/columns_info.ipynb
@@ -75,7 +75,7 @@
     "data = {'index': index, 'date': date, 'a': cat_fe, 'b': num_fe, 'c': num_col, 'label': cat_fe}\n",
     "df = pd.DataFrame.from_dict(data)\n",
     "\n",
-    "dataset = Dataset(df, label_name='label', date_name='date', index_name='index', features=['a', 'b'])"
+    "dataset = Dataset(df, label_name='label', datetime_name='date', index_name='index', features=['a', 'b'])"
    ]
   },
   {

diff --git a/examples/howto-guides/add_a_custom_check.ipynb b/examples/howto-guides/add_a_custom_check.ipynb
diff --git a/examples/howto-guides/configure_check_conditions.ipynb b/examples/howto-guides/configure_check_conditions.ipynb
@@ -51,14 +51,7 @@
    "cell_type": "code",
    "execution_count": 1,
    "id": "b3862684",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:22.006594Z",
-     "iopub.status.busy": "2021-12-23T11:43:22.004690Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.894984Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.896040Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -92,14 +85,7 @@
    "cell_type": "code",
    "execution_count": 2,
    "id": "db25f8ed",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.909278Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.908286Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.912215Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.911297Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -163,14 +149,7 @@
    "cell_type": "code",
    "execution_count": 3,
    "id": "af7bd116",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.916352Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.915701Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.921001Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.921445Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -224,14 +203,7 @@
    "cell_type": "code",
    "execution_count": 4,
    "id": "faf589a1",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.926435Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.925901Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.928026Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.928481Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -285,14 +257,7 @@
    "cell_type": "code",
    "execution_count": 5,
    "id": "9488c030",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.933198Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.932559Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.938104Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.938521Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -355,14 +320,7 @@
    "cell_type": "code",
    "execution_count": 6,
    "id": "2a1821bf",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.949014Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.948106Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.953788Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.953295Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -403,14 +361,7 @@
    "cell_type": "code",
    "execution_count": 7,
    "id": "06ef9c99",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.959657Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.959106Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.962580Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.962112Z"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "from deepchecks import ConditionResult\n",
@@ -447,14 +398,7 @@
    "cell_type": "code",
    "execution_count": 8,
    "id": "39b8606b",
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.966431Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.964582Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.984061Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.983570Z"
-    }
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -467,17 +411,16 @@
      "data": {
       "text/html": [
        "\n",
-       "        <h1 id=\"summary_B679F\">Suite for Condition</h1>\n",
-       "        <p>The suite is composed of the following checks: Datasets Size Comparison.<br>\n",
-       "        Each check may contain conditions (which results in \n",
+       "    <h1 id=\"summary_1D7F7\">Suite for Condition</h1>\n",
+       "    <p>The suite is composed of various checks such as: Datasets Size Comparison, etc...<br>\n",
+       "    Each check may contain conditions (which results in \n",
        "    <span style=\"color: green;display:inline-block\">✓</span> /\n",
        "    <span style=\"color: red;display:inline-block\">✖</span> /\n",
        "    <span style=\"color: orange;font-weight:bold;display:inline-block\">!</span>\n",
-       "    ),\n",
-       "        as well as other outputs such as plots or tables.<br>\n",
-       "        Suites, checks and conditions can all be modified (see tutorial [link]).</p>\n",
-       "        <hr style=\"background-color: black;border: 0 none;color: black;height: 1px;\"><h2>Conditions Summary</h2>\n",
-       "        "
+       "    ), as well as other outputs such as plots or tables.<br>\n",
+       "    Suites, checks and conditions can all be modified (see tutorial [link]).</p>\n",
+       "    <hr style=\"background-color: black;border: 0 none;color: black;height: 1px;\"><h2>Conditions Summary</h2>\n",
+       "    "
       ]
      },
      "metadata": {},
@@ -487,23 +430,23 @@
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_ac9c7_ table {\n",
+       "#T_f3410_ table {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_ac9c7_ thead {\n",
+       "#T_f3410_ thead {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_ac9c7_ tbody {\n",
+       "#T_f3410_ tbody {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_ac9c7_ th {\n",
+       "#T_f3410_ th {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_ac9c7_ td {\n",
+       "#T_f3410_ td {\n",
        "  text-align: left;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_ac9c7_\">\n",
+       "<table id=\"T_f3410_\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"col_heading level0 col0\" >Status</th>\n",
@@ -514,10 +457,10 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <td id=\"T_ac9c7_row0_col0\" class=\"data row0 col0\" ><div style=\"color: red;text-align: center\">✖</div></td>\n",
-       "      <td id=\"T_ac9c7_row0_col1\" class=\"data row0 col1\" ><a href=#DatasetsSizeComparison_B679F>Datasets Size Comparison</a></td>\n",
-       "      <td id=\"T_ac9c7_row0_col2\" class=\"data row0 col2\" >Test-Train ratio is between 0.4 to 0.6</td>\n",
-       "      <td id=\"T_ac9c7_row0_col3\" class=\"data row0 col3\" >Test-Train ratio is 0.33</td>\n",
+       "      <td id=\"T_f3410_row0_col0\" class=\"data row0 col0\" ><div style=\"color: red;text-align: center\">✖</div></td>\n",
+       "      <td id=\"T_f3410_row0_col1\" class=\"data row0 col1\" ><a href=#DatasetsSizeComparison_1D7F7>Datasets Size Comparison</a></td>\n",
+       "      <td id=\"T_f3410_row0_col2\" class=\"data row0 col2\" >Test-Train ratio is between 0.4 to 0.6</td>\n",
+       "      <td id=\"T_f3410_row0_col3\" class=\"data row0 col3\" >Test-Train ratio is 0.33</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
@@ -538,7 +481,7 @@
     {
      "data": {
       "text/html": [
-       "<h4 id=\"DatasetsSizeComparison_B679F\">Datasets Size Comparison</h4>"
+       "<h4 id=\"DatasetsSizeComparison_1D7F7\">Datasets Size Comparison</h4>"
       ]
      },
      "metadata": {},
@@ -557,23 +500,23 @@
      "data": {
       "text/html": [
        "<style type=\"text/css\">\n",
-       "#T_c46ba_ table {\n",
+       "#T_e0fbf_ table {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_c46ba_ thead {\n",
+       "#T_e0fbf_ thead {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_c46ba_ tbody {\n",
+       "#T_e0fbf_ tbody {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_c46ba_ th {\n",
+       "#T_e0fbf_ th {\n",
        "  text-align: left;\n",
        "}\n",
-       "#T_c46ba_ td {\n",
+       "#T_e0fbf_ td {\n",
        "  text-align: left;\n",
        "}\n",
        "</style>\n",
-       "<table id=\"T_c46ba_\">\n",
+       "<table id=\"T_e0fbf_\">\n",
        "  <thead>\n",
        "    <tr>\n",
        "      <th class=\"blank level0\" >&nbsp;</th>\n",
@@ -583,9 +526,9 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th id=\"T_c46ba_level0_row0\" class=\"row_heading level0 row0\" >Size</th>\n",
-       "      <td id=\"T_c46ba_row0_col0\" class=\"data row0 col0\" >9</td>\n",
-       "      <td id=\"T_c46ba_row0_col1\" class=\"data row0 col1\" >3</td>\n",
+       "      <th id=\"T_e0fbf_level0_row0\" class=\"row_heading level0 row0\" >Size</th>\n",
+       "      <td id=\"T_e0fbf_row0_col0\" class=\"data row0 col0\" >9</td>\n",
+       "      <td id=\"T_e0fbf_row0_col1\" class=\"data row0 col1\" >3</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n"
@@ -597,16 +540,7 @@
     {
      "data": {
       "text/html": [
-       "<br><a href=\"#summary_B679F\" style=\"font-size: 14px\">Go to top</a>"
-      ]
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "<br><a href=\"#summary_B679F\" style=\"font-size: 14px\">Go to top</a>"
+       "<br><a href=\"#summary_1D7F7\" style=\"font-size: 14px\">Go to top</a>"
       ]
      },
      "metadata": {},
@@ -647,12 +581,7 @@
    "execution_count": 9,
    "id": "b6ce24a3",
    "metadata": {
-    "execution": {
-     "iopub.execute_input": "2021-12-23T11:43:23.989828Z",
-     "iopub.status.busy": "2021-12-23T11:43:23.989110Z",
-     "iopub.status.idle": "2021-12-23T11:43:23.990464Z",
-     "shell.execute_reply": "2021-12-23T11:43:23.990967Z"
-    }
+    "tags": []
    },
    "outputs": [],
    "source": [
@@ -690,7 +619,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.9"
+   "version": "3.8.10"
   }
  },
  "nbformat": 4,