
Commit

Merge pull request #505 from great-expectations/dev_eug_profiling_20190618

Basic dataset profiler implementation + changes that it depends on
jcampbell committed Jun 21, 2019
2 parents 072e7cf + a770b55 commit acb2963
Showing 17 changed files with 499 additions and 230 deletions.
2 changes: 1 addition & 1 deletion great_expectations/cli/cli.py
@@ -195,7 +195,7 @@ def render(render_object):
help='Maximum number of named data assets to profile.')
@click.option('--profile_all_data_assets', '-A', is_flag=True, default=False,
help='Profile ALL data assets within the target data source. If True, this will override --max_data_assets.')
@click.option('--target_directory', '-d', default="./",
@click.option('--target_directory', '-d', default="./great_expectations",
help='The root of a project directory containing a great_expectations/ config.')
def profile(datasource_name, max_data_assets, profile_all_data_assets, target_directory):
"""Profile a great expectations object.
10 changes: 8 additions & 2 deletions great_expectations/cli/datasource.py
@@ -203,9 +203,15 @@ def add_datasource(context):
"""

msg_sqlalchemy_go_to_notebook = """
To create expectations for your SQL queries start Jupyter and open notebook
great_expectations/notebooks/using_great_expectations_with_sql.ipynb -
To create expectations for your SQL data assets start Jupyter and open the notebook
great_expectations/notebooks/create_expectations_sql.ipynb.
it will walk you through configuring the database connection and next steps.
To launch with jupyter notebooks:
<blue>jupyter notebook great_expectations/notebooks/create_expectations_sql.ipynb</blue>
To launch with jupyter lab:
<blue>jupyter lab great_expectations/notebooks/create_expectations_sql.ipynb</blue>
"""

msg_spark_go_to_notebook = """
1 change: 0 additions & 1 deletion great_expectations/data_asset/data_asset.py
@@ -1136,7 +1136,6 @@ def _format_map_output(
}
)


if result_format['result_format'] == 'SUMMARY':
return return_obj

3 changes: 3 additions & 0 deletions great_expectations/data_asset/util.py
@@ -177,6 +177,9 @@ def recursively_convert_to_json_serializable(test_obj):
# Note: Use np.floating to avoid FutureWarning from numpy
return float(round(test_obj, sys.float_info.dig))

elif isinstance(test_obj, pd.Series):
return recursively_convert_to_json_serializable(test_obj.to_dict())

elif isinstance(test_obj, pd.DataFrame):
return recursively_convert_to_json_serializable(test_obj.to_dict(orient='records'))

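Why the new Series branch matters in this PR: the value_counts detail added to the distinct-values expectations below puts a pd.Series into expectation results, and it has to survive JSON serialization. A minimal standalone sketch of the two pandas branches, with a toy converter standing in for the full recursively_convert_to_json_serializable:

import pandas as pd

def to_json_serializable(obj):
    # Toy converter: only the dict/list plumbing plus the two pandas
    # branches from this hunk; the real function handles many more types.
    if isinstance(obj, pd.Series):
        # New in this commit: Series -> {index: value} dict, then recurse.
        return to_json_serializable(obj.to_dict())
    if isinstance(obj, pd.DataFrame):
        return to_json_serializable(obj.to_dict(orient="records"))
    if isinstance(obj, dict):
        return {str(k): to_json_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_json_serializable(v) for v in obj]
    return obj

counts = pd.Series([1, 2, 3], index=["a", "b", "c"])
print(to_json_serializable(counts))  # {'a': 1, 'b': 2, 'c': 3}-style dict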
105 changes: 101 additions & 4 deletions great_expectations/dataset/dataset.py
@@ -1647,6 +1647,95 @@ def expect_column_parameterized_distribution_ks_test_p_value_to_be_greater_than(
"""
raise NotImplementedError

@DocInherit
@MetaDataset.column_aggregate_expectation
def expect_column_distinct_values_to_be_in_set(self,
column,
value_set,
parse_strings_as_datetimes=None,
result_format=None, include_config=False, catch_exceptions=None, meta=None):
"""Expect the set of distinct column values to be contained by a given set.
The success value for this expectation will match that of expect_column_values_to_be_in_set, but this is an aggregate expectation
and so will provide aggregate semantics including an observed value.
For example:
::
# my_df.my_col = [1,2,2,3,3,3]
>>> my_df.expect_column_distinct_values_to_be_in_set(
"my_col",
[2, 3, 4]
)
{
"success": false
"result": {
"observed_value": [1,2,3],
"details": {
"value_counts": {
"1": 1,
"2": 2,
"3": 3
}
}
}
}
expect_column_distinct_values_to_be_in_set is a :func:`column_aggregate_expectation <great_expectations.data_asset.dataset.Dataset.column_aggregate_expectation>`.
Args:
column (str): \
The column name.
value_set (set-like): \
A set of objects used for comparison.
Keyword Args:
parse_strings_as_datetimes (boolean or None) : If True values provided in value_set will be parsed as \
datetimes before making comparisons.
Other Parameters:
result_format (str or None): \
Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
For more detail, see :ref:`result_format <result_format>`.
include_config (boolean): \
If True, then include the expectation config as part of the result object. \
For more detail, see :ref:`include_config`.
catch_exceptions (boolean or None): \
If True, then catch exceptions and include them as part of the result object. \
For more detail, see :ref:`catch_exceptions`.
meta (dict or None): \
A JSON-serializable dictionary (nesting allowed) that will be included in the output without modification. \
For more detail, see :ref:`meta`.
Returns:
A JSON-serializable expectation result object.
Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
See Also:
expect_column_distinct_values_to_contain_set
"""
if parse_strings_as_datetimes:
parsed_value_set = self._parse_value_set(value_set)
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set.issubset(expected_value_set),
"result": {
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

@DocInherit
@MetaDataset.column_aggregate_expectation
def expect_column_distinct_values_to_equal_set(self,
@@ -1714,13 +1803,17 @@ def expect_column_distinct_values_to_equal_set(self,
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(self.get_column_value_counts(column).index)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set == expected_value_set,
"result": {
"observed_value": sorted(list(observed_value_set))
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

@@ -1792,13 +1885,17 @@ def expect_column_distinct_values_to_contain_set(self,
else:
parsed_value_set = value_set

observed_value_counts = self.get_column_value_counts(column)
expected_value_set = set(parsed_value_set)
observed_value_set = set(self.get_column_value_counts(column).index)
observed_value_set = set(observed_value_counts.index)

return {
"success": observed_value_set.issuperset(expected_value_set),
"result": {
"observed_value": sorted(list(observed_value_set))
"observed_value": sorted(list(observed_value_set)),
"details": {
"value_counts": observed_value_counts
}
}
}

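A hedged usage sketch of the new aggregate expectation: ge.from_pandas wraps a plain DataFrame, and the value_counts detail that all three distinct-values expectations now emit appears in the richer result formats (e.g. COMPLETE). The column mirrors the docstring example above:

import great_expectations as ge
import pandas as pd

my_df = ge.from_pandas(pd.DataFrame({"my_col": [1, 2, 2, 3, 3, 3]}))

result = my_df.expect_column_distinct_values_to_be_in_set(
    "my_col", [2, 3, 4], result_format="COMPLETE")

# 1 occurs in the column but not in the value set, so the expectation fails,
# while the observed distinct values and their counts are still reported.
print(result["success"])                            # False
print(result["result"]["observed_value"])           # [1, 2, 3]
print(result["result"]["details"]["value_counts"])  # counts for 1, 2 and 3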
8 changes: 7 additions & 1 deletion great_expectations/dataset/pandas_dataset.py
@@ -451,7 +451,13 @@ def expect_column_values_to_be_in_type_list(self, column, type_list,
# Build one type list with each specified type list from type_map
target_type_list = list()
for type_ in type_list:
target_type_list += type_map[type_]
try:
target_type_list += type_map[type_]
except KeyError:
logger.warning("Unrecognized type: %s" % type_)

if len(target_type_list) == 0:
raise ValueError("No recognized pandas types in type_list")

return column.map(lambda x: isinstance(x, tuple(target_type_list)))

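The change above makes type lookup lenient per entry but strict overall: an unknown type name is logged and skipped, and only an entirely unrecognized type_list raises. A minimal sketch of that pattern, with a toy type_map standing in for the module's real pandas type map:

import logging

logger = logging.getLogger(__name__)

type_map = {"int": [int], "float": [float], "str": [str]}  # toy stand-in

def resolve_target_types(type_list):
    target_type_list = list()
    for type_ in type_list:
        try:
            target_type_list += type_map[type_]
        except KeyError:
            # Skip unknown names instead of failing the whole expectation.
            logger.warning("Unrecognized type: %s" % type_)
    if len(target_type_list) == 0:
        raise ValueError("No recognized pandas types in type_list")
    return tuple(target_type_list)

print(resolve_target_types(["int", "bogus"]))  # warns on "bogus", returns (<class 'int'>,)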
2 changes: 1 addition & 1 deletion great_expectations/datasource/sqlalchemy_source.py
@@ -43,7 +43,7 @@ def __init__(self, datasource, name="default"):

def _get_iterator(self, data_asset_name, **kwargs):
if self._queries_path:
if data_asset_name in [path for path in os.walk(self._queries_path) if str(path).endswith(".sql")]:
if data_asset_name in [path[:-4] for path in os.listdir(self._queries_path) if str(path).endswith(".sql")]:
with open(os.path.join(self._queries_path, data_asset_name) + ".sql", "r") as data:
return iter([{
"query": data.read(),
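This fixes a real bug: os.walk() yields (dirpath, dirnames, filenames) tuples, so the old membership test could never match a data asset name, and the filenames it was meant to compare still carried their ".sql" extension. os.listdir() returns bare filenames, and path[:-4] strips the suffix. A small sketch of the corrected lookup, with hypothetical helper names:

import os

def available_query_assets(queries_path):
    # One data asset per .sql file, named without its extension,
    # e.g. "user_counts.sql" -> "user_counts".
    return [fname[:-4] for fname in os.listdir(queries_path)
            if fname.endswith(".sql")]

def load_query(queries_path, data_asset_name):
    if data_asset_name in available_query_assets(queries_path):
        with open(os.path.join(queries_path, data_asset_name) + ".sql", "r") as data:
            return data.read()
    raise ValueError("No query file for data asset: %s" % data_asset_name)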
@@ -45,8 +45,8 @@
"metadata": {},
"outputs": [],
"source": [
"# context = ge.data_context.DataContext('../../', expectation_explorer=True)\n",
"context = ge.data_context.DataContext('../../', expectation_explorer=False)"
"# context = ge.data_context.DataContext('../', expectation_explorer=True)\n",
"context = ge.data_context.DataContext('../', expectation_explorer=False)"
]
},
{
@@ -121,7 +121,7 @@
"metadata": {},
"outputs": [],
"source": [
"great_expectations.jupyter_ux.get_available_data_asset_names(context, data_source_name=data_source_name)"
"great_expectations.jupyter_ux.list_available_data_asset_names(context, data_source_name=data_source_name)"
]
},
{