Add ability to summarize selected intents using Preprocessor (#32)

* Add column_summary field to BaseIntent and fix broken tests * Implement column_summary field for NumericIntent and CategoricalIntent * Implement summarize functionality for Preprocessor using selected intents and column_summaries * Update requirements to fix CD error
georgian-io-archive · Jan 18, 2019 · 3b3b017 · 3b3b017
1 parent 171c35c
commit 3b3b017
Show file tree

Hide file tree

Showing 12 changed files with 436 additions and 9 deletions.
diff --git a/foreshadow/intents/base.py b/foreshadow/intents/base.py
@@ -143,6 +143,20 @@ def is_intent(cls, df):
         """
         pass  # pragma: no cover
 
+    @classmethod
+    @check_base
+    @abstractmethod
+    def column_summary(cls, df):
+        """Computes relevant statistics and returns a JSON dict of those values
+
+        Args:
+            df: pd.DataFrame to summarize
+
+        Returns:
+            A JSON dict of relevant statistics
+        """
+        pass  # pragma: no cover
+
     @classmethod
     def _check_intent(cls):
         """Validate class variables are setup properly"""

diff --git a/foreshadow/intents/general.py b/foreshadow/intents/general.py
@@ -1,6 +1,8 @@
 """
 General intents defenitions
 """
+import json
+from collections import OrderedDict
 
 import pandas as pd
 import numpy as np
@@ -11,6 +13,39 @@
 from ..transformers.smart import SimpleImputer, MultiImputer, Scaler, Encoder
 
 
+def _mode_freq(s, count=10):
+    """Finds the modes of a series along with its most common values
+
+        Args:
+            s (pandas.Series): the series to analyze
+            count (int): how many of the most frequent values to report
+
+        Returns:
+            A tuple of (list of modes, list of [value, frequency count,
+            fraction of all rows] for the count most common values)
+    """
+    modes = s.mode().values.tolist()
+    top = s.value_counts().nlargest(count).reset_index()
+    top = top.assign(PCT=top.iloc[:, -1] / s.size)
+    return modes, top.values.tolist()
+
+
+def _outliers(s, count=10):
+    """Returns the largest-magnitude values lying over 3 std from the mean
+
+        Args:
+            s (pandas.Series): the series to analyze
+            count (int): the n largest (magnitude) outliers
+
+        Returns a pandas.Series of at most count outliers, largest first
+    """
+    out_ser = s[np.abs(s - s.mean()) > (3 * s.std())]
+    # Rank by position, not label: a helper "selector" column would clobber
+    # a series of that name and .loc would repeat duplicate index labels.
+    top_pos = out_ser.abs().reset_index(drop=True).nlargest(count).index
+    return out_ser.iloc[top_pos.values]
+
+
 class GenericIntent(BaseIntent):
     """See base class.
 
@@ -35,6 +70,11 @@ def is_intent(cls, df):
         """Returns true by default such that a column must match this"""
         return True
 
+    @classmethod
+    def column_summary(cls, df):
+        """A generic column has no statistics to report; returns an empty dict"""
+        return dict()
+
 
 class NumericIntent(GenericIntent):
     """See base class.
@@ -66,6 +106,52 @@ def is_intent(cls, df):
             .all()
         )
 
+    @classmethod
+    def column_summary(cls, df):
+        """Returns computed statistics for a NumericIntent column
+
+            Values are coerced to numeric first so that unparseable entries
+            are counted as invalid rather than breaking the statistics.
+
+            Args:
+                df (pandas.DataFrame): frame whose first column is analyzed
+
+            Returns an OrderedDict with the following keys:
+                nan: count of nans passed into dataset
+                invalid: number of invalid values after converting to numeric
+                mean, std, min, max: the usual descriptive statistics
+                25th, median, 75th: 25th, 50th and 75th percentiles
+                mode: list of the most frequent value(s)
+                top10: top 10 most frequent values as
+                    [[value, count, fraction of rows],...,]
+                10outliers: largest 10 outliers
+        """
+
+        # .ix was deprecated in pandas 0.20; .iloc is the positional form
+        raw = df.iloc[:, 0]
+        data = pd.to_numeric(raw, errors="coerce")
+        nan_num = int(raw.isnull().sum())
+        invalid_num = int(data.isnull().sum()) - nan_num
+        outliers = _outliers(data).values.tolist()
+        mode, top10 = _mode_freq(data)
+
+        return OrderedDict(
+            [
+                ("nan", nan_num),
+                ("invalid", invalid_num),
+                ("mean", data.mean()),
+                ("std", data.std()),
+                ("min", data.min()),
+                ("25th", data.quantile(0.25)),
+                ("median", data.quantile()),
+                ("75th", data.quantile(0.75)),
+                ("max", data.max()),
+                ("mode", mode),
+                ("top10", top10),
+                ("10outliers", outliers),
+            ]
+        )
+
 
 class CategoricalIntent(GenericIntent):
     """See base class.
@@ -94,3 +180,19 @@ def is_intent(cls, df):
             return True
         else:
             return (1.0 * data.nunique() / data.count()) < 0.2
+
+    @classmethod
+    def column_summary(cls, df):
+        """Returns computed statistics for a CategoricalIntent column
+
+            The following are computed:
+                nan: count of nans passed into dataset
+                mode: list of the most frequent value(s)
+                top10: top 10 most frequent values as
+                    [[value, count, fraction of rows],...,]
+        """
+        data = df.iloc[:, 0]  # .ix is deprecated; select positionally
+        nan_num = int(data.isnull().sum())
+        mode, top10 = _mode_freq(data)
+
+        return OrderedDict([("nan", nan_num), ("mode", mode), ("top10", top10)])
diff --git a/foreshadow/preprocessor.py b/foreshadow/preprocessor.py
@@ -372,6 +372,24 @@ def serialize(self):
             "y_var": self.y_var,
         }
 
+    def summarize(self, df):
+        """Uses each column's selected intent to generate statistics
+
+            Args:
+                df (pandas.DataFrame): The DataFrame to analyze
+
+            Returns: A json dictionary keyed by column whose values hold the
+                intent name and the results of that intent's column_summary()
+        """
+        # df[[k]] keeps a one-column DataFrame, which column_summary expects
+        return {
+            k: {
+                "intent": intent.__name__,
+                "data": intent.column_summary(df[[k]]),
+            }
+            for k, intent in self._intent_map.items()
+        }
+
     def fit(self, X, y=None, **fit_params):
         """Fits internal pipeline to X data
 

diff --git a/foreshadow/tests/test_intents/test_base.py b/foreshadow/tests/test_intents/test_base.py
@@ -11,22 +11,26 @@ def test_call_classmethod_from_BaseIntent():
         BaseIntent.priority_traverse()
 
     with pytest.raises(TypeError) as e3:
-        BaseIntent.is_intent()
+        BaseIntent.is_intent([])
+
+    with pytest.raises(TypeError) as e4:
+        BaseIntent.column_summary([])
 
     assert "cannot be called on BaseIntent" in str(e1.value)
     assert "cannot be called on BaseIntent" in str(e2.value)
     assert "cannot be called on BaseIntent" in str(e3.value)
+    assert "cannot be called on BaseIntent" in str(e4.value)
 
 
-def test_mock_subclass_missing_is_intent():
+def test_mock_subclass_missing_abstract_methods():
     from foreshadow.intents.base import BaseIntent
 
     with pytest.raises(NotImplementedError) as e:
 
         class TestIntent(BaseIntent):
             pass
 
-    assert "has not implemented abstract methods is_intent" in str(e.value)
+    assert "has not implemented abstract methods" in str(e.value)
 
 
 def test_mock_subclass_missing_children():
@@ -39,6 +43,10 @@ class TestIntent(BaseIntent):
             def is_intent(cls, df):
                 return True
 
+            @classmethod
+            def column_summary(cls, df):
+                return {}
+
             dtype = "TEST"
 
     assert "Subclass must define" in str(e.value)
@@ -54,6 +62,10 @@ class TestIntent(BaseIntent):
             def is_intent(cls, df):
                 return True
 
+            @classmethod
+            def column_summary(cls, df):
+                return {}
+
             dtype = "TEST"
             children = []
 
@@ -70,6 +82,10 @@ class TestIntent(BaseIntent):
             def is_intent(cls, df):
                 return True
 
+            @classmethod
+            def column_summary(cls, df):
+                return {}
+
             dtype = "TEST"
             children = []
             single_pipeline_template = []
@@ -86,6 +102,10 @@ class TestIntent(BaseIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
         dtype = "TEST"
         children = []
         single_pipeline_template = []
@@ -109,6 +129,10 @@ class TestIntent(BaseIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent1(TestIntent):
         dtype = "TEST"
         children = ["TestIntent11", "TestIntent12"]
@@ -119,6 +143,10 @@ class TestIntent1(TestIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent2(TestIntent):
         dtype = "TEST"
         children = []
@@ -129,6 +157,10 @@ class TestIntent2(TestIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent11(TestIntent1):
         dtype = "TEST"
         children = []
@@ -139,6 +171,10 @@ class TestIntent11(TestIntent1):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent12(TestIntent1):
         dtype = "TEST"
         children = []
@@ -149,6 +185,10 @@ class TestIntent12(TestIntent1):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class_list = [
         "TestIntent",
         "TestIntent1",
@@ -177,6 +217,10 @@ class TestIntent(BaseIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent1(TestIntent):
         dtype = "TEST"
         children = ["TestIntent11", "TestIntent12"]
@@ -187,6 +231,10 @@ class TestIntent1(TestIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent2(TestIntent):
         dtype = "TEST"
         children = []
@@ -197,6 +245,10 @@ class TestIntent2(TestIntent):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent11(TestIntent1):
         dtype = "TEST"
         children = []
@@ -207,6 +259,10 @@ class TestIntent11(TestIntent1):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class TestIntent12(TestIntent1):
         dtype = "TEST"
         children = []
@@ -217,6 +273,11 @@ class TestIntent12(TestIntent1):
         def is_intent(cls, df):
             return True
 
+        @classmethod
+        def column_summary(cls, df):
+            return {}
+
     class_list = [TestIntent, TestIntent1, TestIntent11, TestIntent12, TestIntent2]
+
     assert class_list == list(TestIntent.priority_traverse())
     _unregister_intent(list(map(lambda x: x.__name__, class_list)))