Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Add ability to summarize selected intents using Preprocessor (#32)
Browse files Browse the repository at this point in the history
* Add column_summary field to BaseIntent and fix broken tests
* Implement column_summary field for NumericIntent and CategoricalIntent
* Implement summarize functionality for Preprocessor using selected intents and column_summaries
* Update requirements to fix CD error
  • Loading branch information
adithyabsk committed Jan 18, 2019
1 parent 171c35c commit 3b3b017
Show file tree
Hide file tree
Showing 12 changed files with 436 additions and 9 deletions.
14 changes: 14 additions & 0 deletions foreshadow/intents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ def is_intent(cls, df):
"""
pass # pragma: no cover

@classmethod
@check_base
@abstractmethod
def column_summary(cls, df):
    """Compute relevant statistics and return a JSON dict of those values.

    Abstract hook: the body here is intentionally empty and concrete
    intents are expected to provide the implementation.
    NOTE(review): ``check_base`` is defined outside this view; presumably it
    rejects calls made directly on ``BaseIntent`` -- confirm.

    Args:
        df: pd.DataFrame to summarize

    Returns:
        A JSON dict of relevant statistics
    """
    pass # pragma: no cover

@classmethod
def _check_intent(cls):
"""Validate class variables are setup properly"""
Expand Down
102 changes: 102 additions & 0 deletions foreshadow/intents/general.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
General intents definitions
"""
import json
from collections import OrderedDict

import pandas as pd
import numpy as np
Expand All @@ -11,6 +13,39 @@
from ..transformers.smart import SimpleImputer, MultiImputer, Scaler, Encoder


def _mode_freq(s, count=10):
    """Compute the mode(s) and the most frequent values of a series.

    Args:
        s (pandas.Series): the series to analyze
        count (int): how many of the most frequent values to report

    Returns:
        A tuple ``(modes, top)`` where ``modes`` is the list of modal values
        and ``top`` is a list of ``[value, frequency count, fraction]`` rows
        for the ``count`` most common values. The fraction is computed
        against ``s.size``, so missing values count towards the denominator.
    """
    mode = s.mode().values.tolist()
    top = s.value_counts().nlargest(count)
    # Build the table from the raw index/values under fixed column labels
    # instead of reset_index() + a "PCT" column: the labels produced by
    # reset_index() depend on the series name, so a series called "PCT"
    # (or "count" on recent pandas) would clobber a column or raise.
    vc = pd.DataFrame(
        {
            "value": top.index,
            "count": top.values,
            "pct": top.values / s.size,
        },
        columns=["value", "count", "pct"],
    )
    return (mode, vc.values.tolist())


def _outliers(s, count=10):
    """Return the largest-magnitude outliers of a numeric series.

    A value is treated as an outlier when it lies more than three standard
    deviations away from the mean of ``s``.

    Args:
        s (pandas.Series): the series to analyze
        count (int): the n largest (magnitude) outliers to keep

    Returns:
        A pandas.Series holding at most ``count`` outliers, ordered by
        decreasing absolute value and keeping their original index labels.
    """
    out_ser = s[np.abs(s - s.mean()) > (3 * s.std())]

    # Rank on a throw-away positional index and select with iloc. Selecting
    # by label (.loc) would return every row sharing a repeated index label,
    # and routing through a helper column would collide with a series that
    # happens to carry the same name as that column.
    positions = (
        out_ser.abs().reset_index(drop=True).nlargest(count).index.tolist()
    )
    return out_ser.iloc[positions]


class GenericIntent(BaseIntent):
"""See base class.
Expand All @@ -35,6 +70,11 @@ def is_intent(cls, df):
"""Returns true by default such that a column must match this"""
return True

@classmethod
def column_summary(cls, df):
    """Return an empty summary: a generic column has no statistics."""
    summary = dict()
    return summary


class NumericIntent(GenericIntent):
"""See base class.
Expand Down Expand Up @@ -66,6 +106,52 @@ def is_intent(cls, df):
.all()
)

@classmethod
def column_summary(cls, df):
    """Return computed statistics for a NumericIntent column.

    Args:
        df: single-column pd.DataFrame to summarize; only the first column
            is read. A pd.Series is accepted as well.

    Returns:
        An OrderedDict with the following keys:
            nan: count of nans passed into the dataset
            invalid: number of additional values that become nan when the
                column is converted to numeric
            mean: -
            std: -
            min: -
            25th: 25th percentile
            median: -
            75th: 75th percentile
            max: -
            mode: list of modal values
            top10: up to 10 most frequent values as
                [[value, count, fraction], ...]
            10outliers: up to 10 values lying more than 3 standard
                deviations from the mean, largest magnitude first
    """
    # ``.ix`` was removed in pandas 1.0; ``.iloc`` is the positional
    # equivalent. A Series is used as-is since it is already one column.
    data = df.iloc[:, 0] if isinstance(df, pd.DataFrame) else df
    nan_num = int(data.isnull().sum())
    # Values that only turn null after coercion are the non-numeric ones.
    invalid_num = int(
        pd.to_numeric(data, errors="coerce").isnull().sum() - nan_num
    )
    outliers = _outliers(data).values.tolist()
    mode, top10 = _mode_freq(data)

    return OrderedDict(
        [
            ("nan", nan_num),
            ("invalid", invalid_num),
            ("mean", data.mean()),
            ("std", data.std()),
            ("min", data.min()),
            ("25th", data.quantile(0.25)),
            ("median", data.quantile()),  # quantile() defaults to q=0.5
            ("75th", data.quantile(0.75)),
            ("max", data.max()),
            ("mode", mode),
            ("top10", top10),
            ("10outliers", outliers),
        ]
    )


class CategoricalIntent(GenericIntent):
"""See base class.
Expand Down Expand Up @@ -94,3 +180,19 @@ def is_intent(cls, df):
return True
else:
return (1.0 * data.nunique() / data.count()) < 0.2

@classmethod
def column_summary(cls, df):
    """Return computed statistics for a CategoricalIntent column.

    Args:
        df: single-column pd.DataFrame to summarize; only the first column
            is read. A pd.Series is accepted as well.

    Returns:
        An OrderedDict with the following keys:
            nan: count of nans passed into the dataset
            mode: list of modal values
            top10: up to 10 most frequent values as
                [[value, count, fraction], ...]
    """
    # ``.ix`` was removed in pandas 1.0; ``.iloc`` is the positional
    # equivalent. A Series is used as-is since it is already one column.
    data = df.iloc[:, 0] if isinstance(df, pd.DataFrame) else df
    nan_num = int(data.isnull().sum())
    mode, top10 = _mode_freq(data)

    return OrderedDict([("nan", nan_num), ("mode", mode), ("top10", top10)])
18 changes: 18 additions & 0 deletions foreshadow/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,24 @@ def serialize(self):
"y_var": self.y_var,
}

def summarize(self, df):
    """Use each column's selected intent to generate statistics.

    Args:
        df (pandas.DataFrame): The DataFrame to analyze

    Returns:
        A json dictionary with one key per column in the intent map. Each
        value is a dict holding the intent's class name under "intent" and
        the result of that intent's column_summary() under "data".
    """
    return {
        column: {
            "intent": intent.__name__,
            # df[[column]] keeps the selection a one-column DataFrame,
            # which is what column_summary() is documented to receive;
            # df[column] would hand over a Series instead.
            "data": intent.column_summary(df[[column]]),
        }
        for column, intent in self._intent_map.items()
    }

def fit(self, X, y=None, **fit_params):
"""Fits internal pipeline to X data
Expand Down
67 changes: 64 additions & 3 deletions foreshadow/tests/test_intents/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,26 @@ def test_call_classmethod_from_BaseIntent():
BaseIntent.priority_traverse()

with pytest.raises(TypeError) as e3:
BaseIntent.is_intent()
BaseIntent.is_intent([])

with pytest.raises(TypeError) as e4:
BaseIntent.column_summary([])

assert "cannot be called on BaseIntent" in str(e1.value)
assert "cannot be called on BaseIntent" in str(e2.value)
assert "cannot be called on BaseIntent" in str(e3.value)
assert "cannot be called on BaseIntent" in str(e4.value)


def test_mock_subclass_missing_abstract_methods():
    """Defining a BaseIntent subclass without its abstract methods must fail.

    The check only matches the common message prefix, so it does not depend
    on which abstract methods are listed or in what order.
    """
    from foreshadow.intents.base import BaseIntent

    # The error is expected while the class body is being evaluated.
    with pytest.raises(NotImplementedError) as e:

        class TestIntent(BaseIntent):
            pass

    assert "has not implemented abstract methods" in str(e.value)


def test_mock_subclass_missing_children():
Expand All @@ -39,6 +43,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"

assert "Subclass must define" in str(e.value)
Expand All @@ -54,6 +62,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []

Expand All @@ -70,6 +82,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []
single_pipeline_template = []
Expand All @@ -86,6 +102,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []
single_pipeline_template = []
Expand All @@ -109,6 +129,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent1(TestIntent):
dtype = "TEST"
children = ["TestIntent11", "TestIntent12"]
Expand All @@ -119,6 +143,10 @@ class TestIntent1(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent2(TestIntent):
dtype = "TEST"
children = []
Expand All @@ -129,6 +157,10 @@ class TestIntent2(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent11(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -139,6 +171,10 @@ class TestIntent11(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent12(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -149,6 +185,10 @@ class TestIntent12(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class_list = [
"TestIntent",
"TestIntent1",
Expand Down Expand Up @@ -177,6 +217,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent1(TestIntent):
dtype = "TEST"
children = ["TestIntent11", "TestIntent12"]
Expand All @@ -187,6 +231,10 @@ class TestIntent1(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent2(TestIntent):
dtype = "TEST"
children = []
Expand All @@ -197,6 +245,10 @@ class TestIntent2(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent11(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -207,6 +259,10 @@ class TestIntent11(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent12(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -217,6 +273,11 @@ class TestIntent12(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class_list = [TestIntent, TestIntent1, TestIntent11, TestIntent12, TestIntent2]

assert class_list == list(TestIntent.priority_traverse())
_unregister_intent(list(map(lambda x: x.__name__, class_list)))

0 comments on commit 3b3b017

Please sign in to comment.