Skip to content
This repository has been archived by the owner on Jan 9, 2024. It is now read-only.

Commit

Permalink
Merge ef51a25 into 33eee1c
Browse files Browse the repository at this point in the history
  • Loading branch information
adithyabsk committed Jan 2, 2019
2 parents 33eee1c + ef51a25 commit 093867b
Show file tree
Hide file tree
Showing 8 changed files with 394 additions and 4 deletions.
14 changes: 14 additions & 0 deletions foreshadow/intents/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,20 @@ def is_intent(cls, df):
"""
pass # pragma: no cover

@classmethod
@check_base
@abstractmethod
def column_summary(cls, df):
"""Compute relevant statistics and return a JSON dict of those values.

Abstract method: this base definition has an empty body.

Args:
df: pd.DataFrame to summarize

Returns:
A JSON dict of relevant statistics
"""
pass # pragma: no cover

@classmethod
def _check_intent(cls):
"""Validate class variables are setup properly"""
Expand Down
91 changes: 91 additions & 0 deletions foreshadow/intents/general.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""
General intent definitions
"""
import json
from collections import OrderedDict

import pandas as pd
import numpy as np
Expand All @@ -11,6 +13,25 @@
from ..transformers.smart import SimpleImputer, MultiImputer, Scaler, Encoder


def mode_freq(series, count=10):
    """Compute the mode and the most frequent values of a series.

    Args:
        series (pandas.Series): the series to analyze
        count (int): the maximum number of most frequent values to return

    Returns:
        tuple: ``(mode, top)``.

        * When the series holds exactly one distinct non-null value the
          result is ``(None, [])``.
        * Otherwise ``mode`` is the single modal value, or a list of values
          when several values tie (an empty list when there are no non-null
          values), and ``top`` is a list of ``[value, frequency]`` pairs for
          the ``count`` most frequent values, most frequent first.
    """
    # Single distinct value: report no mode and no frequency table. This is
    # checked first so value_counts() is not computed only to be discarded.
    if series.dropna().nunique() == 1:
        return None, []

    mode = series.mode().values.tolist()
    if len(mode) == 1:
        # Unwrap a unique mode so callers get a scalar rather than a list.
        mode = mode[0]

    top = series.value_counts().nlargest(count).reset_index().values.tolist()
    return mode, top


class GenericIntent(BaseIntent):
"""See base class.
Expand All @@ -35,6 +56,11 @@ def is_intent(cls, df):
"""Returns true by default such that a column must match this"""
return True

@classmethod
def column_summary(cls, df):
    """Return an empty summary.

    No statistics can be computed for a general column, so ``df`` is not
    inspected.
    """
    summary = {}
    return summary


class NumericIntent(GenericIntent):
"""See base class.
Expand Down Expand Up @@ -66,6 +92,55 @@ def is_intent(cls, df):
.all()
)

@classmethod
def column_summary(cls, df):
    """Return computed statistics for a NumericIntent column.

    Only the first column of ``df`` is summarized. Numeric statistics are
    computed on the column after coercion with ``pd.to_numeric``, so entries
    that cannot be parsed are counted as invalid and then ignored instead of
    making the statistics fail.

    Args:
        df: pd.DataFrame whose first column is summarized

    Returns:
        An OrderedDict with the following keys, in order:
            nan: count of nans passed into the dataset
            invalid: number of additional values that became NaN after
                converting to numeric
            mean: -
            std: -
            min: -
            25th: 25th percentile
            median: -
            75th: 75th percentile
            max: -
            mode: value returned by mode_freq for the raw column
            top10: up to 10 most frequent raw values as
                [[value, count], ...]
            10outliers: the 10 largest values lying more than 3 standard
                deviations from the mean
    """
    # .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
    raw = df.iloc[:, 0]
    nan_num = int(raw.isnull().sum())

    # Coerce once; anything that is NaN now but was not before is invalid.
    data = pd.to_numeric(raw, errors="coerce")
    invalid_num = int(data.isnull().sum() - nan_num)

    mean = data.mean()
    std = data.std()
    outliers = (
        data[np.abs(data - mean) > (3 * std)].nlargest(10).values.tolist()
    )

    # Frequencies are taken over the values as they were passed in.
    mode, top10 = mode_freq(raw)

    return OrderedDict(
        [
            ("nan", nan_num),
            ("invalid", invalid_num),
            ("mean", mean),
            ("std", std),
            ("min", data.min()),
            ("25th", data.quantile(0.25)),
            ("median", data.quantile(0.5)),
            ("75th", data.quantile(0.75)),
            ("max", data.max()),
            ("mode", mode),
            ("top10", top10),
            ("10outliers", outliers),
        ]
    )


class CategoricalIntent(GenericIntent):
"""See base class.
Expand Down Expand Up @@ -94,3 +169,19 @@ def is_intent(cls, df):
return True
else:
return (1.0 * data.nunique() / data.count()) < 0.2

@classmethod
def column_summary(cls, df):
    """Return computed statistics for a CategoricalIntent column.

    Only the first column of ``df`` is summarized.

    Args:
        df: pd.DataFrame whose first column is summarized

    Returns:
        An OrderedDict with the following keys, in order:
            nan: count of nans passed into the dataset
            mode: value returned by mode_freq for the column
            top10: up to 10 most frequent values as [[value, count], ...]
    """
    # .iloc replaces DataFrame.ix, which was removed in pandas 1.0.
    data = df.iloc[:, 0]
    nan_num = int(data.isnull().sum())
    mode, top10 = mode_freq(data)

    return OrderedDict([("nan", nan_num), ("mode", mode), ("top10", top10)])
18 changes: 18 additions & 0 deletions foreshadow/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,24 @@ def serialize(self):
"y_var": self.y_var,
}

def summarize(self, df):
    """Use each column's selected intent to generate statistics.

    Args:
        df (pandas.DataFrame): The DataFrame to analyze

    Returns:
        A json dictionary with one key per column in the intent map. Each
        value is a dict holding the intent's class name under "intent" and
        the result of that intent's column_summary() under "data".
    """
    return {
        column: {
            "intent": intent.__name__,
            # df[[column]] keeps a single-column DataFrame; df[column] would
            # hand column_summary() a Series, which it cannot index with
            # [:, 0].
            "data": intent.column_summary(df[[column]]),
        }
        for column, intent in self._intent_map.items()
    }

def fit(self, X, y=None, **fit_params):
"""Fits internal pipeline to X data
Expand Down
66 changes: 63 additions & 3 deletions foreshadow/tests/test_intents/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,26 @@ def test_call_classmethod_from_BaseIntent():
BaseIntent.priority_traverse()

with pytest.raises(TypeError) as e3:
BaseIntent.is_intent()
BaseIntent.is_intent([])

with pytest.raises(TypeError) as e4:
BaseIntent.column_summary([])

assert "cannot be called on BaseIntent" in str(e1.value)
assert "cannot be called on BaseIntent" in str(e2.value)
assert "cannot be called on BaseIntent" in str(e3.value)
assert "cannot be called on BaseIntent" in str(e4.value)


def test_mock_subclass_missing_is_intent():
def test_mock_subclass_missing_abstract_methods():
from foreshadow.intents.base import BaseIntent

with pytest.raises(NotImplementedError) as e:

class TestIntent(BaseIntent):
pass

assert "has not implemented abstract methods is_intent" in str(e.value)
assert "has not implemented abstract methods" in str(e.value)


def test_mock_subclass_missing_children():
Expand All @@ -39,6 +43,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"

assert "Subclass must define" in str(e.value)
Expand All @@ -54,6 +62,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []

Expand All @@ -70,6 +82,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []
single_pipeline_template = []
Expand All @@ -86,6 +102,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

dtype = "TEST"
children = []
single_pipeline_template = []
Expand All @@ -109,6 +129,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent1(TestIntent):
dtype = "TEST"
children = ["TestIntent11", "TestIntent12"]
Expand All @@ -119,6 +143,10 @@ class TestIntent1(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent2(TestIntent):
dtype = "TEST"
children = []
Expand All @@ -129,6 +157,10 @@ class TestIntent2(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent11(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -139,6 +171,10 @@ class TestIntent11(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent12(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -149,6 +185,10 @@ class TestIntent12(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class_list = [
"TestIntent",
"TestIntent1",
Expand Down Expand Up @@ -177,6 +217,10 @@ class TestIntent(BaseIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent1(TestIntent):
dtype = "TEST"
children = ["TestIntent11", "TestIntent12"]
Expand All @@ -187,6 +231,10 @@ class TestIntent1(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent2(TestIntent):
dtype = "TEST"
children = []
Expand All @@ -197,6 +245,10 @@ class TestIntent2(TestIntent):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent11(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -207,6 +259,10 @@ class TestIntent11(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class TestIntent12(TestIntent1):
dtype = "TEST"
children = []
Expand All @@ -217,6 +273,10 @@ class TestIntent12(TestIntent1):
def is_intent(cls, df):
return True

@classmethod
def column_summary(cls, df):
return {}

class_list = [TestIntent, TestIntent1, TestIntent2, TestIntent11, TestIntent12]
assert class_list == list(TestIntent.priority_traverse())
_unregister_intent(list(map(lambda x: x.__name__, class_list)))

0 comments on commit 093867b

Please sign in to comment.