In [1]:
import numpy as np
import pandas as pd

import great_expectations as ge
from great_expectations import jupyter_ux

ge.__version__

2019-09-03T08:15:44-0400 - INFO - Great Expectations logging enabled at INFO level by JupyterUX module.


'0.7.8__develop'

In [2]:
## Build a sample dataset

arr = np.random.normal(0, 1, size=(1000, 2))
df = ge.dataset.PandasDataset(arr)

In [3]:
## Add nan to one of the columns
df.iloc[df.sample(n=200, random_state=42).index, 0] = np.nan

2019-09-03T08:15:44-0400 - INFO - 	0 expectation(s) included in expectation_suite.


In [4]:
# Sanity check

print(df[0].mean())
print(df[1].mean())
print(len(df[0]))
print(len(df[0]))

-0.010992152593699036
0.02775779505956932
1000
1000


In [5]:
# Sanity check with expectation; notice missing count == 200 as expected
df.expect_column_values_to_be_between(0, -2, 2, mostly=0.95)

{'success': True,
 'result': {'element_count': 1000,
  'missing_count': 200,
  'missing_percent': 0.2,
  'unexpected_count': 35,
  'unexpected_percent': 0.035,
  'unexpected_percent_nonmissing': 0.04375,
  'partial_unexpected_list': [-2.4033208540028412,
   2.1502800125318458,
   2.073732781983799,
   -2.249650417687381,
   2.04795047563228,
   -2.1726062732984786,
   -3.0591044634104514,
   -2.2027013079982862,
   -3.4556200297278057,
   2.2348324470898677,
   2.4558580336475506,
   -2.2315575942268113,
   2.8059498930506224,
   2.4093065996175094,
   -2.2105236145371396,
   2.613132377841119,
   2.165261195568698,
   3.5796465541410876,
   2.181309684837725,
   -2.4369409996835043]}}

In [6]:
# Sanity check with expectation; notice missing count == 0 as expected
df.expect_column_values_to_be_between(1, -2, 2, mostly=0.95)

{'success': True,
 'result': {'element_count': 1000,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 47,
  'unexpected_percent': 0.047,
  'unexpected_percent_nonmissing': 0.047,
  'partial_unexpected_list': [-3.125848356837646,
   -2.020496744927424,
   3.361073270662882,
   2.002711626637642,
   -2.202524145192944,
   -2.057317226179811,
   -2.0523993802762965,
   2.151414645665574,
   2.399518487894312,
   -2.6512801651040903,
   2.2140311059608813,
   -2.0331049035979034,
   -2.513228680180754,
   -2.0609690728228163,
   2.8796064746097585,
   -2.4002678271683173,
   -2.534450679847885,
   2.692123055869081,
   -2.0694862842788857,
   2.289306339883892]}}

In [7]:
## Observe failure: we cannot use continuous partition without 
part_0 = ge.dataset.util.continuous_partition_data(df[0])

ValueError: autodetected range of [nan, nan] is not finite

In [8]:
## Observe success: without NAN everything is as expected
part_1 = ge.dataset.util.continuous_partition_data(df[1])

In [9]:
## Simple workaround: we can ask for 'uniform' bins instead of auto.
part_0_uniform = ge.dataset.util.continuous_partition_data(df[0], bins='uniform')

# DRAWBACK: we lose the auto guessing about the *number* of bins
print(len(part_1["bins"]))
print(len(part_0_uniform["bins"]))

28
11


In [10]:
## Sligthly better workaround: use a heuristic like numpy does to guess the right n_bins
# See: https://docs.scipy.org/doc/numpy/reference/generated/numpy.histogram_bin_edges.html#numpy.histogram_bin_edges
# Manually use max of sturges and fd

n_bins = np.max(
  np.log2(len(df[0])),
  # cannot use sturges since np.percentile(df[0], [0.25, 0.75]) returns nan
)
part_0_uniform_n_bins = ge.dataset.util.continuous_partition_data(df[0], bins='uniform', n_bins=np.log2(len(df[0])))

# This works, and will eventually scale with bigger data, but still suggests fewer bins than fd
# However, fd requires quartiles, which np also fails to provide when the data contains NAN
print(len(part_1["bins"]))
print(len(part_0_uniform["bins"]))

28
11


In [11]:
# np.histogram also gives another workaround: specifying range. 
## Recommendation: We should accept kwargs to pass to np.histogram
b, e = np.histogram(df[0], range=(df[0].min(), df[0].max()))

# Note that this still doesn't give the same nice behavior as auto
len(e)

11