In [1]:
import pandas as pd
import base64
import json
import pickle
import great_expectations as ge

from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
# Configure bokeh so that subsequent show() calls render inline in the notebook output.
output_notebook()

In [2]:
import numpy as np
from scipy import stats

In [3]:
# Seed NumPy's global generator so that Restart & Run All reproduces the same
# samples: scipy's frozen distributions draw from that generator when no
# random_state is passed to rvs().
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Every column holds the same number of draws so they fit one DataFrame.
SAMPLE_SIZE = 1000

# (column name, mean, standard deviation) of the normal distribution sampled.
# NOTE(review): 'norm_0_1_100' is drawn with 1000 samples like the others,
# despite its "_100" suffix. The name is kept because later cells reference
# the column by this exact string.
NORMAL_COLUMNS = [
    ('norm_0_1_1000', 0, 1),
    ('norm_0_1_1000_2', 0, 1),
    ('norm_0_1_100', 0, 1),
    ('norm_1_1_1000', 1, 1),
    ('norm_10_1_1000', 10, 1),
]

df = pd.DataFrame()
for column_name, loc, scale in NORMAL_COLUMNS:
    df[column_name] = stats.norm(loc=loc, scale=scale).rvs(size=SAMPLE_SIZE)

In [4]:
# Hand the sample frame to great_expectations so expectations can be declared on it.
# NOTE(review): later cells call expect_* methods directly on `df`, which suggests
# ge.df() upgrades `df` in place rather than returning an independent copy —
# confirm against the installed great_expectations version.
unreasonably_clean_data = ge.df(df)

## Remember that this is not a statistical test!

We are simply making expectations about the *sample* of data that we have in front of us.

In [5]:
# Acceptable range for the *sample* mean of the N(0, 1) column.
mean_lower_bound, mean_upper_bound = -0.1, 0.1
unreasonably_clean_data.expect_column_mean_to_be_between(
    'norm_0_1_1000', mean_lower_bound, mean_upper_bound)

{'exception_list': 0.03490171009127916, 'success': True}

In [6]:
def histogram_figure(values, title, bins=20):
    """Build a bokeh figure holding a density-normalised histogram.

    Parameters
    ----------
    values : array-like
        Numeric sample to bin; passed straight to ``np.histogram``.
    title : str
        Title shown above the figure.
    bins : int, optional
        Number of histogram bins (default 20).

    Returns
    -------
    The bokeh figure, with one quad glyph per bin and labelled axes.
    """
    fig = figure(title=title)
    heights, edges = np.histogram(values, density=True, bins=bins)
    # Each bar spans two consecutive bin edges and rises from 0 to the bin's density.
    fig.quad(top=heights, bottom=0, left=edges[:-1], right=edges[1:],
             fill_color="#036564", line_color="#033649")
    fig.xaxis.axis_label = 'x'
    fig.yaxis.axis_label = 'Pr(x)'
    return fig


# p1 and p2 stay module-level names: a later cell overlays the KDE curve on p1.
p1 = histogram_figure(df['norm_0_1_1000'], title='norm_0_1_1000')
p2 = histogram_figure(df['norm_1_1_1000'], title='norm_1_1_1000')

# gridplot takes its plots as one flat list when ncols is used.
show(gridplot([p1, p2], ncols=2, plot_width=400))

## Now, we *are* going to include statistical tests, but we're going to try to make lots of simplifying assumptions since we are oriented around ease of use.
### First, a sanity check:

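The two-sample Kolmogorov–Smirnov test should not reject the null hypothesis (that both samples come from the same distribution) when comparing our $N(0,1)$ samples with each other, and it *should* reject the null when comparing an $N(0,1)$ sample with our $N(1,1)$ and $N(10,1)$ samples.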

In [7]:
# Pairs of samples to compare with the two-sample KS test, in print order.
# The random sub-samples (sizes 200, 100, 50) are drawn with np.random.choice
# in the same order as before.
baseline = df['norm_0_1_1000']
ks_pairs = [
    (baseline, df['norm_0_1_1000_2']),
    (np.random.choice(baseline, size=200), df['norm_0_1_100']),
    (np.random.choice(baseline, size=100), df['norm_0_1_100']),
    (np.random.choice(baseline, size=50), df['norm_0_1_100']),
    (baseline, df['norm_0_1_100']),
    (baseline, df['norm_1_1_1000']),
    (baseline, df['norm_10_1_1000']),
]
for first_sample, second_sample in ks_pairs:
    print(stats.ks_2samp(first_sample, second_sample))

Ks_2sampResult(statistic=0.032000000000000028, pvalue=0.67851038238288908)
Ks_2sampResult(statistic=0.066000000000000003, pvalue=0.44945135080461551)
Ks_2sampResult(statistic=0.17899999999999999, pvalue=0.0050199491343059996)
Ks_2sampResult(statistic=0.16800000000000004, pvalue=0.12220813966505226)
Ks_2sampResult(statistic=0.052000000000000018, pvalue=0.12984378016123618)
Ks_2sampResult(statistic=0.38200000000000001, pvalue=1.6480810459816335e-64)
Ks_2sampResult(statistic=1.0, pvalue=0.0)


### Next, let's make a simple nonparametric model of our data

Our basic plan: build a kernel density estimate, evaluate it, and compare with new samples.

We will use all defaults: the Gaussian kernel and Scott's rule for the bandwidth.

In [8]:
# Fit a Gaussian kernel density estimate to the first N(0, 1) sample using the
# library defaults. gaussian_kde is reached through its public location,
# scipy.stats, because the scipy.stats.kde submodule path is deprecated.
kde = stats.gaussian_kde(df['norm_0_1_1000'])

## Now, let's inspect the estimate.

In [9]:
# Evaluation grid: from the smallest to the largest observed value, stepping by
# the KDE's covariance factor.
observed = df['norm_0_1_1000']
x = np.arange(np.min(observed), np.max(observed), kde.covariance_factor())
Y = kde.evaluate(x)

# Draw the estimated density on top of the histogram figure and display it again.
# NOTE(review): this adds to the existing p1 figure, so re-running the cell
# presumably stacks another "KDE" line on it — confirm.
p1.line(x, Y, line_width=2, line_alpha=0.8, legend="KDE")
p1.legend.location = "center_right"
show(p1)

In [10]:
# Serialise the fitted KDE: pickle it, then base64-encode to ASCII text so it
# can be passed (and stored) as a plain string.
# NOTE(review): presumably the expectation unpickles this string; pickle is only
# safe with trusted input — confirm where stored configs come from.
kde_as_text = base64.b64encode(pickle.dumps(kde)).decode('ascii')
df.expect_column_numerical_distribution_to_be_kde('norm_0_1_1000_2', kde_as_text)

{'exception_list': 0.61113808598482944, 'success': True}

In [11]:
# Same KDE expectation, applied to the N(1, 1) column. The KDE is pickled and
# base64-encoded to ASCII text so it can be passed as a plain string.
shifted_check_kde_text = base64.b64encode(pickle.dumps(kde)).decode('ascii')
df.expect_column_numerical_distribution_to_be_kde('norm_1_1_1000', shifted_check_kde_text)

{'exception_list': 0.0, 'success': False}

In [12]:
# Reference sample: a random draw (np.random.choice defaults) of one tenth of
# the first N(0, 1) column, converted to a plain Python list.
source_column = df['norm_0_1_1000']
reference_sample = np.random.choice(source_column, size=len(source_column) // 10).tolist()
df.expect_column_numerical_distribution_to_be('norm_0_1_1000_2', reference_sample)

{'exception_list': 0.039329134916473885, 'success': False}

In [13]:
# Compare the N(1, 1) column against a random one-tenth sub-sample of the
# first N(0, 1) column, passed as a plain Python list.
n01_column = df['norm_0_1_1000']
n01_subsample = np.random.choice(n01_column, size=len(n01_column) // 10).tolist()
df.expect_column_numerical_distribution_to_be('norm_1_1_1000', n01_subsample)

{'exception_list': 2.9047132809905824e-09, 'success': False}

In [14]:
# Use every value of the first N(0, 1) column as the reference sample.
full_reference = df['norm_0_1_1000'].tolist()
df.expect_column_numerical_distribution_to_be('norm_0_1_100', full_reference)

{'exception_list': 0.12984378016123618, 'success': True}

In [15]:
# Fetch the expectations config from the dataset and pretty-print it as JSON.
expectations_config = df.get_expectations_config()
print(json.dumps(expectations_config, indent=2))

{
  "dataset_name": null,
  "expectations": [
    {
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "norm_0_1_1000"
      }
    },
    {
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "norm_0_1_1000_2"
      }
    },
    {
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "norm_0_1_100"
      }
    },
    {
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "norm_1_1_1000"
      }
    },
    {
      "expectation_type": "expect_column_to_exist",
      "kwargs": {
        "column": "norm_10_1_1000"
      }
    },
    {
      "expectation_type": "expect_column_mean_to_be_between",
      "kwargs": {
        "min_value": -0.1,
        "max_value": 0.1,
        "column": "norm_0_1_1000"
      }
    },
    {
      "expectation_type": "expect_column_numerical_distribution_to_be_kde",
      "kwargs": {
        "kde": "gANjc2NpcHkuc3RhdHMua2R