In [None]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# re-imported automatically instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import base64
import json
import pickle
import great_expectations as ge

from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
# Configure Bokeh so that later show(...) calls render inline in this notebook.
output_notebook()

In [None]:
import numpy as np
import scipy
import sklearn

In [None]:
# Draw a 1000 x 3 matrix of standard-normal samples and label its columns.
# NOTE(review): the following cell rebinds `df` to a fresh frame, so this
# three-column frame is not used by any later cell visible in this notebook.
standard_normal = scipy.stats.norm(loc=0, scale=1)
df = pd.DataFrame(
    standard_normal.rvs(size=(1000, 3)),
    columns=['n_1', 'n_2', 'n_3'],
)

In [None]:
# Build the demo frame: five columns of 1000 unit-variance normal samples each.
#
# The draws are seeded so that every sample-level statement made later in the
# notebook (the mean expectation, the KS results, the KDE fit) gives the same
# answer on each Restart & Run All.
SEED = 42
rng = np.random.RandomState(SEED)

# (column name, mean of the normal distribution the column is drawn from)
# NOTE(review): 'norm_0_1_100' suggests 100 samples, but 1000 are drawn like
# every other column. The name is kept because the KS cell refers to it.
NORMAL_COLUMNS = [
    ('norm_0_1_1000', 0),
    ('norm_0_1_1000_2', 0),
    ('norm_0_1_100', 0),
    ('norm_1_1_1000', 1),
    ('norm_10_1_1000', 10),
]

df = pd.DataFrame()
for column_name, loc in NORMAL_COLUMNS:
    # Sharing one RandomState across the draws keeps the columns independent
    # of each other while remaining reproducible as a whole.
    df[column_name] = scipy.stats.norm(loc=loc, scale=1).rvs(size=1000, random_state=rng)

In [None]:
# Hand the sample frame to great_expectations; the returned object is the one
# the mean expectation below is declared on.
# NOTE(review): later cells call expect_* / get_expectations_config on `df`
# itself, which presumably relies on ge.df(...) converting `df` in place --
# confirm against the installed great_expectations version.
unreasonably_clean_data = ge.df(df)

## Remember that this is not a statistical test!

We are simply making expectations about the *sample* of data that we have in front of us.

In [None]:
# Expectation about the observed sample (not a hypothesis test): the mean of
# the 'norm_0_1_1000' column should lie between -0.1 and 0.1.
unreasonably_clean_data.expect_column_mean_to_be_between('norm_0_1_1000', -0.1, 0.1)

In [None]:
def plot_density_histogram(column, bins=20):
    """Build a Bokeh figure with a density-normalised histogram of ``df[column]``.

    Parameters
    ----------
    column : str
        Name of the column in the notebook-level ``df`` to plot; also used as
        the figure title.
    bins : int, optional
        Number of equal-width histogram bins (default 20).

    Returns
    -------
    The Bokeh figure, with one quad glyph per bin and labelled axes.
    """
    plot = figure(title=column)
    # density=True normalises the bar heights so the histogram integrates to 1.
    heights, bin_edges = np.histogram(df[column], density=True, bins=bins)
    plot.quad(top=heights, bottom=0, left=bin_edges[:-1], right=bin_edges[1:],
              fill_color="#036564", line_color="#033649")
    plot.xaxis.axis_label = 'x'
    plot.yaxis.axis_label = 'Pr(x)'
    return plot


# p1 is kept as a notebook-level name because the KDE cell further down draws
# the fitted density on top of it.
p1 = plot_density_histogram('norm_0_1_1000')
p2 = plot_density_histogram('norm_1_1_1000')

# Pass the plots as a list: positional plots are only accepted by old Bokeh
# releases, whereas the list form works there and in newer ones.
# NOTE(review): recent Bokeh versions spell `plot_width` as `width` -- adjust
# if the installed version rejects the keyword.
show(gridplot([p1, p2], ncols=2, plot_width=400))

## Now, we *are* going to include statistical tests, but we're going to try to make lots of simplifying assumptions since we are oriented around ease of use.
### First, a sanity check:

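The two-sample Kolmogorov–Smirnov test should not reject the null hypothesis (that both samples come from the same distribution) when we compare our $N(0,1)$ samples with each other, and it *should* reject the null when we compare an $N(0,1)$ sample with our $N(1,1)$ or $N(10,1)$ samples.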

In [None]:
# Two-sample KS test of the reference N(0,1) column against each of the other
# columns, printed in the order: second N(0,1), third N(0,1), N(1,1), N(10,1).
reference = df['norm_0_1_1000']
for other in ('norm_0_1_1000_2', 'norm_0_1_100', 'norm_1_1_1000', 'norm_10_1_1000'):
    print(scipy.stats.ks_2samp(reference, df[other]))

### Next, let's make a simple nonparametric model of our data

Our basic plan: build a kernel density estimate, evaluate it, and compare with new samples.

We will use all the defaults: a Gaussian kernel, with Scott's rule for choosing the bandwidth.

In [None]:
# Fit a Gaussian kernel density estimate to the reference N(0,1) sample using
# the library defaults (no bandwidth override is passed).
# gaussian_kde is reached through the public `scipy.stats` namespace; the
# `scipy.stats.kde` submodule path is a private, deprecated alias.
kde = scipy.stats.gaussian_kde(df['norm_0_1_1000'])

## Now, let's inspect the estimate.

In [None]:
# Evaluation grid: from the smallest to the largest observed value of the
# reference sample (upper end excluded by np.arange), spaced by the KDE's
# covariance factor.
sample = df['norm_0_1_1000']
grid_start = np.min(sample)
grid_stop = np.max(sample)
grid_step = kde.covariance_factor()
x = np.arange(grid_start, grid_stop, grid_step)

# Density estimate at every grid point.
Y = kde.evaluate(x)

# Draw the estimate on top of the histogram figure built earlier and re-show it.
# NOTE(review): newer Bokeh releases name the `legend` keyword `legend_label`.
p1.line(x, Y, line_width=2, line_alpha=0.8, legend="KDE")
p1.legend.location = "center_right"
show(p1)

In [None]:
# Serialize the fitted KDE as pickle -> base64 -> ASCII text so it can be passed
# to the expectation as a plain string, then check the second N(0,1) column
# against it.
# NOTE(review): whatever consumes this string presumably unpickles it; pickled
# payloads should only ever be loaded from trusted sources.
kde_payload = base64.b64encode(pickle.dumps(kde)).decode('ascii')
df.expect_column_numerical_distribution_to_be('norm_0_1_1000_2', kde_payload)

In [None]:
# Same check as for the second N(0,1) column, but against the N(1,1) column:
# the KDE fitted on the reference sample is serialized (pickle -> base64 ->
# ASCII text) and passed to the expectation as a plain string.
kde_payload = base64.b64encode(pickle.dumps(kde)).decode('ascii')
df.expect_column_numerical_distribution_to_be('norm_1_1_1000', kde_payload)

In [None]:
# Pretty-print the expectations recorded on `df` as indented JSON.
expectations_config = df.get_expectations_config()
print(json.dumps(expectations_config, indent=2))