In [None]:
import pandas as pd
import great_expectations as ge

from bokeh.plotting import figure, output_notebook, show
from bokeh.layouts import gridplot
output_notebook()

In [None]:
import numpy as np
from scipy import stats

In [None]:
def generateData(size, random_state=None):
    """Build a DataFrame of synthetic normal samples plus a bimodal mixture.

    Parameters
    ----------
    size : int
        Number of rows drawn for every column.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness shared by all draws. ``None`` (the default)
        keeps the original behaviour of drawing from scipy's default random
        state. An int seeds a single fresh ``RandomState`` that is reused for
        every column, so the two N(0, 1) columns stay independent of each
        other while the whole frame is reproducible.

    Returns
    -------
    pandas.DataFrame
        Columns ``norm_0_1`` and ``norm_0_1_b`` (two independent N(0, 1)
        draws), ``norm_1_1`` (N(1, 1)), ``norm_10_1`` (N(10, 1)) and
        ``bimodal``: the first ``size // 2`` values of ``norm_0_1`` followed
        by the remaining values of ``norm_10_1``.
    """
    # One shared generator: seeding each rvs() call with the same int would
    # make 'norm_0_1' and 'norm_0_1_b' identical.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    def draw(loc):
        return stats.norm(loc=loc, scale=1).rvs(size=size, random_state=rng)

    df = pd.DataFrame()
    df['norm_0_1'] = draw(0)
    df['norm_0_1_b'] = draw(0)
    df['norm_1_1'] = draw(1)
    df['norm_10_1'] = draw(10)

    # Integer division avoids the float round-trip of int(size / 2).
    half = size // 2
    df['bimodal'] = np.concatenate((df['norm_0_1'].values[:half],
                                    df['norm_10_1'].values[half:]))
    return df

In [None]:
# Seed NumPy's global RNG before any sampling so that Restart & Run All
# reproduces the same synthetic datasets (and the same downstream results).
np.random.seed(42)

# Same generator at four sample sizes; the variable suffix is the row count.
df_1000 = generateData(1000)
df_100 = generateData(100)
df_10000 = generateData(10000)
df_1000000 = generateData(1000000)

In [None]:
# Wrap the 10,000-row synthetic frame with great_expectations (`ge.df`);
# the expect_* calls further down are made on this object.
unreasonably_clean_data = ge.df(df_10000)

## Remember that this is not a statistical test!

We are simply making expectations about the *sample* of data that we have in front of us.

In [None]:
# 'norm_0_1' is drawn with loc=0, scale=1, so its sample mean is expected to
# fall inside the (-0.1, 0.1) bounds given here.
unreasonably_clean_data.expect_column_mean_to_be_between('norm_0_1', -0.1, 0.1)

In [None]:
def density_histogram(values, title, bins=20):
    """Return a bokeh figure with a density-normalised histogram of ``values``.

    Parameters
    ----------
    values : array-like
        Numeric sample to bin.
    title : str
        Figure title.
    bins : int, optional
        Number of histogram bins passed to ``np.histogram`` (default 20).

    Returns
    -------
    bokeh figure
        One quad glyph per bin, with axes labelled 'x' and 'Pr(x)'.
    """
    fig = figure(title=title)
    hist, edges = np.histogram(values, density=True, bins=bins)
    # Each bar spans [edges[i], edges[i + 1]] and rises from 0 to the density.
    fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
             fill_color="#036564", line_color="#033649")
    fig.xaxis.axis_label = 'x'
    fig.yaxis.axis_label = 'Pr(x)'
    return fig


# One histogram per column of interest; the column name doubles as the title.
p1, p2, p3 = [density_histogram(unreasonably_clean_data[column], column)
              for column in ('norm_0_1', 'norm_1_1', 'bimodal')]

# Pass the plots as a single flat list (laid out 2 per row via ncols).
show(gridplot([p1, p2, p3], ncols=2, plot_width=400))

## Now, we *are* going to include statistical tests, but we're going to try to make lots of simplifying assumptions since we are oriented around ease of use.
### First, a sanity check:

The two-sample Kolmogorov–Smirnov test should not reject the null hypothesis of a common distribution for our two $N(0,1)$ samples, and it *should* reject the null when the $N(0,1)$ sample is compared with our $N(1,1)$ and $N(10,1)$ samples.

In [None]:
#print(stats.ks_2samp(df['norm_0_1'], df['norm_0_1_b']))
#print(stats.ks_2samp(np.random.choice(df['norm_0_1'], size=200), df['norm_0_1']))
#print(stats.ks_2samp(np.random.choice(df['norm_0_1'], size=100), df['norm_0_1']))
#print(stats.ks_2samp(np.random.choice(df['norm_0_1'], size=50), df['norm_0_1']))
#print(stats.ks_2samp(df['norm_0_1'], df['norm_0_1']))
#print(stats.ks_2samp(df['norm_0_1'], df['norm_1_1']))
#print(stats.ks_2samp(df['norm_0_1'], df['norm_10_1']))

### Next, let's make a simple nonparametric model of our data

Our basic plan: build a kernel density estimate, evaluate it, and compare with new samples.

We will use all defaults: the Gaussian kernel and Scott's rule for bandwidth.

In [None]:
#kde = stats.kde.gaussian_kde(df['norm_0_1_1000'])

In [None]:
# Model the 'bimodal' column with ge.util.kde_compress_data. The two return
# values are used below as the x-coordinates (partition) and the estimated
# CDF values at those points (cdf_vals).
partition, cdf_vals = ge.util.kde_compress_data(unreasonably_clean_data['bimodal'])

## Now, let's inspect the estimate.

In [None]:
#x = np.arange(start=np.min(df['norm_0_1_1000']), stop= np.max(df['norm_0_1_1000']), step=kde.covariance_factor())
#Y = kde.evaluate(x)
#p1.line(x, Y, line_width = 2, line_alpha=0.8, legend="KDE")
#p1.legend.location = "center_right"
#show(p1)

In [None]:
# Generate an empirical cdf for the given data
def empirical_cdf(partition, data):
    """Evaluate the empirical CDF of ``data`` at each point of ``partition``.

    Parameters
    ----------
    partition : sequence of numbers
        Points at which to evaluate the CDF.
    data : array-like of numbers
        One-dimensional sample (list, ndarray or pandas Series).

    Returns
    -------
    list of float
        For each ``x`` in ``partition``, the fraction of ``data`` values
        strictly less than ``x``.
    """
    ordered = np.sort(np.asarray(data))
    # side='left' returns, for each x, the number of values strictly below x,
    # which replaces a full scan of the data per partition point.
    below = np.searchsorted(ordered, partition, side='left')
    # Explicit float divisor so the ratio never truncates to an integer.
    return (below / float(len(ordered))).tolist()

In [None]:
# Overlay three curves on one figure: the KDE-estimated CDF, the empirical CDF
# of the full 'bimodal' column, and the empirical CDF of a random subsample
# (drawn without replacement, len(partition) points).
bimodal = unreasonably_clean_data['bimodal']
subsample = np.random.choice(bimodal, size=len(partition), replace=False)

cdf = figure(title="cdf")
for y_values, extra_style in (
    (cdf_vals, {'legend': "Estimated CDF"}),
    (empirical_cdf(partition, bimodal), {'color': 'red', 'legend': "Empirical CDF"}),
    (empirical_cdf(partition, subsample), {'color': 'green', 'legend': "Sampled Empirical CDF"}),
):
    cdf.line(partition, y_values, line_width=2, line_alpha=0.8, **extra_style)
cdf.legend.location = "bottom_right"
show(cdf)

In [None]:
# Check the 'bimodal' column against the (partition, cdf_vals) model that was
# built from that same column, using the expectation's default settings.
unreasonably_clean_data.expect_column_numerical_distribution_to_be('bimodal', partition, cdf_vals)

In [None]:
# Same expectation, with sample_size set to the full column length.
# NOTE(review): presumably this makes the test use every row instead of a
# smaller default subsample — confirm against the great_expectations source.
unreasonably_clean_data.expect_column_numerical_distribution_to_be('bimodal', partition, cdf_vals, sample_size=len(unreasonably_clean_data['bimodal']))

In [None]:
# Save this dataset's expectations config to 'test_config.json'. The path is
# relative, so the file is written to the kernel's working directory.
unreasonably_clean_data.save_expectations_config('test_config.json')

### Now consider categorical data

In [None]:
values = ['cat', 'dog', 'fish', 'turtle', 'none']

def build_cat_data(values, frequencies):
    """Expand parallel label/count sequences into one flat list of labels.

    Each label in ``values`` is repeated as many times as the count at the
    same position in ``frequencies``; labels keep their original order.
    Pairing stops at the shorter of the two sequences.
    """
    return [label
            for label, count in zip(values, frequencies)
            for _ in range(count)]

In [None]:
# Category probabilities, in the same order as `values`. Each vector must sum
# to 1. The second vector previously ended in 0.18 and summed to 1.01; scipy
# recomputes the last entry as 1 - sum(of the others), so 0.17 is the value
# that was effectively in use.
cat_probs = [0.1, 0.3, 0.2, 0.18, 0.22]
diff_cat_probs = [0.2, 0.3, 0.2, 0.13, 0.17]

# Two independent 1000-trial draws from the same distribution (rows 0 and 1)
# and one draw from a different distribution.
cat_vals = stats.multinomial.rvs(1000, cat_probs, size=2)
diff_cat_vals = stats.multinomial.rvs(1000, diff_cat_probs, size=1)

In [None]:
# Chi-square test with the first draw as observed counts and the second draw
# as expected counts; both rows come from the same multinomial distribution.
stats.chisquare(cat_vals[0], cat_vals[1])

In [None]:
# Expand the first draw's counts into a single 'animals' column of labels,
# then derive a categorical model from it. `vals` and `observed` are passed
# to the frequency-distribution expectations further down.
df1 = pd.DataFrame(build_cat_data(values, cat_vals[0]), columns=['animals'])
vals, observed = ge.util.categorical_model(df1['animals'])

In [None]:
# Two label columns: 'animals' is expanded from the second draw of the model's
# distribution, 'moreanimals' from the draw with different probabilities.
df = pd.DataFrame(
    {
        'animals': build_cat_data(values, cat_vals[1]),
        'moreanimals': build_cat_data(values, diff_cat_vals[0]),
    },
    columns=['animals', 'moreanimals'],
)
categorical_sample_data = ge.df(df)

In [None]:
# 'animals' was built from the second draw of the same multinomial that
# produced the model (vals, observed).
categorical_sample_data.expect_column_frequency_distribution_to_be('animals', vals, observed)

### TODO: the test statistic above should match the one from the test in cell 30 (presumably the earlier `stats.chisquare` cell)...

In [None]:
# 'moreanimals' was drawn with different category probabilities than the
# data behind the model (vals, observed).
categorical_sample_data.expect_column_frequency_distribution_to_be('moreanimals', vals, observed)

In [None]:
# Save this dataset's expectations config to 'test_cat_file.json'. The path
# is relative, so the file is written to the kernel's working directory.
categorical_sample_data.save_expectations_config('test_cat_file.json')