# Using great_expectations

In [None]:
import great_expectations as ge
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

### Before great_expectations

Data Source: Hospital Charge data set at data.gov

In [None]:
# Load the raw hospital charge data into a plain pandas DataFrame.
# NOTE(review): this absolute path is machine-specific; point it at your local
# copy of hospitals.csv before running.
df = pd.read_csv('/home/derek/Desktop/hospitals.csv')
# Preview ten random rows. The fixed seed keeps the preview identical across
# kernel restarts, so the displayed rows are reproducible.
df.sample(10, random_state=0)

In [None]:
# Print the column index; the printed labels are quoted, which makes any
# leading/trailing whitespace in a column name visible.
print(df.columns)

### With great_expectations

In [None]:
# Wrap the DataFrame in a great_expectations PandasDataSet; the expect_* calls
# in the following cells are made on this object.
# Note: `df` is rebound here, so it no longer refers to the plain DataFrame
# loaded earlier.
df = ge.dataset.PandasDataSet(df)

In [None]:
# Expect the dataset to contain a column named 'DRG Definition'.
df.expect_column_to_exist('DRG Definition')

In [None]:
# Expect every 'Provider State' value to have a length of exactly 2.
df.expect_column_value_lengths_to_equal(
    'Provider State',
    2
)

In [None]:
# Expect ' Total Discharges ' values to fall between 0 and 200.
# The column name is written with its leading and trailing spaces.
df.expect_column_values_to_be_between(
    ' Total Discharges ',
    0,
    200
)

In [None]:
# Expect ' Average Covered Charges ' values to be of type 'float', with the
# type name given for the 'python' target datasource.
# The column name is written with its leading and trailing spaces.
df.expect_column_values_to_be_of_type(
    ' Average Covered Charges ',
    'float',
    target_datasource='python'
)

In [None]:
# Expect the number of distinct 'Provider Id' values to be between 3000 and 4000.
df.expect_column_unique_value_count_to_be_between(
    'Provider Id',
    3000,
    4000
)

In [None]:
# Expect each street address to begin with optional digits followed by at
# least one word character.
# The trailing class is plain \w: inside [...] a `\b` would mean a literal
# backspace character, not a word boundary, so it does not belong there.
df.expect_column_values_to_match_regex(
    'Provider Street Address',
    r'^[0-9]*\w+',
    output_format="COMPLETE"
)

### Incoming Data

Suppose a model was trained on data that follows a known format. Before new data is passed through the model, we can check that it satisfies the same assumptions.

In [None]:
# Read the additional hospital records and wrap them in a great_expectations
# PandasDataSet in a single step.
# NOTE(review): this absolute path is machine-specific.
df2 = ge.dataset.PandasDataSet(
    pd.read_csv("/home/derek/Desktop/additional_hospitals.csv")
)

In [None]:
# Incoming data: expect every 'Provider State' value to have a length of exactly 2.
df2.expect_column_value_lengths_to_equal(
    'Provider State',
    2
)

In [None]:
# Incoming data: expect ' Total Discharges ' values to fall between 0 and 200.
# The column name is written with its leading and trailing spaces.
df2.expect_column_values_to_be_between(
    ' Total Discharges ',
    0,
    200
)

In [None]:
# Incoming data: expect ' Average Covered Charges ' values to be of type
# 'float', with the type name given for the 'python' target datasource.
# The column name is written with its leading and trailing spaces.
df2.expect_column_values_to_be_of_type(
    ' Average Covered Charges ',
    'float',
    target_datasource='python'
)

In [None]:
# Incoming data: expect each street address to begin with optional digits
# followed by at least one word character.
# The trailing class is plain \w: inside [...] a `\b` would mean a literal
# backspace character, not a word boundary, so it does not belong there.
df2.expect_column_values_to_match_regex(
    'Provider Street Address',
    r'^[0-9]*\w+',
    output_format="COMPLETE"
)