# Create Expectations
## Always know what to expect of your data

- [Docs](https://great-expectations.readthedocs.io/en/latest/)
- [GitHub](https://github.com/great-expectations/great_expectations)

In [None]:
import json
import os

import great_expectations as ge
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
### Not implemented yet!!!

# Proposed API: list the dataset configs known to this project.
# The method uses convention (or the "dot" config file) to locate and scan
# the directory that contains the project's expectation configs.

ge.list_dataset_configs()

# Expected output: []

In [None]:
### Not implemented yet!!!

# Proposed API: fetch the expectations config of the dataset type named 'diabetis_data'.
claims_dataset_config = ge.get_dataset_config('diabetis_data')



In [None]:
# ge.get_dataset_config does not exist yet, so we hand-build the config it
# would return: a named dataset type with no expectations recorded so far.

claims_dataset_config = dict(
    dataset_name='diabetis_data',
    meta={'great_expectations.__version__': '0.4.5'},
    expectations=[],
)
claims_dataset_config


In [None]:
# Expectations are created against a live instance of the dataset type,
# so we first need some data. Here the instance is loaded from a CSV file.
# NOTE: this absolute path is specific to one machine - adjust it locally.

file_path = '/Users/eugenemandel/Downloads/dataset_diabetes/diabetic_data.csv'

# Keyword arguments forwarded to pd.read_csv
# (optionally add dtype=object here as well).
xargs = dict(
    encoding='UTF-8',
    sep=',',
    error_bad_lines=False,
)
df = pd.read_csv(file_path, **xargs)

# Connect the dataset instance to the expectations config (dataset type);
# from here on `df` is the object the expect_* methods are called on.
df = ge.from_pandas(df, expectations_config=claims_dataset_config)

In [None]:
# Preview the first few rows of the loaded dataset.
df.head()

### Let's add some expectations to this dataset type

In [None]:
# Every value in the encounter_id column is expected to be unique.
df.expect_column_values_to_be_unique('encounter_id')

In [None]:
# Same check for patient_nbr; mostly=1.0 requires all values to satisfy it.
df.expect_column_values_to_be_unique('patient_nbr', mostly=1.0)

In [None]:
# If we need to iterate to get the arguments of an expectation right, we just call the same expect* method again -
# each call overrides the previous "version" of this expectation.

df.expect_column_values_to_be_unique('patient_nbr', mostly=0.5)

In [None]:
# NOTE(review): the 0..0 range looks like a deliberate probe whose result reveals
# the observed unique count; the next cell replaces it with a wide range - confirm.
df.expect_column_unique_value_count_to_be_between('patient_nbr', min_value=0, max_value=0)

In [None]:
# We don't really care about the number of unique patients in the input dataset itself (hence the very wide range),
# but we want to refer to the observed value when we create expectations for the output.

df.expect_column_unique_value_count_to_be_between('patient_nbr', min_value=0, max_value=1000000)

In [None]:
# First attempt: allow only the '?' value in the weight column.
df.expect_column_values_to_be_in_set('weight', ['?'])


In [None]:
# Let's iterate and capture the right set of values:
# the weight-range buckets plus the '?' value.

df.expect_column_values_to_be_in_set(
    'weight',
    [
        '[175-200)',
        '>200',
        '[25-50)',
        '[150-175)',
        '[125-150)',
        '[75-100)',
        '[50-75)',
        '[0-25)',
        '[100-125)',
        '?',
    ],
)


In [None]:
# Show the expectations config that df has accumulated so far.
df.get_expectations_config()

In [None]:
### Not implemented yet!!!

# Proposed API: this method would take the dataset type name and a config dict and save the config to a file.
# The method figures out the file path based on convention/configuration.
# NOTE(review): the name passed here is 'claims', while the config's own
# 'dataset_name' is 'diabetis_data' - confirm which name is intended.

ge.save_dataset_config('claims', df.get_expectations_config())

In [None]:
# ge.save_dataset_config does not exist yet, so we write the config to disk by hand.
# The config is fetched once and reused for both the file name and the payload,
# so the name and the saved content always come from the same snapshot.

expectations_config = df.get_expectations_config()

# The output file is named after the config's own 'dataset_name'.
config_path = '../dataset_expectations_configs/{0:s}.json'.format(expectations_config['dataset_name'])

with open(config_path, 'w') as outfile:
    json.dump(expectations_config, outfile)
    
    

In [None]:
# import json

# with open('diabetes_config_jpc_20190423.json') as json_file:  
#     e_config = json.load(json_file)

### Our pipeline outputs 2 datasets. Let's create expectations for the one that holds the number of encounters for each patient.

In [None]:
### Not implemented yet!!!

# Proposed API: fetch the expectations config of the 'encounters_per_patient' dataset type.
# It is bound to its own name so that the input dataset's claims_dataset_config
# is not overwritten by the config of a different dataset.
encounters_count_dataset_config = ge.get_dataset_config('encounters_per_patient')



In [None]:
# ge.get_dataset_config does not exist yet, so we hand-build the config it
# would return: a named dataset type with no expectations recorded so far.

encounters_count_dataset_config = dict(
    dataset_name='encounters_per_patient',
    meta={'great_expectations.__version__': '0.4.5'},
    expectations=[],
)
encounters_count_dataset_config


In [None]:
### Not implemented yet!!!

# The pipeline is not implemented yet, so there is no data sample to load.
# Instead we create an empty data asset that serves purely as a container
# for the expectations we are about to declare.

df_encounters_count = ge.data_asset.DataAsset(
    expectations_config=encounters_count_dataset_config,
    do_not_evaluate_now=True,
    overwrite_existing_expectations="Never",
)


In [None]:
# The expected row count is not a literal: "$PARAMETER" names a value taken from the
# result of another expectation (the observed unique patient_nbr count of 'diabetis_data').
# NOTE(review): the second key appears to supply a stand-in value (10) for that
# parameter while the expectation is being declared - confirm.
# `value` must be passed as a keyword argument (`value=`); `value: {...}` is not valid call syntax.
df_encounters_count.expect_table_row_count_to_equal(
    value={
        "$PARAMETER": "diabetis_data.expect_column_unique_value_count_to_be_between[patient_nbr].result[observed_value]",
        "$PARAMETER.diabetis_data.expect_column_unique_value_count_to_be_between[patient_nbr].result[observed_value]": 10,
    }
)

In [None]:
### Not implemented yet!!!

# Proposed API: this method would take the dataset type name and a config dict and save the config to a file.
# The method figures out the file path based on convention/configuration.
# The config saved here must come from df_encounters_count (the 'encounters_per_patient'
# container), not from df, which holds the expectations of the input dataset.

ge.save_dataset_config('encounters_per_patient', df_encounters_count.get_expectations_config())