In [None]:
import pandas as pd
import json
import great_expectations as ge

In [None]:
# Load the CSV at this URL through great_expectations' read_csv wrapper
# (presumably the Kaggle Titanic training set, judging by the URL and the columns used below).
my_df = ge.read_csv("http://bit.ly/kaggletrain")

In [None]:
# Display the loaded dataset as the cell output.
my_df

In [None]:
# Alternative: load with plain pandas first, then wrap the frame with great_expectations.
# df = pd.read_csv("http://bit.ly/kaggletrain")
# my_df = ge.from_pandas(df)

In [None]:
# Preview the first rows of the dataset.
my_df.head()

In [None]:
# Count how many rows fall under each distinct Pclass value.
my_df.Pclass.value_counts()

In [None]:
# Boolean-mask filter: keep rows whose Age equals 35, then preview the first matches.
my_df[my_df.Age==35].head()

In [None]:
# Built-in expectation: the distinct values found in "Sex" should all belong to {"male", "female"}.
my_df.expect_column_distinct_values_to_be_in_set("Sex", ["male", "female"])

In [None]:
# Built-in expectation: "Pclass" values should be 1 or 3; mostly=.85 relaxes the check
# so that it passes when at least that fraction of values match (see the great_expectations docs for `mostly`).
my_df.expect_column_values_to_be_in_set("Pclass", [1,3], mostly=.85)

In [None]:
from great_expectations.dataset import PandasDataset, MetaPandasDataset

class TitanicDataset(PandasDataset):
    """
    PandasDataset subclass that adds two custom row-level (column-map) expectations:
    a non-empty-string check and a numeric closed-range check.
    """

    _data_asset_type = "TitanicDataset"

    @MetaPandasDataset.column_map_expectation
    def expect_column_string_not_null_or_empty(self, column):
        """
        Custom expectation that checks a string column's value for each row is not NULL and not an empty string.

        Args:
            column: the column values handed in by the column_map_expectation decorator.

        Returns:
            A boolean mapping with True for values that are strings containing at least
            one non-whitespace character, and False for everything else (NULL/NaN,
            non-string values, empty or whitespace-only strings).
        """
        # isinstance + `and` short-circuits, so .strip() is only ever called on real strings.
        # The previous `pd.notnull(x) & (x.strip() != "")` used the non-short-circuiting `&`
        # and therefore raised AttributeError on NaN/None or any other non-string value.
        return column.map(lambda x: isinstance(x, str) and x.strip() != "")

    @MetaPandasDataset.column_map_expectation
    def expect_column_in_range(self, column, range_min, range_max):
        """
        Custom expectation that checks that the column's value for each row is in range [range_min, range_max]

        Args:
            column: the column values handed in by the column_map_expectation decorator.
            range_min: inclusive lower bound.
            range_max: inclusive upper bound.

        Returns:
            A boolean mapping with True where float(value) lies within the closed range.
            Values that cannot be converted to float are reported as out of range
            instead of aborting the whole expectation.
        """
        def _in_range(value):
            try:
                return range_min <= float(value) <= range_max
            except (TypeError, ValueError):
                # Not convertible to a number -> treat as an unexpected value.
                return False

        return column.map(_in_range)
    


In [None]:
# Re-wrap the data as a TitanicDataset so the custom expectations defined above are available.
# pd.DataFrame(..., copy=True) builds a plain, independent DataFrame from the current dataset;
# passing a DataFrame to DataFrame.from_records is deprecated in recent pandas releases.
# NOTE(review): my_df is rebound to a new dataset object here; expectations recorded on the
# previous object are presumably not carried over -- confirm if that matters.
my_df = ge.from_pandas(pd.DataFrame(my_df, copy=True), dataset_class=TitanicDataset)

In [None]:
# Custom expectation from TitanicDataset: every "Name" should be a non-null, non-empty string.
my_df.expect_column_string_not_null_or_empty("Name")


In [None]:
# Built-in expectation: "PassengerId" values should not repeat.
my_df.expect_column_values_to_be_unique(column="PassengerId")

In [None]:
# Custom expectation from TitanicDataset: each "Fare" should lie within the closed range [1, 50].
my_df.expect_column_in_range("Fare", range_min=1, range_max=50)

In [None]:
# Run validate() on the dataset and show the returned result as the cell output.
my_df.validate()

In [None]:
# Show the expectation suite accumulated on the dataset.
# The commented variant passes discard_failed_expectations=False, which presumably keeps
# expectations that failed when they were run -- check the great_expectations docs for the default.
my_df.get_expectation_suite()
# my_df.get_expectation_suite(discard_failed_expectations=False)

In [None]:
# Serialise the current expectation suite to JSON on disk.
# To save validation results instead, dump my_df.validate().to_json_dict().
with open("my_expectation_file.json", "w") as my_file:
    json.dump(my_df.get_expectation_suite().to_json_dict(), my_file)