In [1]:
import pandas as pd
import great_expectations as ge

In [2]:
# Path to the training CSV, relative to the notebook's working directory.
train_path = "training.1600000.processed.noemoticon.csv"

# The file has no header row and is read as Latin-1. Malformed rows are dropped
# with a warning instead of aborting the load. `on_bad_lines="warn"` (pandas >= 1.3)
# is the replacement for `error_bad_lines=False`, which was deprecated in pandas 1.3
# and removed in pandas 2.0.
df = pd.read_csv(
    train_path,
    sep=",",
    encoding="latin-1",
    header=None,
    on_bad_lines="warn",
)

# The data has no column titles, so assign our own.
df.columns = ["target", "ids", "date", "flag", "user", "text"]

# Wrap the frame as a Great Expectations dataset so the expect_* methods are available.
df = ge.dataset.PandasDataset(df)

In [3]:
# Schema check: the table must expose exactly these columns, in this order.
expected_columns = ["target", "ids", "date", "flag", "user", "text"]
df.expect_table_columns_to_match_ordered_list(column_list=expected_columns)

{'success': True,
 'result': {'observed_value': ['target',
   'ids',
   'date',
   'flag',
   'user',
   'text']}}

In [4]:
# Completeness check: register a not-null expectation on every column.
# Only the final result (for "text") is rendered as the cell output.
not_null_results = [
    df.expect_column_values_to_not_be_null(column=column_name)
    for column_name in ("target", "ids", "date", "flag", "user", "text")
]
not_null_results[-1]

{'success': True,
 'result': {'element_count': 1600000,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 0,
  'unexpected_percent': 0.0,
  'partial_unexpected_list': []}}

In [5]:
# Uniqueness check: no value in the "ids" column may occur more than once.
ids_uniqueness = df.expect_column_values_to_be_unique(column="ids")
ids_uniqueness

{'success': False,
 'result': {'element_count': 1600000,
  'missing_count': 0,
  'missing_percent': 0.0,
  'unexpected_count': 3370,
  'unexpected_percent': 0.00210625,
  'unexpected_percent_nonmissing': 0.00210625,
  'partial_unexpected_list': [1467863684,
   1467880442,
   1468053611,
   1468100580,
   1468115720,
   1468131748,
   1468161883,
   1468224250,
   1468310350,
   1468338634,
   1468363676,
   1468502040,
   1468503801,
   1468544973,
   1468586841,
   1468639063,
   1468652839,
   1468714181,
   1468758512,
   1468833927]}}

In [6]:
# Type check: each listed column must hold values of the stated type.
# Only the final result (for "target") is rendered as the cell output.
expected_types = [("user", "str"), ("text", "str"), ("target", "int64")]
type_results = [
    df.expect_column_values_to_be_of_type(column=column_name, type_=type_name)
    for column_name, type_name in expected_types
]
type_results[-1]
{'success': True, 'result': {'observed_value': 'int64'}}

In [7]:
# Collect every expectation registered on the dataset so far, keeping the ones
# that failed, then re-run the whole suite and report only the failures.
expectation_suite = df.get_expectation_suite(discard_failed_expectations=False)
validation_report = df.validate(
    expectation_suite=expectation_suite,
    only_return_failures=True,
)
validation_report

{'results': [{'success': False,
   'result': {'element_count': 1600000,
    'missing_count': 0,
    'missing_percent': 0.0,
    'unexpected_count': 3370,
    'unexpected_percent': 0.00210625,
    'unexpected_percent_nonmissing': 0.00210625,
    'partial_unexpected_list': [1467863684,
     1467880442,
     1468053611,
     1468100580,
     1468115720,
     1468131748,
     1468161883,
     1468224250,
     1468310350,
     1468338634,
     1468363676,
     1468502040,
     1468503801,
     1468544973,
     1468586841,
     1468639063,
     1468652839,
     1468714181,
     1468758512,
     1468833927]},
   'exception_info': {'raised_exception': False,
    'exception_message': None,
    'exception_traceback': None},
   'expectation_config': {'expectation_type': 'expect_column_values_to_be_unique',
    'kwargs': {'column': 'ids'}}}],
 'success': False,
 'statistics': {'evaluated_expectations': 11,
  'successful_expectations': 10,
  'unsuccessful_expectations': 1,
  'success_percent': 90.9