In [1]:
import great_expectations as ge

In [2]:
# Display the installed Great Expectations version so the outputs in this
# notebook can be tied to the exact build that produced them.
ge.__version__

'0.9.0b1+236.ga5f5f293.dirty'

In [3]:
import os
import shutil

In [4]:
# Location of the throwaway Great Expectations project used by this demo.
project_dir = "./contexts/big_query_demo"

# Start from a clean slate on every run: remove whatever a previous run left
# behind (a missing directory is not an error), then recreate it empty.
shutil.rmtree(project_dir, ignore_errors=True)
os.makedirs(project_dir)

In [5]:
# Initialise a new Great Expectations project inside the directory prepared
# above. Reuse `project_dir` rather than repeating the path literal, so the
# directory that gets wiped and the directory that gets initialised can never
# drift apart.
context = ge.data_context.DataContext.create(project_dir)

In [6]:
# Set your BigQuery project here. You must have the 'bigquery.jobs.create'
# permission in that project. Note that you will still be able to read data
# that lives outside the project you specify (e.g. public datasets).
project = 'superconductive-internal'

# Credentials are not passed here; supply them separately, for example by
# pointing the GOOGLE_APPLICATION_CREDENTIALS environment variable at a
# credential json file.
bigquery_url = 'bigquery://' + project
datasource = context.add_datasource(
    "bigquery",
    class_name="SqlAlchemyDatasource",
    connection_string=bigquery_url,
)

In [7]:
# Create a new suite named "chicago_taxi.warning"; the echoed result below
# shows that it starts out with an empty list of expectations.
context.create_expectation_suite("chicago_taxi.warning")

{
  "data_asset_type": null,
  "expectations": [],
  "meta": {
    "great_expectations.__version__": "0.9.0b1+236.ga5f5f293.dirty"
  }
}

In [8]:
# Full path to the table in BigQuery's project.dataset.table format.
taxi_trips_table = "bigquery-public-data.chicago_taxi_trips.taxi_trips"

# Describe the batch to load: which configured datasource to use and which
# table to read from it.
batch_kwargs = {"datasource": "bigquery", "table": taxi_trips_table}

In [9]:
# Load the table described by `batch_kwargs` as a batch associated with the
# "chicago_taxi.warning" suite.
batch = context.get_batch(
    batch_kwargs=batch_kwargs,
    expectation_suite_name="chicago_taxi.warning",
)

In [14]:
# Notice that our Dataset exposes metadata for us: each column's name, type,
# nullability and default, as shown in the output below.
batch.columns

[{'name': 'unique_key', 'type': String(), 'nullable': False, 'default': None},
 {'name': 'taxi_id', 'type': String(), 'nullable': False, 'default': None},
 {'name': 'trip_start_timestamp',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': None},
 {'name': 'trip_end_timestamp',
  'type': TIMESTAMP(),
  'nullable': True,
  'default': None},
 {'name': 'trip_seconds',
  'type': Integer(),
  'nullable': True,
  'default': None},
 {'name': 'trip_miles', 'type': Float(), 'nullable': True, 'default': None},
 {'name': 'pickup_census_tract',
  'type': Integer(),
  'nullable': True,
  'default': None},
 {'name': 'dropoff_census_tract',
  'type': Integer(),
  'nullable': True,
  'default': None},
 {'name': 'pickup_community_area',
  'type': Integer(),
  'nullable': True,
  'default': None},
 {'name': 'dropoff_community_area',
  'type': Integer(),
  'nullable': True,
  'default': None},
 {'name': 'fare', 'type': Float(), 'nullable': True, 'default': None},
 {'name': 'tips', 'type': Float(), '

In [11]:
# With the column metadata already at hand, we can create some expectations
# extremely quickly -- without running a query/job.
batch.expect_column_to_exist(column='trip_start_timestamp')

{
  "expectation_config": null,
  "exception_info": null,
  "result": {},
  "success": true,
  "meta": {}
}

In [12]:
# Now let's run a real expectation, i.e. one that is evaluated against the
# values in the table rather than just its schema:
batch.expect_column_values_to_be_unique(column='trip_start_timestamp')

{
  "expectation_config": null,
  "exception_info": null,
  "result": {
    "element_count": 190955873,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 190955873,
    "unexpected_percent": 100.0,
    "unexpected_percent_nonmissing": 100.0,
    "partial_unexpected_list": [
      "2019-03-23T21:30:00+00:00",
      "2019-03-20T21:45:00+00:00",
      "2019-03-26T05:00:00+00:00",
      "2019-03-23T14:45:00+00:00",
      "2019-03-20T18:15:00+00:00",
      "2019-03-23T17:15:00+00:00",
      "2019-03-29T11:30:00+00:00",
      "2019-03-24T21:30:00+00:00",
      "2019-03-29T14:45:00+00:00",
      "2015-01-27T21:00:00+00:00",
      "2019-03-28T19:15:00+00:00",
      "2019-03-19T03:15:00+00:00",
      "2019-03-27T05:45:00+00:00",
      "2019-03-24T06:15:00+00:00",
      "2019-03-25T14:30:00+00:00",
      "2019-03-29T22:15:00+00:00",
      "2014-04-06T00:30:00+00:00",
      "2019-03-26T19:30:00+00:00",
      "2014-08-24T01:15:00+00:00",
      "2019-03-29T10:00:00+00:00"


In [16]:
# Surprising! NONE of the 190M trips had a unique start time??? That seems unlikely. Oh wait, the
# data has been statistically censored to fifteen-minute increments. What a cool insight!

# GE is actively expanding its vocabulary of expectations and we plan to build time-based expectations
# that are able to capture insights like these.

In [19]:
# Well, at least we should believe that the rides happen in the Chicago
# environs: every non-missing pickup latitude should fall in a band around
# the city.
batch.expect_column_values_to_be_between(
    column="pickup_latitude",
    min_value=41.5,
    max_value=42.1,
)

{
  "expectation_config": null,
  "exception_info": null,
  "result": {
    "element_count": 190955873,
    "missing_count": 22814686,
    "missing_percent": 11.947622056117646,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {}
}

In [20]:
# Same sanity check for longitude: pickups should lie in a band around Chicago.
batch.expect_column_values_to_be_between(
    column="pickup_longitude",
    min_value=-88,
    max_value=-87.5,
)

{
  "expectation_config": null,
  "exception_info": null,
  "result": {
    "element_count": 190955873,
    "missing_count": 22814686,
    "missing_percent": 11.947622056117646,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {}
}

In [None]:
# ... and so on

In [21]:
# Let's save our new suite: this persists the expectations created on `batch`
# so they can be retrieved from the context later.
batch.save_expectation_suite()

In [22]:
# Let's see what we've created: fetch the saved suite back from the context
# by name and display its contents.
context.get_expectation_suite("chicago_taxi.warning")

{
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "expectation_type": "expect_column_to_exist",
      "meta": {},
      "kwargs": {
        "column": "trip_start_timestamp"
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {},
      "kwargs": {
        "column": "pickup_latitude",
        "min_value": 41.5,
        "max_value": 42.1
      }
    },
    {
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {},
      "kwargs": {
        "column": "pickup_longitude",
        "min_value": -88,
        "max_value": -87.5
      }
    }
  ],
  "meta": {
    "great_expectations.__version__": "0.9.0b1+236.ga5f5f293.dirty"
  }
}

In [None]:
# Note that the (failed) expect_column_values_to_be_unique expectation was not saved -- by default,
# expectations that are not true when they are created are left out of the saved suite.