Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 60 additions & 15 deletions gcp_variant_transforms/testing/integration/run_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,17 +73,15 @@ def __init__(self,
test_name,
table_name,
input_pattern,
validation_query,
expected_query_result,
assertion_configs,
**kwargs):

self._name = test_name
dataset_id = context.dataset_id
self._project = context.project
self._table_name = '{}.{}'.format(dataset_id, table_name)
output_table = '{}:{}'.format(context.project, self._table_name)
self._validation_query = (" ").join(validation_query)
self._expected_query_result = expected_query_result
self._assertion_configs = assertion_configs
args = ['--input_pattern {}'.format(input_pattern),
'--output_table {}'.format(output_table),
'--project {}'.format(context.project),
Expand Down Expand Up @@ -148,28 +146,64 @@ def _handle_failure(self, response):
'No traceback. See logs for more information on error.')

def validate_table(self):
    """Runs every configured assertion query against the output table.

    Each entry of ``self._assertion_configs`` supplies a ``query`` (a list of
    string fragments) and an ``expected_result``. The query is formatted for
    this test's table and then checked through ``QueryAssertion``.
    """
    client = bigquery.Client(project=self._project)
    # TODO(yifangchen): Create macros for common queries
    query_formatter = QueryFormatter(self._table_name)
    for assertion_config in self._assertion_configs:
        query = query_formatter.format_query(assertion_config['query'])
        assertion = QueryAssertion(
            client, query, assertion_config['expected_result'])
        assertion.run_assertion()


class QueryAssertion(object):
    """Runs a query and verifies that the output matches the expected result."""

    def __init__(self, client, query, expected_result):
        """Stores the pieces needed to run one assertion.

        Args:
          client: Object whose ``query(query)`` method starts the query job.
          query: The query string to run.
          expected_result: Mapping from column name to the value expected in
            the single result row.
        """
        self._client = client
        self._query = query
        self._expected_result = expected_result

    def run_assertion(self):
        """Runs the query and compares its single row to the expectation.

        Raises:
          TestCaseFailure: If the result does not contain exactly one row, if
            the number of columns differs from the expectation, or if any
            expected column value does not match.
        """
        query_job = self._client.query(self._query)
        # NOTE(review): this expects the job to still be running right after
        # submission; a job that has already finished would trip the assertion.
        # Confirm that this is intended.
        assert query_job.state == 'RUNNING'
        iterator = query_job.result(timeout=60)
        rows = list(iterator)
        if len(rows) != 1:
            raise TestCaseFailure(
                'Expected one row in query result, got {}'.format(len(rows)))
        row = rows[0]
        if len(self._expected_result) != len(row):
            raise TestCaseFailure(
                'Expected {} columns in the query result, got {}'.format(
                    len(self._expected_result), len(row)))
        for key, expected_value in self._expected_result.items():
            actual_value = row.get(key)
            if expected_value != actual_value:
                raise TestCaseFailure(
                    'Column {} mismatch: expected {}, got {}'.format(
                        key, expected_value, actual_value))


class QueryFormatter(object):
"""Formats a query.

Replaces keyword TABLE_NAME and eventually macros in the query.
"""

def __init__(self, table_name):
Copy link
Member

@bashir2 bashir2 Feb 27, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider documenting the type (with # type:, see next comment) for anything that is not "private". Since you already use IntelliJ with PyCharm, you should get nice warnings when there are type issues (we will enforce fixing these warnings later as a presubmit check). Check processed_variants.py for examples of these and what/how to add imports needed for type checking.

# type: (str) -> None
self._table_name = table_name

def format_query(self, query):
    # type: (List[str]) -> str
    """Builds the final query text from its ``query`` fragments.

    The fragments are joined with single spaces, and then the ``{TABLE_NAME}``
    placeholder is substituted with the table name held by this formatter.
    """
    joined_query = ' '.join(query)
    return joined_query.format(TABLE_NAME=self._table_name)


class TestContextManager(object):
Expand Down Expand Up @@ -272,11 +306,22 @@ def _load_test_config(filename):

def _validate_test(test, filename):
required_keys = ['test_name', 'table_name', 'input_pattern',
'validation_query', 'expected_query_result']
'assertion_configs']
for key in required_keys:
if key not in test:
raise ValueError('Test case in {} is missing required key: {}'.format(
filename, key))
assertion_configs = test['assertion_configs']
for assertion_config in assertion_configs:
_validate_assertion_config(assertion_config)


def _validate_assertion_config(assertion_config):
required_keys = ['query', 'expected_result']
for key in required_keys:
if key not in assertion_config:
raise ValueError('Test case in {} is missing required key: {}'.format(
assertion_config, key))


def _run_test(test, context):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_0",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_0_bz2",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf.bz2",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_0_gz",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.0.vcf.gz",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 5,
"sum_start": 3607195,
"sum_end": 3607203
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_1",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.1-large.vcf",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 9882,
"sum_start": 5434957328,
"sum_end": 5435327553
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 9882,
"sum_start": 5434957328,
"sum_end": 5435327553
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_1_gz",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.1-large.vcf.gz",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 9882,
"sum_start": 5434957328,
"sum_end": 5435327553
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 9882,
"sum_start": 5434957328,
"sum_end": 5435327553
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_2",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 13,
"sum_start": 23031929,
"sum_end": 23033052
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 13,
"sum_start": 23031929,
"sum_end": 23033052
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,29 @@
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2_VEP.vcf",
"annotation_field": "CSQ",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(DISTINCT CSQ.Feature) AS num_features ",
"FROM {TABLE_NAME} AS t, t.alternate_bases as alts, alts.CSQ as CSQ ",
"WHERE start_position = 1110695 AND alts.alt = 'G'"
],
"expected_query_result": {
"num_features": 3
}
"assertion_configs": [
{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that we have the support, please add a second query to this config, just checking the aggregates like all other tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct. One of the next tasks is to design more test cases.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, but it would still have been nice to have at least one case that does multiple queries in this very PR (especially because this one lacked the simple query that every other test has). Anyway, up to you.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 11,
"sum_start": 21801693,
"sum_end": 21802814
}
},
{
"query": [
"SELECT COUNT(DISTINCT CSQ.Feature) AS num_features ",
"FROM {TABLE_NAME} AS t, t.alternate_bases as alts, alts.CSQ as CSQ ",
"WHERE start_position = 1110695 AND alts.alt = 'G'"
],
"expected_result": {
"num_features": 3
}
}
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,19 @@
"table_name": "valid_4_2_gz",
"input_pattern": "gs://gcp-variant-transforms-testfiles/small_tests/valid-4.2.vcf.gz",
"runner": "DataflowRunner",
"validation_query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_query_result": {
"num_rows": 13,
"sum_start": 23031929,
"sum_end": 23033052
}
"assertion_configs": [
{
"query": [
"SELECT COUNT(0) AS num_rows, ",
" SUM(start_position) AS sum_start, ",
" SUM(end_position) AS sum_end ",
"FROM {TABLE_NAME}"
],
"expected_result": {
"num_rows": 13,
"sum_start": 23031929,
"sum_end": 23033052
}
}
]
}