
Initial read_gbq support #4

Merged
merged 46 commits on Sep 23, 2021
Changes from 19 commits
Commits (46)
94c41f6
add precommit config
ncclementi Aug 5, 2021
48becdb
add read_gbq
ncclementi Aug 5, 2021
a934259
add setup and req
ncclementi Aug 5, 2021
04bdd80
modifications suggested by bnaul
ncclementi Aug 6, 2021
ab16a32
raise error when table type is VIEW
ncclementi Aug 6, 2021
455f749
add linting github actions
ncclementi Aug 6, 2021
c417d5f
add comment on context manager related to possible upstram solution
ncclementi Aug 6, 2021
4839bbb
avoid scanning table when creating partitions
ncclementi Aug 11, 2021
774e79b
add first read_gbq test
ncclementi Aug 17, 2021
7bdd66a
add partitioning test
ncclementi Aug 17, 2021
31a1253
use pytest fixtures
ncclementi Aug 18, 2021
db4edb4
use context manager on test
ncclementi Aug 18, 2021
be1efbd
ignore bare except for now
ncclementi Aug 18, 2021
35cbdc6
remove prefix from delayed kwargs
ncclementi Aug 18, 2021
40de1ea
make dataset name random, remove annotate
ncclementi Aug 18, 2021
45e0004
better name for delayed _read_rows_arrow
ncclementi Aug 18, 2021
de93e88
implementation of HLG - wip
ncclementi Aug 19, 2021
3070ae3
Slight refactor
jrbourbeau Aug 20, 2021
b43daf6
Minor test tweaks
jrbourbeau Aug 20, 2021
50f3c6a
Update requirements.txt
ncclementi Sep 16, 2021
f8a578c
use context manager for bq client
ncclementi Sep 17, 2021
a91c73c
remove with_storage_api since it is always true
ncclementi Sep 17, 2021
548f2fb
remove partition fields option
ncclementi Sep 17, 2021
d3ffa79
add test github actions setup
ncclementi Sep 17, 2021
44096a1
add ci environments
ncclementi Sep 17, 2021
b19dca4
trigger ci
ncclementi Sep 17, 2021
982a5f5
trigger ci again
ncclementi Sep 17, 2021
4292ac3
add pytest to envs
ncclementi Sep 17, 2021
14ba56c
Only run CI on push events
jrbourbeau Sep 20, 2021
32b6686
Minor cleanup
jrbourbeau Sep 20, 2021
97b5d21
Use mamba
jrbourbeau Sep 20, 2021
e03e731
update docstrings
ncclementi Sep 21, 2021
d73b686
missing docstring
ncclementi Sep 21, 2021
3f8e397
trigger ci - testing workflow
ncclementi Sep 21, 2021
64fe0ec
use env variable for project id
ncclementi Sep 21, 2021
6f94825
add test for read with row_filter
ncclementi Sep 21, 2021
1a51981
add test for read with kwargs
ncclementi Sep 21, 2021
acb404e
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
d78c2a9
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
2b46c4f
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
5ac1358
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
216a4e7
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
46e4923
Update dask_bigquery/tests/test_core.py
ncclementi Sep 21, 2021
3204bc2
tweak on docstrings
ncclementi Sep 22, 2021
f17cfb8
add readme content
ncclementi Sep 22, 2021
d1398c2
Minor updates
jrbourbeau Sep 23, 2021
16 changes: 16 additions & 0 deletions .github/workflows/pre-commit.yml
@@ -0,0 +1,16 @@
name: Linting

on:
  push:
    branches: main
  pull_request:
    branches: main

jobs:
  checks:
    name: "pre-commit hooks"
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
      - uses: pre-commit/action@v2.0.0
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,17 @@
repos:
  - repo: https://github.com/psf/black
    rev: 20.8b1
    hooks:
      - id: black
        language_version: python3
        exclude: versioneer.py
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.8.3
    hooks:
      - id: flake8
        language_version: python3
  - repo: https://github.com/pycqa/isort
    rev: 5.8.0
    hooks:
      - id: isort
        language_version: python3
1 change: 1 addition & 0 deletions dask_bigquery/__init__.py
@@ -0,0 +1 @@
from .core import read_gbq
260 changes: 260 additions & 0 deletions dask_bigquery/core.py
@@ -0,0 +1,260 @@
from __future__ import annotations

import logging
from collections.abc import Iterable
from contextlib import contextmanager
from functools import partial

import pandas as pd
import pyarrow
from dask.base import tokenize
from dask.dataframe.core import new_dd_object
from dask.highlevelgraph import HighLevelGraph
from dask.layers import DataFrameIOLayer
from google.cloud import bigquery, bigquery_storage


@contextmanager
def bigquery_client(project_id=None, with_storage_api=False):
Member:
It appears we only ever use this context manager with with_storage_api=True. If this is the case (I could be missing something), I'd suggest we remove with_storage_api as an option and just always yield both a bigquery.Client and a bigquery_storage.BigQueryReadClient.

Contributor Author:
From what I see, every time we use the created context manager we call it with with_storage_api=True, which enables the use of the Storage API. My understanding is that we want to use this based on these two comments: dask/dask#3121 (comment) and dask/dask#3121 (comment)

Member:
That makes sense. I'm suggesting that since we never use with_storage_api=False, we remove it as an option from our custom bigquery_client context manager. We can always add it back in the future if needed, but right now it's just an unnecessary keyword argument (since we always call it with with_storage_api=True)

"""This context manager is a temporary solution until there is an
upstream solution to handle this.
See googleapis/google-cloud-python#9457
and googleapis/gapic-generator-python#575 for reference.
"""

bq_storage_client = None
bq_client = bigquery.Client(project_id)
Member:
Starting with version 2.11.0 of bigquery we can use bigquery.Client as a context manager. Given that there have been many bigquery releases since then (the latest release is 2.26.0), I think it's safe to use 2.11.0 as the minimum supported version.

Contributor Author:
From what I understand, we now have a context manager for the bigquery client but not for the bigquery_storage client. Would you suggest using at least the one we have? Something like:

bq_storage_client = None
with bigquery.Client(project_id) as bq_client:
    try:
        if with_storage_api:
            bq_storage_client = bigquery_storage.BigQueryReadClient(
                credentials=bq_client._credentials
            )
            yield bq_client, bq_storage_client
        else:
            yield bq_client

Member:
Yep, that's the right direction. Though since bq_client.close() is all that was being called in the finally block before, we can remove the try/finally blocks since bq_client.close() will be called when we exit the scope of the bq_client context manager

Contributor Author:
Since we have to include the closing of the storage client, maybe we should keep the try/finally, but in the finally call bigquery_storage.transport.channel.close().

    try:
        if with_storage_api:
            bq_storage_client = bigquery_storage.BigQueryReadClient(
                credentials=bq_client._credentials
            )
            yield bq_client, bq_storage_client
        else:
            yield bq_client
    finally:
        bq_client.close()
Member:
Do we also need to close bq_storage_client?

Contributor Author:
From what I understand, the bq storage client doesn't have a .close() method. But there is a workaround explained in this comment googleapis/gapic-generator-python#575 (comment), and there are some active discussions about this here: googleapis/gapic-generator-python#987

We can probably get away with doing bqs.transport.channel.close() as recommended here: dask/dask#3121 (comment)

Member:
Thanks for investigating. What you proposed sounds good
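
Pulling this thread together, here is a minimal sketch of the approach being discussed, assuming bigquery >= 2.11.0 (so bigquery.Client can be used as a context manager), always yielding both clients, and closing the storage client's gRPC channel via the linked workaround. The exact channel attribute is taken from the comments above and may differ across client versions (newer GAPIC releases expose it as transport.grpc_channel); this is a sketch, not the final merged code.

from contextlib import contextmanager

from google.cloud import bigquery, bigquery_storage


@contextmanager
def bigquery_client(project_id=None):
    # bigquery.Client supports the context manager protocol since 2.11.0,
    # so bq_client.close() is called automatically on exit
    with bigquery.Client(project_id) as bq_client:
        bq_storage_client = bigquery_storage.BigQueryReadClient(
            credentials=bq_client._credentials
        )
        try:
            yield bq_client, bq_storage_client
        finally:
            # the storage client has no public close() method, so shut down
            # its gRPC channel explicitly (workaround from the thread; the
            # attribute may be transport.grpc_channel on newer releases)
            bq_storage_client.transport.channel.close()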



def _stream_to_dfs(bqs_client, stream_name, schema, timeout):
"""Given a Storage API client and a stream name, yield all dataframes."""
return [
pyarrow.ipc.read_record_batch(
pyarrow.py_buffer(message.arrow_record_batch.serialized_record_batch),
schema,
).to_pandas()
for message in bqs_client.read_rows(name=stream_name, offset=0, timeout=timeout)
]


def bigquery_read_partition_field(
    make_create_read_session_request: callable,
    project_id: str,
    timeout: int,
    partition_field: str,
    row_filter: str,
) -> pd.DataFrame:
    """Read a single batch of rows via BQ Storage API, in Arrow binary format.
    Args:
        project_id: BigQuery project
        create_read_session_request: kwargs to pass to `bqs_client.create_read_session`
            as `request`
        partition_field: BigQuery field for partitions, to be used as Dask index col for
            divisions
            NOTE: Please set if specifying `row_restriction` filters in TableReadOptions.
    Adapted from
    https://github.com/googleapis/python-bigquery-storage/blob/a0fc0af5b4447ce8b50c365d4d081b9443b8490e/google/cloud/bigquery_storage_v1/reader.py.
    """
    with bigquery_client(project_id, with_storage_api=True) as (bq_client, bqs_client):
        session = bqs_client.create_read_session(
            make_create_read_session_request(row_filter=row_filter)
        )
        schema = pyarrow.ipc.read_schema(
            pyarrow.py_buffer(session.arrow_schema.serialized_schema)
        )

        shards = [
            df
            for stream in session.streams
            for df in _stream_to_dfs(bqs_client, stream.name, schema, timeout=timeout)
        ]
        # NOTE: if no rows satisfying the row_restriction, then `shards` will be empty list
        if len(shards) == 0:
            shards = [schema.empty_table().to_pandas()]
        shards = [shard.set_index(partition_field, drop=True) for shard in shards]

        return pd.concat(shards)


def bigquery_read(
    make_create_read_session_request: callable,
    project_id: str,
    timeout: int,
    stream_name: str,
) -> pd.DataFrame:
    """Read a single batch of rows via BQ Storage API, in Arrow binary format.
    Args:
        project_id: BigQuery project
        create_read_session_request: kwargs to pass to `bqs_client.create_read_session`
            as `request`
        stream_name: BigQuery Storage API Stream "name".
            NOTE: Please set if reading from Storage API without any `row_restriction`.
            https://cloud.google.com/bigquery/docs/reference/storage/rpc/google.cloud.bigquery.storage.v1beta1#stream
        NOTE: `partition_field` and `stream_name` kwargs are mutually exclusive.
    Adapted from
    https://github.com/googleapis/python-bigquery-storage/blob/a0fc0af5b4447ce8b50c365d4d081b9443b8490e/google/cloud/bigquery_storage_v1/reader.py.
    """
    with bigquery_client(project_id, with_storage_api=True) as (bq_client, bqs_client):
        session = bqs_client.create_read_session(make_create_read_session_request())
        schema = pyarrow.ipc.read_schema(
            pyarrow.py_buffer(session.arrow_schema.serialized_schema)
        )
        shards = _stream_to_dfs(bqs_client, stream_name, schema, timeout=timeout)
        # NOTE: BQ Storage API can return empty streams
        if len(shards) == 0:
            shards = [schema.empty_table().to_pandas()]

        return pd.concat(shards)


def read_gbq(
    project_id: str,
    dataset_id: str,
    table_id: str,
    partition_field: str = None,
    partitions: Iterable[str] = None,
    row_filter="",
    fields: list[str] = (),
    read_timeout: int = 3600,
):
"""Read table as dask dataframe using BigQuery Storage API via Arrow format.
If `partition_field` and `partitions` are specified, then the resulting dask dataframe
will be partitioned along the same boundaries. Otherwise, partitions will be approximately
balanced according to BigQuery stream allocation logic.
If `partition_field` is specified but not included in `fields` (either implicitly by requesting
all fields, or explicitly by inclusion in the list `fields`), then it will still be included
in the query in order to have it available for dask dataframe indexing.
Args:
project_id: BigQuery project
dataset_id: BigQuery dataset within project
table_id: BigQuery table within dataset
partition_field: to specify filters of form "WHERE {partition_field} = ..."
partitions: all values to select of `partition_field`
Member:
Specifying the index and specific values of the index in this way seems unusual to me (at least compared to other Dask I/O functions). @bnaul I'm wondering if this API will be familiar to BigQuery users, or if these options were useful for a specific use case when this prototype was initially developed?

        fields: names of the fields (columns) to select (default None to "SELECT *")
        read_timeout: # of seconds an individual read request has before timing out
    Returns:
        dask dataframe
    See https://github.com/dask/dask/issues/3121 for additional context.
    """
    if (partition_field is None) and (partitions is not None):
        raise ValueError("Specified `partitions` without `partition_field`.")

    # If `partition_field` is not part of the `fields` filter, fetch it anyway to be able
    # to set it as dask dataframe index. We want this to be able to have consistent:
    # BQ partitioning + dask divisions + pandas index values
    if (partition_field is not None) and fields and (partition_field not in fields):
        fields = (partition_field, *fields)

    # These read tasks seem to cause deadlocks (or at least long stuck workers out of touch with
    # the scheduler), particularly when mixed with other tasks that execute C code. Anecdotally
    # annotating the tasks with a higher priority seems to help (but not fully solve) the issue at
    # the expense of higher cluster memory usage.
    with bigquery_client(project_id, with_storage_api=True) as (
        bq_client,
        bqs_client,
    ):
        table_ref = bq_client.get_table(".".join((dataset_id, table_id)))
        if table_ref.table_type == "VIEW":
            raise TypeError("Table type VIEW not supported")

        # The protobuf types can't be pickled (may be able to tweak w/ copyreg), so instead use a
        # generator func.
        def make_create_read_session_request(row_filter=""):
            return bigquery_storage.types.CreateReadSessionRequest(
                max_stream_count=100,  # 0 -> use as many streams as BQ Storage will provide
                parent=f"projects/{project_id}",
Member:
Should projects be hard coded here?

Nvm, looking at the docstring for CreateReadSessionRequest, it appears the answer is "yes, projects should be hardcoded"

                read_session=bigquery_storage.types.ReadSession(
                    data_format=bigquery_storage.types.DataFormat.ARROW,
                    read_options=bigquery_storage.types.ReadSession.TableReadOptions(
                        row_restriction=row_filter,
Contributor Author:
@jrbourbeau I'm not quite sure this works if we don't use the default. Should we remove this, since we removed the partition field options? We seem to always leave it as "".
Here is some documentation to review: https://googleapis.dev/python/bigquerystorage/latest/bigquery_storage_v1beta2/types.html#google.cloud.bigquery_storage_v1beta2.types.ReadSession.TableReadOptions

Contributor:
row_filter is quite useful, as it performs the filtering server-side and can avoid a lot of extraneous IO. I don't see any reason to remove it, but it should probably be made into a more generic TableReadOptions object like the one you linked to, so that it can be used for column selection as well.

One other small note: the doc you linked is for the beta API; v1 was released since our original implementation: https://googleapis.dev/python/bigquerystorage/latest/bigquery_storage_v1/types.html#google.cloud.bigquery_storage_v1.types.ReadSession.TableReadOptions. There are a couple of other references to the beta docs throughout as well.

                        selected_fields=fields,
                    ),
                    table=table_ref.to_bqstorage(),
                ),
            )

        # Create a read session in order to detect the schema.
        # Read sessions are light weight and will be auto-deleted after 24 hours.
        session = bqs_client.create_read_session(
            make_create_read_session_request(row_filter=row_filter)
        )
Comment on lines +123 to +125

Member:
Does session have a close method, or some other cleanup method, we should call?

        schema = pyarrow.ipc.read_schema(
            pyarrow.py_buffer(session.arrow_schema.serialized_schema)
        )
        meta = schema.empty_table().to_pandas()

        label = "read-gbq-"
        output_name = label + tokenize(
            project_id,
            dataset_id,
            table_id,
            partition_field,
            partitions,
            row_filter,
            fields,
            read_timeout,
        )

        if partition_field is not None:
            if row_filter:
                raise ValueError("Cannot pass both `partition_field` and `row_filter`")

            meta = meta.set_index(partition_field, drop=True)

            if partitions is None:
                logging.info(
                    "Specified `partition_field` without `partitions`; reading full table."
                )
                partitions = [
                    p
                    for p in bq_client.list_partitions(f"{dataset_id}.{table_id}")
                    if p != "__NULL__"
                ]
                # TODO generalize to ranges (as opposed to discrete values)

            partitions = sorted(partitions)
            row_filters = [
                f'{partition_field} = "{partition_value}"'
                for partition_value in partitions
            ]
            layer = DataFrameIOLayer(
                output_name,
                meta.columns,
                row_filters,
                partial(
                    bigquery_read_partition_field,
                    make_create_read_session_request,
                    project_id,
                    read_timeout,
                    partition_field,
                ),
                label=label,
            )
            divisions = (*partitions, partitions[-1])
        else:
            layer = DataFrameIOLayer(
                output_name,
                meta.columns,
                [stream.name for stream in session.streams],
                partial(
                    bigquery_read,
                    make_create_read_session_request,
                    project_id,
                    read_timeout,
                ),
                label=label,
            )
            divisions = tuple([None] * (len(session.streams) + 1))

        graph = HighLevelGraph({output_name: layer}, {output_name: set()})
        return new_dd_object(graph, output_name, meta, divisions)
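
For orientation, a small usage sketch of the read_gbq function added in this file. The project, dataset, table, and column names below are placeholders, and the row_filter / fields values just illustrate the server-side filtering and column selection wired through TableReadOptions above.

import dask_bigquery

# One output partition per BigQuery Storage stream (or per partition value
# when partition_field/partitions are used)
ddf = dask_bigquery.read_gbq(
    project_id="my-project",    # placeholder project
    dataset_id="my_dataset",    # placeholder dataset
    table_id="my_table",        # placeholder table
    row_filter='state = "NY"',  # pushed down as TableReadOptions.row_restriction
    fields=["name", "number"],  # column projection via selected_fields
)

result = ddf.compute()  # materialize as a pandas DataFrame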
95 changes: 95 additions & 0 deletions dask_bigquery/tests/test_core.py
@@ -0,0 +1,95 @@
import random
import uuid

import pandas as pd
import pytest
from dask.dataframe.utils import assert_eq
from distributed.utils_test import cluster_fixture # noqa: F401
from distributed.utils_test import client, loop # noqa: F401
from google.cloud import bigquery

from dask_bigquery import read_gbq

# These tests are run locally and assume the user is already authenticated.
# They also assume that the user has created a project called dask-bigquery.


@pytest.fixture
def df():
    records = [
        {
            "name": random.choice(["fred", "wilma", "barney", "betty"]),
            "number": random.randint(0, 100),
            "idx": i,
        }
        for i in range(10)
    ]

    yield pd.DataFrame(records)


@pytest.fixture
def dataset(df):
"Push some data to BigQuery using pandas gbq"
project_id = "dask-bigquery"
dataset_id = uuid.uuid4().hex
table_id = "table_test"
# push data to gbq
pd.DataFrame.to_gbq(
df,
destination_table=f"{dataset_id}.{table_id}",
project_id=project_id,
chunksize=5,
if_exists="append",
)
yield (project_id, dataset_id, table_id)

    with bigquery.Client() as bq_client:
        bq_client.delete_dataset(
            dataset=f"{project_id}.{dataset_id}",
            delete_contents=True,
        )


# test simple read
def test_read_gbq(df, dataset, client):
"""Test simple read of data pushed to BigQuery using pandas-gbq"""
    project_id, dataset_id, table_id = dataset
    ddf = read_gbq(project_id=project_id, dataset_id=dataset_id, table_id=table_id)

    assert list(ddf.columns) == ["name", "number", "idx"]
    assert ddf.npartitions == 2
    assert assert_eq(ddf.set_index("idx"), df.set_index("idx"))


# test partitioned data: this test requires a copy of the public dataset
# bigquery-public-data.covid19_public_forecasts.county_14d into the
# project dask-bigquery


@pytest.mark.parametrize(
    "fields",
    ([], ["county_name"], ["county_name", "county_fips_code"]),
    ids=["no_fields", "missing_partition_field", "fields"],
)
def test_read_gbq_partitioning(fields, client):
partitions = ["Teton", "Loudoun"]
ddf = read_gbq(
project_id="dask-bigquery",
dataset_id="covid19_public_forecasts",
table_id="county_14d",
partition_field="county_name",
partitions=partitions,
fields=fields,
)

    assert len(ddf)  # check it's not empty
    loaded = set(ddf.columns) | {ddf.index.name}

    if fields:
        assert loaded == set(fields) | {"county_name"}
    else:  # all columns loaded
        assert loaded >= set(["county_name", "county_fips_code"])

    assert ddf.npartitions == len(partitions)
    assert list(ddf.divisions) == sorted(ddf.divisions)