Skip to content

Commit

Permalink
feat: Support GCS filesystem for bytewax engine (#3774)
Browse files Browse the repository at this point in the history
* fix: Support param timeout when persisting

Signed-off-by: Hai Nguyen <quanghai.ng1512@gmail.com>

* fix: fix java integration test

Signed-off-by: Hai Nguyen <quanghai.ng1512@gmail.com>

---------

Signed-off-by: Hai Nguyen <quanghai.ng1512@gmail.com>
  • Loading branch information
sudohainguyen committed Oct 20, 2023
1 parent f05a6e7 commit fb6b807
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import pyarrow as pa
import pyarrow.parquet as pq
import s3fs
from bytewax.dataflow import Dataflow # type: ignore
from bytewax.execution import cluster_main
from bytewax.inputs import ManualInputConfig, distribute
Expand All @@ -29,8 +28,7 @@ def __init__(
self._run_dataflow()

def process_path(self, path):
fs = s3fs.S3FileSystem()
dataset = pq.ParquetDataset(path, filesystem=fs, use_legacy_dataset=False)
dataset = pq.ParquetDataset(path, use_legacy_dataset=False)
batches = []
for fragment in dataset.fragments:
for batch in fragment.to_table().to_batches():
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@
"google-cloud-datastore>=2.1.0,<3",
"google-cloud-storage>=1.34.0,<3",
"google-cloud-bigtable>=2.11.0,<3",
"gcsfs",
]

REDIS_REQUIRED = [
Expand Down Expand Up @@ -158,7 +159,7 @@
"moto",
"mypy>=0.981,<0.990",
"avro==1.10.0",
"gcsfs>=0.4.0,<=2022.01.0",
"gcsfs",
"urllib3>=1.25.4,<2",
"psutil==5.9.0",
"py>=1.11.0", # https://github.com/pytest-dev/pytest/issues/10420
Expand Down

0 comments on commit fb6b807

Please sign in to comment.