add parquet data writer #403
File: dlt/common/data_writers (writers module)
@@ -1,13 +1,15 @@
 import abc

 # import jsonlines
 from dataclasses import dataclass
-from typing import Any, Dict, Sequence, IO, Type
+from typing import Any, Dict, Sequence, IO, Type, Optional, List, cast

 from dlt.common import json
 from dlt.common.typing import StrAny
 from dlt.common.schema.typing import TTableSchemaColumns
 from dlt.common.destination import TLoaderFileFormat, DestinationCapabilitiesContext

+from dlt.common.configuration import with_config, known_sections, configspec
+from dlt.common.configuration.specs import BaseConfiguration

 @dataclass
 class TFileFormatSpec:

@@ -67,6 +69,8 @@ def class_factory(file_format: TLoaderFileFormat) -> Type["DataWriter"]:
         return JsonlListPUAEncodeWriter
     elif file_format == "insert_values":
         return InsertValuesWriter
+    elif file_format == "parquet":
+        return ParquetDataWriter  # type: ignore
     else:
         raise ValueError(file_format)

@@ -173,3 +177,67 @@ def data_format(cls) -> TFileFormatSpec:
-        return TFileFormatSpec("insert_values", "insert_values", False, False, requires_destination_capabilities=True)
+        return TFileFormatSpec(
+            "insert_values",
+            "insert_values",
+            False,
+            False,
+            supports_compression=True,
+            requires_destination_capabilities=True,
+        )
+
+
+@configspec
+class ParquetDataWriterConfiguration(BaseConfiguration):
+    flavor: str = "spark"
+    version: str = "2.4"
+    data_page_size: int = 1024 * 1024
Review comment: cool! let's add a section: then "data_writer" is always mandatory and it will blend in with generic settings in buffered
+
+    __section__: str = known_sections.DATA_WRITER
+
+
+class ParquetDataWriter(DataWriter):
+
+    @with_config(spec=ParquetDataWriterConfiguration)
+    def __init__(self,
+                 f: IO[Any],
+                 caps: DestinationCapabilitiesContext = None,
+                 *,
+                 flavor: str = "spark",
+                 version: str = "2.4",
+                 data_page_size: int = 1024 * 1024
+    ) -> None:
+        super().__init__(f, caps)
+        from dlt.common.libs.pyarrow import pyarrow
+
+        self.writer: Optional[pyarrow.parquet.ParquetWriter] = None
+        self.schema: Optional[pyarrow.Schema] = None
+        self.complex_indices: Optional[List[str]] = None
+        self.parquet_flavor = flavor
+        self.parquet_version = version
+        self.parquet_data_page_size = data_page_size
+
+    def write_header(self, columns_schema: TTableSchemaColumns) -> None:
+        from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype
+
+        # build the arrow schema from the dlt columns schema
+        self.schema = pyarrow.schema(
+            [pyarrow.field(name, get_py_arrow_datatype(schema_item["data_type"]), nullable=schema_item["nullable"])
+             for name, schema_item in columns_schema.items()]
+        )
+        # find row items that are of the complex type (could be abstracted out for use in other writers?)
+        self.complex_indices = [i for i, field in columns_schema.items() if field["data_type"] == "complex"]
+        self.writer = pyarrow.parquet.ParquetWriter(
+            self._f,
+            self.schema,
+            flavor=self.parquet_flavor,
+            version=self.parquet_version,
+            data_page_size=self.parquet_data_page_size
+        )
+
+    def write_data(self, rows: Sequence[Any]) -> None:
+        super().write_data(rows)
+        from dlt.common.libs.pyarrow import pyarrow
+
+        # replace complex types with json strings before handing rows to arrow
+        for key in self.complex_indices:
+            for row in rows:
+                if key in row:
+                    row[key] = json.dumps(row[key]) if row[key] else row[key]
+
+        table = pyarrow.Table.from_pylist(rows, schema=self.schema)
+        # write the batch as a row group
+        self.writer.write_table(table)
+
+    def write_footer(self) -> None:
+        self.writer.close()
+        self.writer = None
+
+    @classmethod
+    def data_format(cls) -> TFileFormatSpec:
+        return TFileFormatSpec("parquet", "parquet", True, False, requires_destination_capabilities=True, supports_compression=False)
New file: dlt/common/libs/pyarrow.py (pyarrow import guard and type mapping)
@@ -0,0 +1,33 @@
+from dlt.common.exceptions import MissingDependencyException
+from typing import Any
+
+try:
+    import pyarrow
+    import pyarrow.parquet
+except ImportError:
+    raise MissingDependencyException("DLT parquet Helpers", ["parquet"], "DLT Helpers for parquet.")
+
+
+def get_py_arrow_datatype(column_type: str) -> Any:
+    if column_type == "text":
+        return pyarrow.string()
+    elif column_type == "double":
+        return pyarrow.float64()
+    elif column_type == "bool":
+        return pyarrow.bool_()
+    elif column_type == "timestamp":
+        return pyarrow.timestamp('ms')
+    elif column_type == "bigint":
+        return pyarrow.int64()
+    elif column_type == "binary":
+        return pyarrow.binary()
+    elif column_type == "complex":
+        return pyarrow.string()
+    elif column_type == "decimal":
+        return pyarrow.decimal128(38, 18)
+    elif column_type == "wei":
+        return pyarrow.decimal128(38, 0)
+    elif column_type == "date":
+        return pyarrow.date32()
+    else:
+        raise ValueError(column_type)
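Note: a short sketch of the mapping above in action, building an arrow schema from a dlt-style columns dict (the {"data_type": ..., "nullable": ...} shape comes from the writer code in this PR); it assumes the new module is importable as dlt.common.libs.pyarrow:

from dlt.common.libs.pyarrow import pyarrow, get_py_arrow_datatype

columns_schema = {
    "value": {"data_type": "decimal", "nullable": True},          # -> decimal128(38, 18)
    "balance_wei": {"data_type": "wei", "nullable": True},        # -> decimal128(38, 0), integer-valued
    "created_at": {"data_type": "timestamp", "nullable": False},  # -> timestamp('ms')
}

schema = pyarrow.schema([
    pyarrow.field(name, get_py_arrow_datatype(c["data_type"]), nullable=c["nullable"])
    for name, c in columns_schema.items()
])
print(schema)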
Another file was deleted in this PR (its name and contents are collapsed in this view).
File: dlt pipeline module
@@ -9,6 +9,7 @@
 from dlt.common.configuration.inject import get_orig_args, last_config
 from dlt.common.destination.reference import DestinationReference, TDestinationReferenceArg
 from dlt.common.pipeline import LoadInfo, PipelineContext, get_dlt_pipelines_dir
+from dlt.common.data_writers import TLoaderFileFormat
Review comment: do we need it here?
 from dlt.pipeline.configuration import PipelineConfiguration, ensure_correct_pipeline_kwargs
 from dlt.pipeline.pipeline import Pipeline
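Note: presumably the TLoaderFileFormat import backs a file-format argument on the public pipeline API. A hedged usage sketch; the loader_file_format argument name and the duckdb destination are assumptions, not confirmed by this diff:

import dlt

pipeline = dlt.pipeline(pipeline_name="parquet_demo", destination="duckdb")
load_info = pipeline.run(
    [{"id": 1, "payload": {"a": 1}}],
    table_name="events",
    loader_file_format="parquet",  # assumed argument enabled by the parquet writer
)
print(load_info)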
Review comment: yeah you've found the bug we could not pinpoint with @z3z1ma for quite some time 🚀 #370; obviously double rotation will not work