Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 19 additions & 6 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,26 @@ import (
csvFile "github.com/cloudquery/filetypes/v3/csv"
jsonFile "github.com/cloudquery/filetypes/v3/json"
"github.com/cloudquery/filetypes/v3/parquet"
"github.com/cloudquery/filetypes/v3/types"
)

type Client struct {
spec *FileSpec

types.FileType
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used for all interactions with the underlying writer. For read tests, separate Clients are used which are still below.


csv *csvFile.Client
json *jsonFile.Client
parquet *parquet.Client
}

var (
_ types.FileType = (*Client)(nil)
_ types.FileType = (*csvFile.Client)(nil)
_ types.FileType = (*jsonFile.Client)(nil)
_ types.FileType = (*parquet.Client)(nil)
)

// NewClient creates a new client for the given spec
func NewClient(spec *FileSpec) (*Client, error) {
err := spec.UnmarshalSpec()
Expand All @@ -40,8 +50,9 @@ func NewClient(spec *FileSpec) (*Client, error) {
return &Client{}, err
}
return &Client{
spec: spec,
csv: client,
spec: spec,
csv: client,
FileType: client,
}, nil

case FormatTypeJSON:
Expand All @@ -50,8 +61,9 @@ func NewClient(spec *FileSpec) (*Client, error) {
return &Client{}, err
}
return &Client{
spec: spec,
json: client,
spec: spec,
json: client,
FileType: client,
}, nil

case FormatTypeParquet:
Expand All @@ -60,8 +72,9 @@ func NewClient(spec *FileSpec) (*Client, error) {
return &Client{}, err
}
return &Client{
spec: spec,
parquet: client,
spec: spec,
parquet: client,
FileType: client,
}, nil

default:
Expand Down
28 changes: 23 additions & 5 deletions csv/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,29 +9,47 @@ import (
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/apache/arrow/go/v13/arrow/csv"
"github.com/apache/arrow/go/v13/arrow/memory"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/schema"
)

func (cl *Client) WriteTableBatch(w io.Writer, table *schema.Table, records []arrow.Record) error {
newSchema := convertSchema(table.ToArrowSchema())
// Handle tracks one in-progress CSV document; it implements types.Handle.
type Handle struct {
w *csv.Writer
}

var _ types.Handle = (*Handle)(nil)

// WriteHeader starts a new CSV document on w for table t and returns a
// Handle used to append rows and finalize the document. The column set
// comes from the table's Arrow schema after convertSchema, and the
// writer honors the client's configured delimiter and header settings.
func (cl *Client) WriteHeader(w io.Writer, t *schema.Table) (types.Handle, error) {
s := t.ToArrowSchema()
newSchema := convertSchema(s)
writer := csv.NewWriter(w, newSchema,
csv.WithComma(cl.Delimiter),
csv.WithHeader(cl.IncludeHeaders),
csv.WithNullWriter(""), // NULL values are written as empty strings
)

return &Handle{
w: writer,
}, nil
}

func (h *Handle) WriteContent(records []arrow.Record) error {
for _, record := range records {
castRec := castToString(record)

if err := writer.Write(castRec); err != nil {
if err := h.w.Write(castRec); err != nil {
return fmt.Errorf("failed to write record to csv: %w", err)
}
if err := writer.Flush(); err != nil {
if err := h.w.Flush(); err != nil {
return fmt.Errorf("failed to flush csv writer: %w", err)
}
}
return nil
}

// WriteFooter finalizes the CSV document by flushing any buffered rows;
// CSV needs no trailing bytes beyond the flush.
func (h *Handle) WriteFooter() error {
return h.w.Flush()
}

func convertSchema(sch *arrow.Schema) *arrow.Schema {
oldFields := sch.Fields()
fields := make([]arrow.Field, len(oldFields))
Expand Down
6 changes: 4 additions & 2 deletions csv/write_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/apache/arrow/go/v13/arrow"
"github.com/bradleyjkemp/cupaloy/v2"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/plugins/destination"
"github.com/cloudquery/plugin-sdk/v3/schema"
"github.com/google/uuid"
Expand Down Expand Up @@ -48,7 +49,7 @@ func TestWriteRead(t *testing.T) {
writer := bufio.NewWriter(&b)
reader := bufio.NewReader(&b)

if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
t.Fatal(err)
}
writer.Flush()
Expand Down Expand Up @@ -107,9 +108,10 @@ func BenchmarkWrite(b *testing.B) {
writer := bufio.NewWriter(&buf)
b.ResetTimer()
for i := 0; i < b.N; i++ {
if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
b.Fatal(err)
}

err = writer.Flush()
if err != nil {
b.Fatal(err)
Expand Down
23 changes: 20 additions & 3 deletions json/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,38 @@ import (

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/schema"
"github.com/goccy/go-json"
)

func (c *Client) WriteTableBatch(w io.Writer, _ *schema.Table, records []arrow.Record) error {
// Handle tracks one in-progress JSON document; it implements types.Handle.
type Handle struct {
w io.Writer
}

var _ types.Handle = (*Handle)(nil)

// WriteHeader starts a new JSON document. The table schema is unused:
// records are later encoded straight to w, so the Handle only wraps the
// destination writer. It never fails.
func (*Client) WriteHeader(w io.Writer, _ *schema.Table) (types.Handle, error) {
return &Handle{
w: w,
}, nil
}

// WriteFooter is a no-op: the JSON format requires no trailing bytes.
func (*Handle) WriteFooter() error {
return nil
}

func (h *Handle) WriteContent(records []arrow.Record) error {
for _, r := range records {
err := c.writeRecord(w, r)
err := writeRecord(h.w, r)
if err != nil {
return err
}
}
return nil
}

func (*Client) writeRecord(w io.Writer, record arrow.Record) error {
func writeRecord(w io.Writer, record arrow.Record) error {
arr := array.RecordToStructArray(record)
enc := json.NewEncoder(w)
enc.SetEscapeHTML(false)
Expand Down
7 changes: 4 additions & 3 deletions json/write_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/apache/arrow/go/v13/arrow"
"github.com/bradleyjkemp/cupaloy/v2"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/plugins/destination"
"github.com/cloudquery/plugin-sdk/v3/schema"
"github.com/google/uuid"
Expand All @@ -29,7 +30,7 @@ func TestWrite(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if err := cl.WriteTableBatch(&b, table, records); err != nil {
if err := types.WriteAll(cl, &b, table, records); err != nil {
t.Fatal(err)
}
t.Log(b.String())
Expand All @@ -56,7 +57,7 @@ func TestWriteRead(t *testing.T) {
writer := bufio.NewWriter(&b)
reader := bufio.NewReader(&b)

if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
t.Fatal(err)
}
writer.Flush()
Expand Down Expand Up @@ -113,7 +114,7 @@ func BenchmarkWrite(b *testing.B) {
writer := bufio.NewWriter(&buf)
b.ResetTimer()
for i := 0; i < b.N; i++ {
if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
b.Fatal(err)
}
err = writer.Flush()
Expand Down
32 changes: 27 additions & 5 deletions parquet/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,49 @@ import (
"github.com/apache/arrow/go/v13/parquet"
"github.com/apache/arrow/go/v13/parquet/compress"
"github.com/apache/arrow/go/v13/parquet/pqarrow"
ftypes "github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/schema"
"github.com/cloudquery/plugin-sdk/v3/types"
)

func (*Client) WriteTableBatch(w io.Writer, table *schema.Table, records []arrow.Record) error {
// Handle tracks one in-progress Parquet file; it implements ftypes.Handle.
type Handle struct {
w *pqarrow.FileWriter // open file writer; nilled out by WriteFooter
s *arrow.Schema // converted schema records are transformed to before writing
}

var _ ftypes.Handle = (*Handle)(nil)

func (*Client) WriteHeader(w io.Writer, t *schema.Table) (ftypes.Handle, error) {
props := parquet.NewWriterProperties(
parquet.WithMaxRowGroupLength(128*1024*1024), // 128M
parquet.WithCompression(compress.Codecs.Snappy),
)
arrprops := pqarrow.DefaultWriterProps()
newSchema := convertSchema(table.ToArrowSchema())
newSchema := convertSchema(t.ToArrowSchema())
fw, err := pqarrow.NewFileWriter(newSchema, w, props, arrprops)
if err != nil {
return err
return nil, err
}

return &Handle{
w: fw,
s: newSchema,
}, nil
}

// WriteFooter finalizes the Parquet file by closing the underlying
// pqarrow writer (which emits the Parquet footer). The writer is nilled
// out so any later use of the Handle fails fast rather than writing to
// a closed file.
func (h *Handle) WriteFooter() error {
err := h.w.Close()
h.w = nil
return err
}

// WriteContent appends records to the Parquet file, transforming each
// one to the converted schema first. Must not be called after
// WriteFooter (h.w is nil by then).
func (h *Handle) WriteContent(records []arrow.Record) error {
for _, rec := range records {
if err := h.w.Write(transformRecord(h.s, rec)); err != nil {
return err
}
}
return nil
}

func convertSchema(sc *arrow.Schema) *arrow.Schema {
Expand Down
5 changes: 3 additions & 2 deletions parquet/write_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/plugins/destination"
"github.com/cloudquery/plugin-sdk/v3/schema"
)
Expand All @@ -31,7 +32,7 @@ func TestWriteRead(t *testing.T) {
if err != nil {
t.Fatal(err)
}
if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
t.Fatal(err)
}
err = writer.Flush()
Expand Down Expand Up @@ -85,7 +86,7 @@ func BenchmarkWrite(b *testing.B) {
writer := bufio.NewWriter(&buf)
b.ResetTimer()
for i := 0; i < b.N; i++ {
if err := cl.WriteTableBatch(writer, table, records); err != nil {
if err := types.WriteAll(cl, writer, table, records); err != nil {
b.Fatal(err)
}
err = writer.Flush()
Expand Down
29 changes: 29 additions & 0 deletions types/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package types

import (
"io"

"github.com/apache/arrow/go/v13/arrow"
"github.com/cloudquery/plugin-sdk/v3/schema"
)

// FileType is implemented by each supported file format backend.
// WriteHeader begins a new document on the writer for the given table
// and returns a Handle used for all subsequent writes to that document.
type FileType interface {
WriteHeader(io.Writer, *schema.Table) (Handle, error)
}

// Handle represents a document started by FileType.WriteHeader.
// WriteContent appends records; WriteFooter finalizes the document and
// should be called once, after all content has been written.
type Handle interface {
WriteContent([]arrow.Record) error
WriteFooter() error
}

// WriteAll writes records to w as one complete document in f's format:
// header, then all records, then footer.
func WriteAll(f FileType, w io.Writer, t *schema.Table, records []arrow.Record) error {
	h, err := f.WriteHeader(w, t)
	if err != nil {
		return err
	}
	if err := h.WriteContent(records); err != nil {
		// Best-effort footer so implementations holding resources
		// (e.g. parquet's open file writer) can release them; the
		// content error is the one worth reporting.
		_ = h.WriteFooter()
		return err
	}

	return h.WriteFooter()
}
19 changes: 2 additions & 17 deletions write.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,10 @@ import (
"io"

"github.com/apache/arrow/go/v13/arrow"
"github.com/cloudquery/filetypes/v3/types"
"github.com/cloudquery/plugin-sdk/v3/schema"
)

func (cl *Client) WriteTableBatchFile(w io.Writer, table *schema.Table, records []arrow.Record) error {
switch cl.spec.Format {
case FormatTypeCSV:
if err := cl.csv.WriteTableBatch(w, table, records); err != nil {
return err
}
case FormatTypeJSON:
if err := cl.json.WriteTableBatch(w, table, records); err != nil {
return err
}
case FormatTypeParquet:
if err := cl.parquet.WriteTableBatch(w, table, records); err != nil {
return err
}
default:
panic("unknown format " + cl.spec.Format)
}
return nil
return types.WriteAll(cl.FileType, w, table, records)
}