Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions csv/testdata/TestWriteRead-default.csv

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions csv/testdata/TestWriteRead-with_delimiter.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions csv/testdata/TestWriteRead-with_delimiter_headers.csv

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions csv/testdata/TestWriteRead-with_headers.csv

Large diffs are not rendered by default.

9 changes: 3 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@ module github.com/cloudquery/filetypes/v3
go 1.19

require (
github.com/apache/arrow/go/v13 v13.0.0-20230531201200-cbc17a98dfd9
github.com/apache/arrow/go/v13 v13.0.0-20230601070034-e07e22c5580a
github.com/bradleyjkemp/cupaloy/v2 v2.8.0
github.com/cloudquery/plugin-sdk/v3 v3.6.7
github.com/cloudquery/plugin-sdk/v3 v3.8.1
github.com/goccy/go-json v0.10.2
github.com/google/uuid v1.3.0
github.com/stretchr/testify v1.8.4
)

require google.golang.org/genproto/googleapis/rpc v0.0.0-20230530153820-e85fd2cbaebc // indirect

replace github.com/apache/arrow/go/v13 => github.com/cloudquery/arrow/go/v13 v13.0.0-20230521112802-adef07d4bbaa

require (
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c // indirect
github.com/andybalholm/brotli v1.0.5 // indirect
Expand Down Expand Up @@ -46,6 +42,7 @@ require (
golang.org/x/text v0.9.0 // indirect
golang.org/x/tools v0.7.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
google.golang.org/grpc v1.55.0 // indirect
google.golang.org/protobuf v1.30.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
Expand Down
12 changes: 6 additions & 6 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@ github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c h1:RGWPOewvK
github.com/JohnCGriffin/overflow v0.0.0-20211019200055-46fa312c352c/go.mod h1:X0CRv0ky0k6m906ixxpzmDRLvX58TFUKS2eePweuyxk=
github.com/andybalholm/brotli v1.0.5 h1:8uQZIdzKmjc/iuPu7O2ioW48L81FgatrcpfFmiq/cCs=
github.com/andybalholm/brotli v1.0.5/go.mod h1:fO7iG3H7G2nSZ7m0zPUDn85XEX2GTukHGRSepvi9Eig=
github.com/apache/arrow/go/v13 v13.0.0-20230601070034-e07e22c5580a h1:aaBBrE3hCkzdE5OAybYjbgYMijWoqkaWkUknoppVTn4=
github.com/apache/arrow/go/v13 v13.0.0-20230601070034-e07e22c5580a/go.mod h1:/XatdE3kDIBqZKhZ7OBUHwP2jaASDFZHqF4puOWM8po=
github.com/apache/thrift v0.18.1 h1:lNhK/1nqjbwbiOPDBPFJVKxgDEGSepKuTh6OLiXW8kg=
github.com/apache/thrift v0.18.1/go.mod h1:rdQn/dCcDKEWjjylUeueum4vQEjG2v8v2PqriUnbr+I=
github.com/bradleyjkemp/cupaloy/v2 v2.8.0 h1:any4BmKE+jGIaMpnU8YgH/I2LPiLBufr6oMMlVBbn9M=
github.com/bradleyjkemp/cupaloy/v2 v2.8.0/go.mod h1:bm7JXdkRd4BHJk9HpwqAI8BoAY1lps46Enkdqw6aRX0=
github.com/cloudquery/arrow/go/v13 v13.0.0-20230521112802-adef07d4bbaa h1:6y3l+YgGqMJsx5TrxFHPjxDqZ5c3M9+r3dv+CYIRl44=
github.com/cloudquery/arrow/go/v13 v13.0.0-20230521112802-adef07d4bbaa/go.mod h1:/XatdE3kDIBqZKhZ7OBUHwP2jaASDFZHqF4puOWM8po=
github.com/cloudquery/plugin-pb-go v1.0.8 h1:wn3GXhcNItcP+6wUUZuzUFbvdL59liKBO37/izMi+FQ=
github.com/cloudquery/plugin-pb-go v1.0.8/go.mod h1:vAGA27psem7ZZNAY4a3S9TKuA/JDQWstjKcHPJX91Mc=
github.com/cloudquery/plugin-sdk/v3 v3.6.7 h1:QJqZGHs+3uN+CE3y9oBOjkchH/v5XisRnVP2X4aj9Wo=
github.com/cloudquery/plugin-sdk/v3 v3.6.7/go.mod h1:+ta6OETfGfzh6nCpCyZi5Er1rj+zvn7m2QR2wokEvA8=
github.com/cloudquery/plugin-sdk/v3 v3.8.1 h1:Rj+3zBLmscKSbG+JPLT5bzgv56oPwBHRSMGyJ1DSfyc=
github.com/cloudquery/plugin-sdk/v3 v3.8.1/go.mod h1:8PHS8cMjWPeXrurnI30dyHwViK4HJUZLA6uys+F2fXQ=
github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -100,8 +100,8 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 h1:H2TDz8ibqkAF6YGhCdN3jS9O0/s90v0rJh3X/OLHEUk=
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8=
gonum.org/v1/gonum v0.11.0 h1:f1IJhK4Km5tBJmaiJXtk/PkL4cdVX6J+tGiM187uT5E=
google.golang.org/genproto/googleapis/rpc v0.0.0-20230530153820-e85fd2cbaebc h1:XSJ8Vk1SWuNr8S18z1NZSziL0CPIXLCCMDOEFtHBOFc=
google.golang.org/genproto/googleapis/rpc v0.0.0-20230530153820-e85fd2cbaebc/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA=
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A=
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU=
google.golang.org/grpc v1.55.0 h1:3Oj82/tFSCeUrRTg/5E/7d/W5A1tj6Ky1ABAuZuv5ag=
google.golang.org/grpc v1.55.0/go.mod h1:iYEXKGkEBhg1PjZQvoYEVPTDkHo1/bjTnfwTeGONTY8=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
Expand Down
4 changes: 2 additions & 2 deletions json/testdata/TestWriteRead.jsonl

Large diffs are not rendered by default.

63 changes: 49 additions & 14 deletions parquet/read.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,34 +79,69 @@ func reverseTransformArray(dt arrow.DataType, arr arrow.Array) arrow.Array {
return reverseTransformTime32(dt.(*arrow.Time32Type), arr)
case *array.Time64:
return reverseTransformTime64(dt.(*arrow.Time64Type), arr)
case *array.Struct:
return reverseTransformStruct(dt.(*arrow.StructType), arr)

case array.ListLike:
values := reverseTransformArray(dt.(listLikeType).Elem(), arr.ListValues())
res := array.NewListData(array.NewData(
var child arrow.ArrayData
switch dt := dt.(type) {
case *arrow.MapType:
child = reverseTransformArray(dt.ValueType(), arr.ListValues()).Data()
case listLikeType:
child = reverseTransformArray(dt.Elem(), arr.ListValues()).Data()
default:
panic("unsupported list like conv to " + dt.String())
}
return array.MakeFromData(array.NewData(
dt, arr.Len(),
arr.Data().Buffers(),
[]arrow.ArrayData{values.Data()},
[]arrow.ArrayData{child},
arr.NullN(), arr.Data().Offset(),
))
return res
}

if isUnsupportedType(dt) {
return reverseTransformFromString(dt, arr)
default:
return arr
}

return arr
}

func reverseTransformFromString(dt arrow.DataType, col arrow.Array) arrow.Array {
func reverseTransformFromString(dt arrow.DataType, arr arrow.Array) arrow.Array {
builder := array.NewBuilder(memory.DefaultAllocator, dt)
for i := 0; i < col.Len(); i++ {
if col.IsNull(i) {
for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}
if err := builder.AppendValueFromString(col.ValueStr(i)); err != nil {
panic(fmt.Errorf("failed to append string %q value: %w", col.ValueStr(i), err))
if err := builder.AppendValueFromString(arr.ValueStr(i)); err != nil {
panic(fmt.Errorf("failed to append string %q value: %w", arr.ValueStr(i), err))
}
}
return builder.NewArray()
}

func reverseTransformStruct(dt *arrow.StructType, arr *array.Struct) *array.Struct {
children := make([]arrow.Array, arr.NumField())
names := make([]string, arr.NumField())
for i := range children {
children[i] = reverseTransformArray(dt.Field(i).Type, arr.Field(i))
names[i] = dt.Field(i).Name
}

// structs are sometimes read oddly when the outer struct is nullable but the inner one isn't
builder := array.NewStructBuilder(memory.DefaultAllocator, dt)

for i := 0; i < arr.Len(); i++ {
if arr.IsNull(i) {
builder.AppendNull()
continue
}

builder.Append(true)
for j, c := range children {
if err := builder.FieldBuilder(j).AppendValueFromString(c.ValueStr(i)); err != nil {
panic(err)
}
}
}

return builder.NewStructArray()
}
102 changes: 52 additions & 50 deletions parquet/write.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,58 +32,50 @@ func (*Client) WriteTableBatch(w io.Writer, table *schema.Table, records []arrow
return fw.Close()
}

func convertSchema(sch *arrow.Schema) *arrow.Schema {
oldFields := sch.Fields()
fields := make([]arrow.Field, len(oldFields))
for i := range fields {
fields[i] = oldFields[i]
fields[i].Type = transformDataType(fields[i].Type)
}

md := sch.Metadata()
newSchema := arrow.NewSchema(fields, &md)
return newSchema
func convertSchema(sc *arrow.Schema) *arrow.Schema {
md := arrow.MetadataFrom(sc.Metadata().ToMap())
return arrow.NewSchema(convertFieldTypes(sc.Fields()), &md)
}

func transformDataType(t arrow.DataType) arrow.DataType {
switch dt := t.(type) {
case *types.JSONType,
*types.MACType,
*types.InetType,
*types.UUIDType:
return arrow.BinaryTypes.String
case listLikeType:
return arrow.ListOf(transformDataType(dt.Elem()))
}

if isUnsupportedType(t) {
return arrow.BinaryTypes.String
func convertFieldTypes(fields []arrow.Field) []arrow.Field {
newFields := make([]arrow.Field, len(fields))
copy(newFields, fields)
for i := range newFields {
newFields[i].Type = transformDataType(newFields[i].Type)
}

return t
return newFields
}

func isUnsupportedType(t arrow.DataType) bool {
func transformDataType(t arrow.DataType) arrow.DataType {
switch dt := t.(type) {
case *arrow.DurationType,
*arrow.DayTimeIntervalType,
*arrow.MonthDayNanoIntervalType,
*arrow.MonthIntervalType: // unsupported in pqarrow
return true
return arrow.BinaryTypes.String

case *arrow.LargeBinaryType,
*arrow.LargeListType,
*arrow.LargeStringType: // not yet implemented in arrow
return true
return arrow.BinaryTypes.String

case *types.JSONType,
*types.MACType,
*types.InetType,
*types.UUIDType:
return arrow.BinaryTypes.String

case *arrow.StructType:
for _, f := range dt.Fields() {
if isUnsupportedType(f.Type) {
return true
}
}
return arrow.StructOf(convertFieldTypes(dt.Fields())...)

case *arrow.MapType:
return arrow.MapOf(transformDataType(dt.KeyType()), transformDataType(dt.ItemType()))

case listLikeType:
return isUnsupportedType(dt.Elem())
return arrow.ListOf(transformDataType(dt.Elem()))
default:
return t
}
return false
}

// transformRecord casts extension columns or unsupported columns to string. It does not release the original record.
Expand All @@ -96,28 +88,38 @@ func transformRecord(sc *arrow.Schema, rec arrow.Record) arrow.Record {
}

func transformArray(arr arrow.Array) arrow.Array {
switch arr := arr.(type) {
case *types.UUIDArray,
*types.InetArray,
*types.MACArray,
*types.JSONArray,
*array.Struct:
if arrow.TypeEqual(arrow.BinaryTypes.String, transformDataType(arr.DataType())) {
return transformToString(arr)
}

switch arr := arr.(type) {
case *array.Struct:
dt := arr.DataType().(*arrow.StructType)
children := make([]arrow.ArrayData, arr.NumField())
names := make([]string, arr.NumField())
for i := range children {
children[i] = transformArray(arr.Field(i)).Data()
names[i] = dt.Field(i).Name
}

return array.NewStructData(array.NewData(
transformDataType(dt), arr.Len(),
arr.Data().Buffers(),
children,
arr.NullN(), arr.Data().Offset(),
))

case array.ListLike:
values := transformArray(arr.ListValues())
return array.NewListData(array.NewData(
return array.MakeFromData(array.NewData(
transformDataType(arr.DataType()), arr.Len(),
arr.Data().Buffers(),
[]arrow.ArrayData{values.Data()},
[]arrow.ArrayData{transformArray(arr.ListValues()).Data()},
arr.NullN(), arr.Data().Offset(),
))
}

if isUnsupportedType(arr.DataType()) {
return transformToString(arr)
default:
return arr
}

return arr
}

func transformToString(arr arrow.Array) arrow.Array {
Expand Down
6 changes: 4 additions & 2 deletions parquet/write_read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"time"

"github.com/apache/arrow/go/v13/arrow"
"github.com/apache/arrow/go/v13/arrow/array"
"github.com/cloudquery/plugin-sdk/v3/plugins/destination"
"github.com/cloudquery/plugin-sdk/v3/schema"
)
Expand Down Expand Up @@ -51,8 +52,9 @@ func TestWriteRead(t *testing.T) {
}()
totalCount := 0
for got := range ch {
if diff := destination.RecordDiff(records[totalCount], got); diff != "" {
t.Fatalf("got diff: %s", diff)
curr := records[totalCount]
if !array.RecordApproxEqual(curr, got) {
t.Fatalf("got diff (record %d): %s\n", totalCount, destination.RecordDiff(records[totalCount], got))
}
totalCount++
}
Expand Down