Skip to content

Commit f9c4ccb

Browse files
authored
feat(bigquery): expose ParquetOptions for loads and external tables (#4016)
Adapts ParquetOptions to behave similarly to the various other format options like CSV/BigTable/Sheets. Also, refactors the FileConfig tests to be table-test style to make it easier to test multiple configs.
1 parent a825ef4 commit f9c4ccb

File tree

4 files changed

+173
-44
lines changed

4 files changed

+173
-44
lines changed

Diff for: bigquery/external.go

+33-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ type ExternalDataConfig struct {
9090
// when reading data.
9191
MaxBadRecords int64
9292

93-
// Additional options for CSV, GoogleSheets and Bigtable formats.
93+
// Additional options for CSV, GoogleSheets, Bigtable, and Parquet formats.
9494
Options ExternalDataConfigOptions
9595

9696
// HivePartitioningOptions allows use of Hive partitioning based on the
@@ -139,6 +139,8 @@ func bqToExternalDataConfig(q *bq.ExternalDataConfiguration) (*ExternalDataConfi
139139
if err != nil {
140140
return nil, err
141141
}
142+
case q.ParquetOptions != nil:
143+
e.Options = bqToParquetOptions(q.ParquetOptions)
142144
}
143145
return e, nil
144146
}
@@ -416,6 +418,36 @@ func bqToBigtableColumn(q *bq.BigtableColumn) (*BigtableColumn, error) {
416418
return b, nil
417419
}
418420

421+
// ParquetOptions are additional options for Parquet external data sources.
//
// The same type is also carried by FileConfig.ParquetOptions, where it is
// copied into load job and external data configurations. In both places a
// nil *ParquetOptions means "send no Parquet-specific options": the
// corresponding API field is left unset.
422+
type ParquetOptions struct {
423+
// EnumAsString indicates whether to infer Parquet ENUM logical type as
424+
// STRING instead of BYTES by default.
425+
EnumAsString bool
426+
427+
// EnableListInference indicates whether to use schema inference
428+
// specifically for Parquet LIST logical type.
429+
EnableListInference bool
430+
}
431+
432+
// populateExternalDataConfig writes o into c as a newly allocated
// bq.ParquetOptions whose EnumAsString and EnableListInference fields mirror
// those of o. Any value previously held in c.ParquetOptions is replaced.
//
// A nil receiver is a no-op and leaves c untouched. c itself is dereferenced
// unconditionally when o is non-nil, so it must not be nil.
func (o *ParquetOptions) populateExternalDataConfig(c *bq.ExternalDataConfiguration) {
433+
if o != nil {
434+
c.ParquetOptions = &bq.ParquetOptions{
435+
EnumAsString: o.EnumAsString,
436+
EnableListInference: o.EnableListInference,
437+
}
438+
}
439+
}
440+
441+
// bqToParquetOptions converts the API representation q into a new
// *ParquetOptions, copying EnumAsString and EnableListInference. It returns
// nil when q is nil.
//
// NOTE(review): the nil result is a typed nil *ParquetOptions. Storing it in
// ExternalDataConfig.Options (presumably an interface type; confirm) would
// produce a non-nil interface value, so callers should test q for nil before
// calling, as bqToExternalDataConfig does with its
// `case q.ParquetOptions != nil` guard.
func bqToParquetOptions(q *bq.ParquetOptions) *ParquetOptions {
442+
if q == nil {
443+
return nil
444+
}
445+
return &ParquetOptions{
446+
EnumAsString: q.EnumAsString,
447+
EnableListInference: q.EnableListInference,
448+
}
449+
}
450+
419451
// HivePartitioningMode is used in conjunction with HivePartitioningOptions.
420452
type HivePartitioningMode string
421453

Diff for: bigquery/external_test.go

+7
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,13 @@ func TestExternalDataConfig(t *testing.T) {
8080
},
8181
},
8282
},
83+
{
84+
SourceFormat: Parquet,
85+
Options: &ParquetOptions{
86+
EnumAsString: true,
87+
EnableListInference: true,
88+
},
89+
},
8390
} {
8491
q := want.toBQ()
8592
got, err := bqToExternalDataConfig(&q)

Diff for: bigquery/file.go

+15
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ type FileConfig struct {
7474

7575
// Additional options for CSV files.
7676
CSVOptions
77+
78+
// Additional options for Parquet files.
79+
ParquetOptions *ParquetOptions
7780
}
7881

7982
func (fc *FileConfig) populateLoadConfig(conf *bq.JobConfigurationLoad) {
@@ -89,6 +92,12 @@ func (fc *FileConfig) populateLoadConfig(conf *bq.JobConfigurationLoad) {
8992
if fc.Schema != nil {
9093
conf.Schema = fc.Schema.toBQ()
9194
}
95+
if fc.ParquetOptions != nil {
96+
conf.ParquetOptions = &bq.ParquetOptions{
97+
EnumAsString: fc.ParquetOptions.EnumAsString,
98+
EnableListInference: fc.ParquetOptions.EnableListInference,
99+
}
100+
}
92101
conf.Quote = fc.quote()
93102
}
94103

@@ -122,6 +131,12 @@ func (fc *FileConfig) populateExternalDataConfig(conf *bq.ExternalDataConfigurat
122131
if format == CSV {
123132
fc.CSVOptions.populateExternalDataConfig(conf)
124133
}
134+
if fc.ParquetOptions != nil {
135+
conf.ParquetOptions = &bq.ParquetOptions{
136+
EnumAsString: fc.ParquetOptions.EnumAsString,
137+
EnableListInference: fc.ParquetOptions.EnableListInference,
138+
}
139+
}
125140
}
126141

127142
// Encoding specifies the character encoding of data to be loaded into BigQuery.

Diff for: bigquery/file_test.go

+118-43
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ package bigquery
1717
import (
1818
"testing"
1919

20-
"cloud.google.com/go/internal/pretty"
2120
"cloud.google.com/go/internal/testutil"
2221
bq "google.golang.org/api/bigquery/v2"
2322
)
@@ -45,54 +44,130 @@ var (
4544
)
4645

4746
func TestFileConfigPopulateLoadConfig(t *testing.T) {
48-
want := &bq.JobConfigurationLoad{
49-
SourceFormat: "CSV",
50-
FieldDelimiter: "\t",
51-
SkipLeadingRows: 8,
52-
AllowJaggedRows: true,
53-
AllowQuotedNewlines: true,
54-
Autodetect: true,
55-
Encoding: "UTF-8",
56-
MaxBadRecords: 7,
57-
IgnoreUnknownValues: true,
58-
Schema: &bq.TableSchema{
59-
Fields: []*bq.TableFieldSchema{
60-
bqStringFieldSchema(),
61-
bqNestedFieldSchema(),
62-
}},
63-
Quote: &hyphen,
47+
testcases := []struct {
48+
description string
49+
fileConfig *FileConfig
50+
want *bq.JobConfigurationLoad
51+
}{
52+
{
53+
description: "default json",
54+
fileConfig: &FileConfig{
55+
SourceFormat: JSON,
56+
},
57+
want: &bq.JobConfigurationLoad{
58+
SourceFormat: "NEWLINE_DELIMITED_JSON",
59+
},
60+
},
61+
{
62+
description: "csv",
63+
fileConfig: &fc,
64+
want: &bq.JobConfigurationLoad{
65+
SourceFormat: "CSV",
66+
FieldDelimiter: "\t",
67+
SkipLeadingRows: 8,
68+
AllowJaggedRows: true,
69+
AllowQuotedNewlines: true,
70+
Autodetect: true,
71+
Encoding: "UTF-8",
72+
MaxBadRecords: 7,
73+
IgnoreUnknownValues: true,
74+
Schema: &bq.TableSchema{
75+
Fields: []*bq.TableFieldSchema{
76+
bqStringFieldSchema(),
77+
bqNestedFieldSchema(),
78+
}},
79+
Quote: &hyphen,
80+
},
81+
},
82+
{
83+
description: "parquet",
84+
fileConfig: &FileConfig{
85+
SourceFormat: Parquet,
86+
ParquetOptions: &ParquetOptions{
87+
EnumAsString: true,
88+
EnableListInference: true,
89+
},
90+
},
91+
want: &bq.JobConfigurationLoad{
92+
SourceFormat: "PARQUET",
93+
ParquetOptions: &bq.ParquetOptions{
94+
EnumAsString: true,
95+
EnableListInference: true,
96+
},
97+
},
98+
},
6499
}
65-
got := &bq.JobConfigurationLoad{}
66-
fc.populateLoadConfig(got)
67-
if !testutil.Equal(got, want) {
68-
t.Errorf("got:\n%v\nwant:\n%v", pretty.Value(got), pretty.Value(want))
100+
for _, tc := range testcases {
101+
got := &bq.JobConfigurationLoad{}
102+
tc.fileConfig.populateLoadConfig(got)
103+
if diff := testutil.Diff(got, tc.want); diff != "" {
104+
t.Errorf("case %s, got=-, want=+:\n%s", tc.description, diff)
105+
}
69106
}
70107
}
71108

72109
func TestFileConfigPopulateExternalDataConfig(t *testing.T) {
73-
got := &bq.ExternalDataConfiguration{}
74-
fc.populateExternalDataConfig(got)
75-
76-
want := &bq.ExternalDataConfiguration{
77-
SourceFormat: "CSV",
78-
Autodetect: true,
79-
MaxBadRecords: 7,
80-
IgnoreUnknownValues: true,
81-
Schema: &bq.TableSchema{
82-
Fields: []*bq.TableFieldSchema{
83-
bqStringFieldSchema(),
84-
bqNestedFieldSchema(),
85-
}},
86-
CsvOptions: &bq.CsvOptions{
87-
AllowJaggedRows: true,
88-
AllowQuotedNewlines: true,
89-
Encoding: "UTF-8",
90-
FieldDelimiter: "\t",
91-
Quote: &hyphen,
92-
SkipLeadingRows: 8,
110+
testcases := []struct {
111+
description string
112+
fileConfig *FileConfig
113+
want *bq.ExternalDataConfiguration
114+
}{
115+
{
116+
description: "json defaults",
117+
fileConfig: &FileConfig{
118+
SourceFormat: JSON,
119+
},
120+
want: &bq.ExternalDataConfiguration{
121+
SourceFormat: "NEWLINE_DELIMITED_JSON",
122+
},
123+
},
124+
{
125+
description: "csv fileconfig",
126+
fileConfig: &fc,
127+
want: &bq.ExternalDataConfiguration{
128+
SourceFormat: "CSV",
129+
Autodetect: true,
130+
MaxBadRecords: 7,
131+
IgnoreUnknownValues: true,
132+
Schema: &bq.TableSchema{
133+
Fields: []*bq.TableFieldSchema{
134+
bqStringFieldSchema(),
135+
bqNestedFieldSchema(),
136+
}},
137+
CsvOptions: &bq.CsvOptions{
138+
AllowJaggedRows: true,
139+
AllowQuotedNewlines: true,
140+
Encoding: "UTF-8",
141+
FieldDelimiter: "\t",
142+
Quote: &hyphen,
143+
SkipLeadingRows: 8,
144+
},
145+
},
146+
},
147+
{
148+
description: "parquet",
149+
fileConfig: &FileConfig{
150+
SourceFormat: Parquet,
151+
ParquetOptions: &ParquetOptions{
152+
EnumAsString: true,
153+
EnableListInference: true,
154+
},
155+
},
156+
want: &bq.ExternalDataConfiguration{
157+
SourceFormat: "PARQUET",
158+
ParquetOptions: &bq.ParquetOptions{
159+
EnumAsString: true,
160+
EnableListInference: true,
161+
},
162+
},
93163
},
94164
}
95-
if diff := testutil.Diff(got, want); diff != "" {
96-
t.Errorf("got=-, want=+:\n%s", diff)
165+
for _, tc := range testcases {
166+
got := &bq.ExternalDataConfiguration{}
167+
tc.fileConfig.populateExternalDataConfig(got)
168+
if diff := testutil.Diff(got, tc.want); diff != "" {
169+
t.Errorf("case %s, got=-, want=+:\n%s", tc.description, diff)
170+
}
97171
}
172+
98173
}

0 commit comments

Comments
 (0)