This repository has been archived by the owner on Jul 19, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 73
/
parquet.go
66 lines (61 loc) · 1.89 KB
/
parquet.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
package main
import (
"context"
"fmt"
"os"
"strings"
"github.com/dustin/go-humanize"
"github.com/olekukonko/tablewriter"
"github.com/segmentio/parquet-go"
)
// parquetInspect opens the parquet file at path and writes a human-readable
// summary of it to the writer returned by output(ctx).
//
// The summary contains the file schema, the total number of rows and, for
// every row group, its row count, its total byte size and a table with one
// line per column chunk: path in schema, physical type, number of values,
// compressed and uncompressed sizes, compression gain relative to the
// compressed size, share of the row group size, page count and average
// compressed page size.
//
// The page count and average page size are reported as "n/a" when the file
// carries no offset index for the column chunk.
//
// An error is returned when the file cannot be opened, stat'ed or parsed as a
// parquet file.
func parquetInspect(ctx context.Context, path string) error {
	f, err := os.Open(path)
	if err != nil {
		return fmt.Errorf("opening %q: %w", path, err)
	}
	defer f.Close()

	stats, err := f.Stat()
	if err != nil {
		return fmt.Errorf("reading file info of %q: %w", path, err)
	}

	pf, err := parquet.OpenFile(f, stats.Size())
	if err != nil {
		return fmt.Errorf("opening parquet file %q: %w", path, err)
	}

	out := output(ctx)
	fmt.Fprintln(out, "schema:", pf.Schema())

	meta := pf.Metadata()
	// Write to out like every other line so the whole report ends up in the
	// same stream.
	fmt.Fprintln(out, "Num Rows:", meta.NumRows)

	// The offset indexes are a single flat slice covering every column chunk
	// of the file, ordered by row group and then by column. It is empty when
	// the file was written without a page index.
	offsetIndexes := pf.OffsetIndexes()

	for i, rg := range meta.RowGroups {
		fmt.Fprintln(out, "\t Row group:", i)
		fmt.Fprintln(out, "\t\t Row Count:", rg.NumRows)
		fmt.Fprintln(out, "\t\t Row size:", humanize.Bytes(uint64(rg.TotalByteSize)))
		fmt.Fprintln(out, "\t\t Columns:")

		table := tablewriter.NewWriter(out)
		table.SetHeader([]string{
			"Col", "Type", "NumVal", "TotalCompressedSize", "TotalUncompressedSize", "Compression", "%", "PageCount", "AvgPageSize",
		})

		for j, ds := range rg.Columns {
			pageCount, avgPageSize := "n/a", "n/a"

			// Position of this column chunk in the flat offset index slice.
			if idx := i*len(rg.Columns) + j; idx < len(offsetIndexes) {
				pages := offsetIndexes[idx].PageLocations

				var totalPageSize int64
				for _, page := range pages {
					totalPageSize += int64(page.CompressedPageSize)
				}

				// Guard the integer division: a chunk without page
				// locations would otherwise panic.
				var avg int64
				if len(pages) > 0 {
					avg = totalPageSize / int64(len(pages))
				}

				pageCount = fmt.Sprintf("%d", len(pages))
				avgPageSize = humanize.Bytes(uint64(avg))
			}

			table.Append(
				[]string{
					strings.Join(ds.MetaData.PathInSchema, "/"),
					ds.MetaData.Type.String(),
					fmt.Sprintf("%d", ds.MetaData.NumValues),
					humanize.Bytes(uint64(ds.MetaData.TotalCompressedSize)),
					humanize.Bytes(uint64(ds.MetaData.TotalUncompressedSize)),
					fmt.Sprintf("%.2f", float64(ds.MetaData.TotalUncompressedSize-ds.MetaData.TotalCompressedSize)/float64(ds.MetaData.TotalCompressedSize)*100),
					fmt.Sprintf("%.2f", float64(ds.MetaData.TotalCompressedSize)/float64(rg.TotalByteSize)*100),
					pageCount,
					avgPageSize,
				})
		}
		table.Render()
	}
	return nil
}