Permalink
Browse files

Parquet "meta" command (#389)

* add nullcount to PackDefinition - performance wise it's already calculated here

* add statistics parameter to Write method

* stats feature
- change build badge on readme.txt to azure pipelines
- open up internal thrift structures when required
- creating parq meta - this can only view file metadata for now, and cannot decode min/max values

* more pretty output

* remove ParquetOptions from IDataTypeHandler.Read as it's not used anywhere

* exposing ReadSingle in IDataTypeHandler

* finishing up the meta command and decoding stats

* extra comment in test
  • Loading branch information...
aloneguid committed Jan 11, 2019
1 parent c09e8bb commit 85f642b1daf90254c953b2bfdbbf1d193979bbb0
Showing with 3,131 additions and 2,533 deletions.
  1. +3 −2 README.md
  2. +72 −0 doc/parq.md
  3. +9 −0 src/Parquet.CLI/Commands/FileInputCommand.cs
  4. +119 −0 src/Parquet.CLI/Commands/MetaCommand.cs
  5. +9 −0 src/Parquet.CLI/Help.Designer.cs
  6. +3 −0 src/Parquet.CLI/Help.resx
  7. +3 −3 src/Parquet.CLI/Parquet.CLI.csproj
  8. +12 −0 src/Parquet.CLI/Program.cs
  9. +1 −1 src/Parquet.Runner/Parquet.Runner.csproj
  10. +1 −1 src/Parquet.Test/Rows/LazyColumnEnumeratorTest.cs
  11. +2 −0 src/Parquet.Test/TestBase.cs
  12. +11 −7 src/Parquet/3rdparty/Thrift/Protocol/TAbstractBase.cs
  13. +5 −4 src/Parquet/3rdparty/Thrift/Protocol/TBase.cs
  14. +31 −30 src/Parquet/3rdparty/Thrift/Protocol/TField.cs
  15. +24 −24 src/Parquet/3rdparty/Thrift/Protocol/TList.cs
  16. +31 −30 src/Parquet/3rdparty/Thrift/Protocol/TMap.cs
  17. +31 −30 src/Parquet/3rdparty/Thrift/Protocol/TMessage.cs
  18. +10 −10 src/Parquet/3rdparty/Thrift/Protocol/TMessageType.cs
  19. +101 −96 src/Parquet/3rdparty/Thrift/Protocol/TProtocol.cs
  20. +28 −28 src/Parquet/3rdparty/Thrift/Protocol/TSet.cs
  21. +17 −17 src/Parquet/3rdparty/Thrift/Protocol/TStruct.cs
  22. +19 −19 src/Parquet/3rdparty/Thrift/Protocol/TType.cs
  23. +116 −113 src/Parquet/3rdparty/Thrift/Transport/TTransport.cs
  24. +15 −13 src/Parquet/Data/BasicDataTypeHandler.cs
  25. +4 −8 src/Parquet/Data/BasicPrimitiveDataTypeHandler.cs
  26. +10 −6 src/Parquet/Data/Concrete/BooleanDataTypeHandler.cs
  27. +12 −10 src/Parquet/Data/Concrete/ByteArrayDataTypeHandler.cs
  28. +1 −1 src/Parquet/Data/Concrete/ByteDataTypeHandler.cs
  29. +2 −25 src/Parquet/Data/Concrete/DateTimeDataTypeHandler.cs
  30. +19 −11 src/Parquet/Data/Concrete/DateTimeOffsetDataTypeHandler.cs
  31. +24 −2 src/Parquet/Data/Concrete/DecimalDataTypeHandler.cs
  32. +1 −1 src/Parquet/Data/Concrete/DoubleDataTypeHandler.cs
  33. +1 −1 src/Parquet/Data/Concrete/FloatDataTypeHandler.cs
  34. +1 −1 src/Parquet/Data/Concrete/Int16DataTypeHandler.cs
  35. +1 −1 src/Parquet/Data/Concrete/Int32DataTypeHandler.cs
  36. +1 −1 src/Parquet/Data/Concrete/Int64DataTypeHandler.cs
  37. +1 −1 src/Parquet/Data/Concrete/Int96DataTypeHandler.cs
  38. +10 −2 src/Parquet/Data/Concrete/IntervalDataTypeHandler.cs
  39. +2 −7 src/Parquet/Data/Concrete/SignedByteDataTypeHandler.cs
  40. +11 −19 src/Parquet/Data/Concrete/StringDataTypeHandler.cs
  41. +2 −4 src/Parquet/Data/Concrete/UnsignedInt16DataTypeHandler.cs
  42. +3 −2 src/Parquet/Data/DataColumn.cs
  43. +11 −3 src/Parquet/Data/IDataTypeHandler.cs
  44. +6 −10 src/Parquet/Data/NonDataDataTypeHandler.cs
  45. +1 −4 src/Parquet/Extensions/ThriftExtensions.cs
  46. +2 −2 src/Parquet/File/DataColumnReader.cs
  47. +17 −4 src/Parquet/File/DataColumnWriter.cs
  48. +1 −0 src/Parquet/File/ThriftFooter.cs
  49. +21 −4 src/Parquet/ParquetExtensions.cs
  50. +4 −1 src/Parquet/ParquetReader.cs
  51. +178 −157 src/Parquet/Thrift/ColumnChunk.cs
  52. +546 −489 src/Parquet/Thrift/ColumnMetaData.cs
  53. +12 −10 src/Parquet/Thrift/CompressionCodec.cs
  54. +118 −116 src/Parquet/Thrift/ConvertedType.cs
  55. +2 −0 src/Parquet/Thrift/DataPageHeader.cs
  56. +2 −0 src/Parquet/Thrift/DataPageHeaderV2.cs
  57. +2 −0 src/Parquet/Thrift/DictionaryPageHeader.cs
  58. +57 −55 src/Parquet/Thrift/Encoding.cs
  59. +20 −18 src/Parquet/Thrift/FieldRepetitionType.cs
  60. +305 −275 src/Parquet/Thrift/FileMetaData.cs
  61. +2 −0 src/Parquet/Thrift/IndexPageHeader.cs
  62. +119 −104 src/Parquet/Thrift/KeyValue.cs
  63. +141 −125 src/Parquet/Thrift/PageEncodingStats.cs
  64. +2 −0 src/Parquet/Thrift/PageHeader.cs
  65. +9 −7 src/Parquet/Thrift/PageType.cs
  66. +220 −198 src/Parquet/Thrift/RowGroup.cs
  67. +4 −4 src/Parquet/Thrift/SchemaElement.cs
  68. +138 −122 src/Parquet/Thrift/SortingColumn.cs
  69. +226 −199 src/Parquet/Thrift/Statistics.cs
  70. +19 −17 src/Parquet/Thrift/Type.cs
  71. +146 −55 src/spark-experiments/parquet.iml
  72. +4 −21 src/spark-experiments/pom.xml
  73. +1 −1 src/spark-experiments/src/main/scala/alltestdata.sc
  74. +16 −1 src/spark-experiments/src/main/scala/com/ivan/parquet/ScalaApp.scala
@@ -9,9 +9,10 @@ Note that [Elastacloud](https://elastacloud.com/Home) provides commercial suppor
## Status

[![NuGet](https://img.shields.io/nuget/v/Parquet.Net.svg)](https://www.nuget.org/packages/Parquet.Net)
[![Build status](https://ci.appveyor.com/api/projects/status/w3o50mweytm85uxb/branch/master?svg=true)](https://ci.appveyor.com/project/aloneguid/parquet-dotnet/branch/master)

Latest unstable versions are available on [this NuGet feed](https://ci.appveyor.com/nuget/parquet-dotnet) (it will also contain packages built from your PRs).
| Core Build | Windows/Linux/Mac Tests |
|------------|----------------|
|[![Build status](https://aloneguid.visualstudio.com/Parquet.Net/_apis/build/status/Core%20Build)](https://aloneguid.visualstudio.com/Parquet.Net/_build/latest?definitionId=30)|[![Build status](https://aloneguid.visualstudio.com/Parquet.Net/_apis/build/status/Multi%20OS%20Tests)](https://aloneguid.visualstudio.com/Parquet.Net/_build/latest?definitionId=29)|

**Fully managed** .NET library to read and write [Apache Parquet](https://parquet.apache.org/) files. Supports:
- `.NET 4.5` and up.
@@ -62,6 +62,78 @@ As JSON is usually human readable you can use this command to view the file.

By default **parq** displays the first 10 rows of the source file, however you can override it with `--max-rows` option.

### Viewing Internal Metadata

Internal metadata is read from the parquet file's internals and describes pretty much everything we know about the file. This metadata is not exposed by default from the Parquet.Net API, as it's hard to work with from the user's perspective; however, it can be extremely useful for performance tuning and for a general understanding of how a particular file is structured.

To view this metadata, type

```powershell
parq meta <path-to-file>
```

sample output:

```bash
parq meta stats_test.parquet
```

```
parq v1.0.0
File Metadata
Created By parquet-mr version 1.8.3 (build aef7230e114214b7cc962a8f3fc5aeed6ce80828)
Total Rows 2
Version 1
Key-Value Metadata
org.apache.spark.sql.parquet.row.metadata {"type":"struct","fields":[{"name":"isbn","type":"string","nullable":true,"metadata":{}},{"name":"author","type":"string","nullable":true,"metadata":{}}]}
Row Groups
Row Group #0
Total Rows 2
Total Byte Size 162 (0.16 KiB)
Column #0
File Offset 4
File Path
Codec UNCOMPRESSED
Data Page Offset 4
Dictionary Page Offset 0
Index Page Offset 0
Encodings RLE, PLAIN, BIT_PACKED
Total Values 2
Path in Schema isbn
Compressed Size 67 (0.07 KiB)
Uncompressed Size 67 (0.07 KiB)
Type BYTE_ARRAY
Statistics
Null Count 0
Distinct Count undefined
Min 12345-6
Max 12345-7
Column #1
File Offset 71
File Path
Codec UNCOMPRESSED
Data Page Offset 71
Dictionary Page Offset 0
Index Page Offset 0
Encodings RLE, PLAIN, BIT_PACKED
Total Values 2
Path in Schema author
Compressed Size 95 (0.09 KiB)
Uncompressed Size 95 (0.09 KiB)
Type BYTE_ARRAY
Statistics
Null Count 0
Distinct Count undefined
Min Ivan Gavryliuk
Max Richard Conway
```

### More Commands

@@ -4,6 +4,7 @@
using System.Text;
using Cpf.Widgets;
using Parquet.Data.Rows;
using Table = Parquet.Data.Rows.Table;

namespace Parquet.CLI.Commands
{
@@ -36,5 +37,13 @@ protected Table ReadTable(int maxRows = 10)
}
}
}

/// <summary>
/// Opens the parquet file at the path held by this command and returns its
/// internal Thrift file metadata.
/// </summary>
/// <returns>The <c>ThriftMetadata</c> value exposed by the reader.</returns>
protected Thrift.FileMetaData ReadInternalMetadata()
{
    Thrift.FileMetaData metadata;

    // the reader is only needed long enough to grab the metadata reference;
    // it is disposed before the value is handed back to the caller
    using (var parquetReader = ParquetReader.OpenFromFile(_path))
    {
        metadata = parquetReader.ThriftMetadata;
    }

    return metadata;
}
}
}
@@ -0,0 +1,119 @@
using System;
using System.Collections.Generic;
using System.Text;
using Cpf.Widgets;
using static Cpf.PoshConsole;
using NetBox.Extensions;
using System.Linq;

namespace Parquet.CLI.Commands
{
/// <summary>
/// Implements the "meta" command: prints the internal (Thrift) metadata of a parquet file
/// to the console - file level information, custom key-value pairs, and for every row group
/// its column chunks together with their statistics.
/// </summary>
class MetaCommand : FileInputCommand
{
    /// <summary>
    /// Creates the command for the given file and reports the command execution to telemetry.
    /// </summary>
    /// <param name="path">Path to the parquet file, forwarded to the base command.</param>
    public MetaCommand(string path) : base(path)
    {
        Telemetry.CommandExecuted("meta",
            "path", path);
    }

    /// <summary>
    /// Reads the internal file metadata and renders it section by section:
    /// root metadata, custom key-value metadata, then each row group.
    /// </summary>
    public void Execute()
    {
        Thrift.FileMetaData fileMeta = ReadInternalMetadata();

        //root metadata
        WriteLine("File Metadata", T.HeadingTextColor);
        var t = new Table("name", "value");
        t.AddRow("Created By", fileMeta.Created_by);
        t.AddRow("Total Rows", fileMeta.Num_rows);
        t.AddRow("Version", fileMeta.Version);
        t.Render(false, 0, T.HeadingTextColor, T.NormalTextColor);
        WriteLine();

        //custom key-values
        if (fileMeta.Key_value_metadata == null || fileMeta.Key_value_metadata.Count == 0)
        {
            WriteLine("no custom key-value metadata", ConsoleColor.Gray);
        }
        else
        {
            WriteLine("Key-Value Metadata", T.HeadingTextColor);
            // column header previously read "vlaue" (typo)
            t = new Table("key", "value");
            foreach (Thrift.KeyValue kv in fileMeta.Key_value_metadata)
            {
                t.AddRow(kv.Key, kv.Value);
            }
            t.Render(false, 0, T.HeadingTextColor, T.NormalTextColor);
        }
        WriteLine();

        //row groups
        WriteLine("Row Groups", T.HeadingTextColor);
        int i = 0;
        foreach (Thrift.RowGroup rg in fileMeta.Row_groups)
        {
            Print(fileMeta, rg, i++);
        }
    }

    /// <summary>
    /// Prints a single row group: its totals first, then one table (plus statistics) per column chunk.
    /// </summary>
    /// <param name="fileMeta">File metadata; passed through to statistics decoding.</param>
    /// <param name="rg">Row group to print.</param>
    /// <param name="index">Zero-based position of the row group, used in the heading.</param>
    private void Print(Thrift.FileMetaData fileMeta, Thrift.RowGroup rg, int index)
    {
        WriteLine();
        // triple braces produce a literal '{' and '}' around the interpolated number;
        // NOTE(review): presumably PoshWriteLine treats {..} as highlight markup - confirm against Cpf
        PoshWriteLine($" Row Group #{{{index}}}", ConsoleColor.Red);
        var t = new Table("name", "value");
        t.AddRow("Total Rows", rg.Num_rows);
        t.AddRow("Total Byte Size", GetSizeString(rg.Total_byte_size));
        t.Render(false, 2, T.HeadingTextColor, T.NormalTextColor);
        WriteLine();

        //columns
        int i = 0;
        foreach (Thrift.ColumnChunk column in rg.Columns)
        {
            t = new Table("name", "value");
            t.AddRow("File Offset", column.File_offset);
            t.AddRow("File Path", column.File_path ?? string.Empty);
            t.AddRow("Codec", column.Meta_data.Codec);
            t.AddRow("Data Page Offset", column.Meta_data.Data_page_offset);
            t.AddRow("Dictionary Page Offset", column.Meta_data.Dictionary_page_offset);
            t.AddRow("Index Page Offset", column.Meta_data.Index_page_offset);
            t.AddRow("Encodings", string.Join(", ", column.Meta_data.Encodings));
            // NOTE: per-page encoding stats (Meta_data.Encoding_stats) are not printed
            t.AddRow("Total Values", column.Meta_data.Num_values);
            t.AddRow("Path in Schema", string.Join(".", column.Meta_data.Path_in_schema));
            t.AddRow("Compressed Size", GetSizeString(column.Meta_data.Total_compressed_size));
            t.AddRow("Uncompressed Size", GetSizeString(column.Meta_data.Total_uncompressed_size));
            t.AddRow("Type", column.Meta_data.Type);
            PoshWriteLine($" Column #{{{i++}}}", ConsoleColor.Red);
            t.Render(false, 4, T.HeadingTextColor, T.NormalTextColor);
            PrintStatistics(fileMeta, column, column.Meta_data.Statistics);

            WriteLine();
        }
    }

    /// <summary>
    /// Formats a byte count as the raw number followed by its file-size representation in parentheses.
    /// </summary>
    /// <param name="size">Size in bytes.</param>
    /// <returns>A string of the form "&lt;size&gt; (&lt;file size string&gt;)".</returns>
    private string GetSizeString(long size)
    {
        return $"{size} ({size.ToFileSizeString()})";
    }

    /// <summary>
    /// Prints the statistics of a column chunk, or a "none defined" notice when there are none.
    /// </summary>
    /// <param name="fileMeta">File metadata used to decode the raw min/max values.</param>
    /// <param name="column">Column chunk the statistics belong to.</param>
    /// <param name="stats">Statistics to print; may be null.</param>
    private void PrintStatistics(Thrift.FileMetaData fileMeta, Thrift.ColumnChunk column, Thrift.Statistics stats)
    {
        WriteLine(" Statistics", T.HeadingTextColor);

        // nothing to show when the structure is missing or none of the four printed fields is set
        if (stats == null || !(stats.__isset.null_count || stats.__isset.distinct_count || stats.__isset.min || stats.__isset.max))
        {
            WriteLine(" none defined", T.ErrorTextColor);
            return;
        }

        const string undefined = "undefined";

        // each field is reported individually; unset ones are shown as "undefined"
        var t = new Table("name", "value");
        t.AddRow("Null Count", stats.__isset.null_count ? stats.Null_count.ToString() : undefined);
        t.AddRow("Distinct Count", stats.__isset.distinct_count ? stats.Distinct_count.ToString() : undefined);
        t.AddRow("Min", stats.__isset.min ? fileMeta.DecodeSingleStatsValue(column, stats.Min) : undefined);
        t.AddRow("Max", stats.__isset.max ? fileMeta.DecodeSingleStatsValue(column, stats.Max) : undefined);
        t.Render(false, 6, T.HeadingTextColor, T.NormalTextColor);
    }
}
}

Some generated files are not rendered by default. Learn more.

Oops, something went wrong.
@@ -141,6 +141,9 @@
<data name="Command_Convert_Style" xml:space="preserve">
<value>Output style. By default result is a multiline json document. Specify "single" to generate an array document with list of documents embedded, or "multiline" for a multiline json.</value>
</data>
<data name="Command_Meta_Description" xml:space="preserve">
<value>Displays internal parquet metadata</value>
</data>
<data name="Command_Schema_Description" xml:space="preserve">
<value>Displays parquet file schema as Parquet.Net sees it. Note that this is a simplified, human-readable version of the schema.</value>
</data>
@@ -18,9 +18,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Cpf.Core" Version="1.1.48" />
<PackageReference Include="LogMagic" Version="2.17.3" />
<PackageReference Include="LogMagic.Microsoft.Azure.ApplicationInsights" Version="2.17.3" />
<PackageReference Include="Cpf.Core" Version="1.1.52" />
<PackageReference Include="LogMagic" Version="2.17.4" />
<PackageReference Include="LogMagic.Microsoft.Azure.ApplicationInsights" Version="2.17.4" />
</ItemGroup>

<ItemGroup>
@@ -61,6 +61,18 @@ static int Main(string[] args)
});
});

// Registers the "meta" command, which displays internal parquet metadata (see MetaCommand).
app.Command("meta", cmd =>
{
cmd.Description = Help.Command_Meta_Description;

// single "path" argument; NOTE(review): Required()/FileExists() presumably make the parser
// reject a missing argument or a non-existent file - confirm against Cpf
LinePrimitive<string> path = cmd.Argument<string>("path", Help.Argument_Path).Required().FileExists();

cmd.OnExecute(() =>
{
// path is handed to a constructor taking a string; this relies on a conversion
// provided by LinePrimitive<string>, which is not visible here
new MetaCommand(path).Execute();
});
});

app.Command("convert", cmd =>
{
cmd.Description = Help.Command_Convert_Description;
@@ -6,7 +6,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="LogMagic" Version="2.17.3" />
<PackageReference Include="LogMagic" Version="2.17.4" />
</ItemGroup>

<ItemGroup>
@@ -113,7 +113,7 @@ public void Empty_list()
Assert.Equal(3, topLevel.Count);

Assert.Equal(2, topLevel[0].ToDataArray().Length);
Assert.Equal(0, topLevel[1].ToDataArray().Length);
Assert.Empty(topLevel[1].ToDataArray());
Assert.Equal(2, topLevel[2].ToDataArray().Length);
}
}
@@ -88,6 +88,8 @@ protected DataColumn[] WriteReadSingleRowGroup(Schema schema, DataColumn[] colum
ms.WriteSingleRowGroupParquetFile(schema, columns);
ms.Position = 0;

//System.IO.File.WriteAllBytes("c:\\tmp\\1.parquet", ms.ToArray());

using (var reader = new ParquetReader(ms))
{
readSchema = reader.Schema;
@@ -20,11 +20,15 @@
#pragma warning disable CS1587
namespace Thrift.Protocol
{
/// <summary>
/// Contract for Thrift objects that can write themselves out to a protocol.
/// </summary>
interface TAbstractBase
{
/// <summary>
/// Writes the objects out to the protocol
/// </summary>
void Write(TProtocol tProtocol);
}
/// <summary>
/// Contract for Thrift objects that can write themselves out to a protocol.
/// </summary>
public interface TAbstractBase
{
/// <summary>
/// Writes the objects out to the protocol
/// </summary>
/// <param name="tProtocol">Protocol the object is written to.</param>
void Write(TProtocol tProtocol);
}
}
@@ -1,5 +1,5 @@
#pragma warning disable CS1587
/**
#pragma warning disable CS1591 // Missing XML comment for publicly visible type or member
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -19,11 +19,12 @@
*/
namespace Thrift.Protocol
{
interface TBase : TAbstractBase
{
/// <summary>
/// Thrift object that, in addition to what <see cref="TAbstractBase"/> declares,
/// can be read from a protocol.
/// </summary>
public interface TBase : TAbstractBase
{
/// <summary>
/// Reads the TObject from the given input protocol.
/// </summary>
/// <param name="tProtocol">Input protocol the object is read from.</param>
void Read(TProtocol tProtocol);
}
}
#pragma warning restore CS1591 // Missing XML comment for publicly visible type or member
Oops, something went wrong.

0 comments on commit 85f642b

Please sign in to comment.