Permalink
Browse files

Nested arrays (parking) (#197)

* add experiment project, build hierarchical schema

* build out schema path

* nested non-repeated records are read, but not collected back to dataset

* reading nested structures into rows

* nested structures read in rows

* int fix

* new AddColumnar was only ever adding first row

* always return definition for merging, as null count can differ between pages

* update features page

* generate test file

* fix test project

* schema is read out properly for nested arrays, read out repetition levels

* small fixes

* slight refactoring - moving out schema building code to a separate class; general comments

* schema & schemaelement improvements; generating test files for all nesting permutations

* pretty looking Schema.ToString()

* repetitions are grouped

* schema parsed to more understandable format, but rep and def levels are ruined, I think

* slightly better way of pre-calculating column levels

* parking work on nested structures, making sure code is backward compatible and all tests are passing

* update docs
  • Loading branch information...
aloneguid authored and azurecoder committed Aug 21, 2017
1 parent dab80c7 commit 4404b456058fff76af6d236949bfbbd83b3f475e
@@ -1,5 +1,57 @@
root=true

[*.cs]
[*]
indent_style=space
indent_size=3
indent_size=3

[*.cs]
# Sort using and Import directives with System.* appearing first
dotnet_sort_system_directives_first = true

# Avoid "this." if not necessary
dotnet_style_qualification_for_field = false:suggestion
dotnet_style_qualification_for_property = false:suggestion
dotnet_style_qualification_for_method = false:suggestion
dotnet_style_qualification_for_event = false:suggestion


# Use language keywords instead of framework type names for type references
dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
dotnet_style_predefined_type_for_member_access = true:suggestion

# Suggest more modern language features when available
dotnet_style_object_initializer = true:suggestion
dotnet_style_collection_initializer = true:suggestion
dotnet_style_coalesce_expression = true:suggestion
dotnet_style_null_propagation = true:suggestion
dotnet_style_explicit_tuple_names = true:suggestion

# Prefer "var" when it makes sense
csharp_style_var_for_built_in_types = false:error
csharp_style_var_when_type_is_apparent = true:suggestion
csharp_style_var_elsewhere = false:error

# Prefer method-like constructs to have a block body
csharp_style_expression_bodied_methods = false:none
csharp_style_expression_bodied_constructors = false:none
csharp_style_expression_bodied_operators = false:none

# Prefer property-like constructs to have an expression-body
csharp_style_expression_bodied_properties = true:none
csharp_style_expression_bodied_indexers = true:none
csharp_style_expression_bodied_accessors = true:none

# Suggest more modern language features when available
csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
csharp_style_inlined_variable_declaration = true:suggestion
csharp_style_throw_expression = true:suggestion
csharp_style_conditional_delegate_call = true:suggestion

# Newline settings
csharp_new_line_before_open_brace = all
csharp_new_line_before_else = true
csharp_new_line_before_catch = true
csharp_new_line_before_finally = true
csharp_new_line_before_members_in_object_initializers = true
csharp_new_line_before_members_in_anonymous_types = true
@@ -1,7 +1,7 @@
from fastparquet import ParquetFile

pf = ParquetFile("C:\dev\parquet-dotnet\src\Parquet.Test\data\customer.impala.parquet")
pf = ParquetFile("C:\\dev\\parquet-dotnet\\src\\Parquet.Test\\data\\nested.parquet")

df = pf.to_pandas()

#print(df)
print(df)
@@ -18,6 +18,7 @@ We are implementing Parquet features gradually, and the table below outlines the
|SNAPPY Compression|yes|yes|
|Row groups|yes|yes|
|Data pages|yes|yes|
|Append to files||yes|
|Append to files|-|yes|
|Nested structures|yes|no|
|Repeated structures|no|no|
|Nested structures|no|no|
|Nested arrays|no|no|
@@ -6,7 +6,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="LogMagic" Version="2.4.15" />
<PackageReference Include="LogMagic" Version="2.4.19" />
</ItemGroup>

<ItemGroup>
@@ -23,7 +23,7 @@ static void Main(string[] args)
new SchemaElement<double>("lat"),
new SchemaElement<double>("lon"));

log.D(ds.Schema.Show());
log.D(ds.Schema.ToString());

for (int i = 0; i < 10; i++)
{
@@ -7,9 +7,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.0.0" />
<PackageReference Include="NetBox" Version="1.4.19" />
<PackageReference Include="System.ValueTuple" Version="4.3.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.3.0" />
<PackageReference Include="NetBox" Version="1.4.20" />
<PackageReference Include="System.ValueTuple" Version="4.4.0" />
<PackageReference Include="xunit" Version="2.2.0" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.2.0" />
</ItemGroup>
@@ -19,7 +19,13 @@
</ItemGroup>

<ItemGroup>
<None Update="data\nested-struct.json">
<None Update="data\nested.parquet">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="data\nested2.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="data\nested1.json">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="data\alltypes_no_headers.csv">
@@ -88,9 +94,6 @@
<None Update="data\nation.plain.parquet">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="data\nested-struct.parquet">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="data\postcodes.csv">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
@@ -162,24 +162,43 @@ public void Reads_compat_customer_impala_file()
}

[Fact]
public void Reads_nested_struct()
{
DataSet ds = ParquetReader.ReadFile(GetDataFilePath("nested-struct.parquet"));

Assert.Equal(2, ds.Count);

Assert.Equal(typeof(string), ds.Schema[0].ElementType);
public void Reads_really_mad_nested_file()
{
/* Spark schema:
root
|-- addresses: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- line1: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- openingHours: array (nullable = true)
| | | |-- element: long (containsNull = true)
| | |-- postcode: string (nullable = true)
|-- cities: array (nullable = true)
| |-- element: string (containsNull = true)
|-- comment: string (nullable = true)
|-- id: long (nullable = true)
|-- location: struct (nullable = true)
| |-- latitude: double (nullable = true)
| |-- longitude: double (nullable = true)
|-- price: struct (nullable = true)
| |-- lunch: struct (nullable = true)
| | |-- max: long (nullable = true)
| | |-- min: long (nullable = true)
*/


Assert.Throws<NotSupportedException>(() => ParquetReader.ReadFile(GetDataFilePath("nested.parquet")));

//DataSet ds = ParquetReader.ReadFile(GetDataFilePath("nested.parquet"));

//Assert.Equal(2, ds.Count);
//Assert.Equal(6, ds.Schema.Length);

/*Assert.Equal(typeof(string), ds.Schema[0].ElementType);
Assert.Equal(typeof(long), ds.Schema[1].ElementType);
Assert.Equal(typeof(Row), ds.Schema[2].ElementType);
Assert.Equal(typeof(long), ds.Schema[3].ElementType);
Assert.Equal(typeof(Row), ds.Schema[4].ElementType);
}


private string GetDataFilePath(string name)
{
string thisPath = Assembly.Load(new AssemblyName("Parquet.Test")).Location;
return Path.Combine(Path.GetDirectoryName(thisPath), "data", name);
Assert.Equal(typeof(Row), ds.Schema[4].ElementType);*/
}


This file was deleted.

Oops, something went wrong.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,29 @@
{
"comment": "this file contains all the permutations for nested structures and arrays to test Parquet parser",
"id": 1,
"cities": [ "London", "Derby" ],
"location": {
"latitude": 51.2,
"longitude": 66.3
},
"price": {
"lunch": {
"min": 1,
"max": 2
}
},
"addresses": [
{
"name": "Head Office",
"line1": "Dante Road",
"postcode": "SE11",
"openingHours": [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 ]
},
{
"name": "Small Office",
"line1": "Somewhere Else",
"postcode": "TN19",
"openingHours": [6, 7, 19, 20, 21, 22, 23]
}
]
}
@@ -0,0 +1,29 @@
{
"comment": "this file contains all the permutations for nested structures and arrays to test Parquet parser",
"id": 1,
"cities": [ "London", "Derby" ],
"location": {
"latitude": 51.2,
"longitude": 66.3
},
"price": {
"lunch": {
"min": 1,
"max": 2
}
},
"addresses": [
{
"name": "Head Office",
"line1": "Dante Road",
"postcode": "SE11",
"openingHours": [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18 ]
},
{
"name": "Small Office",
"line1": "Somewhere Else",
"postcode": "TN19",
"openingHours": [6, 7, 19, 20, 21, 22, 23]
}
]
}
@@ -1,7 +1,7 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.26430.16
VisualStudioVersion = 15.0.26730.3
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Parquet", "Parquet\Parquet.csproj", "{A0507D42-940E-4EF9-BD33-46BB6561A2F5}"
EndProject
@@ -24,6 +24,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "scripts", "scripts", "{9B67
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{60294E19-6F8D-4D78-9A62-C50489095484}"
ProjectSection(SolutionItems) = preProject
..\.editorconfig = ..\.editorconfig
..\appveyor.ps1 = ..\appveyor.ps1
..\appveyor.yml = ..\appveyor.yml
..\build.ps1 = ..\build.ps1
@@ -64,4 +65,7 @@ Global
{8B0A6566-6722-4550-8EF2-1D91DF188F9B} = {3F47B841-9074-4317-8ACA-0F2EEA34FA62}
{CBF02EBF-EB36-42AA-8557-B95CEF5FD175} = {3F47B841-9074-4317-8ACA-0F2EEA34FA62}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {B5C12140-A3BF-47C9-A4AD-91F7C4682804}
EndGlobalSection
EndGlobal
@@ -112,57 +112,6 @@ public void Add(params object[] values)
Add(new Row(values));
}

/// <summary>
/// Appends rows built from a set of column value lists, taking one value from each column per row.
/// Rows keep being produced until at least one of the columns has no more values to offer.
/// </summary>
/// <param name="columnsList">Column value lists; every list contributes one cell to each row.</param>
internal void AddColumnar(IEnumerable<IList> columnsList)
{
   IEnumerator[] cursors = columnsList.Select(column => column.GetEnumerator()).ToArray();

   foreach (IEnumerator cursor in cursors)
   {
      cursor.Reset();
   }

   while (true)
   {
      // All() stops at the first enumerator that cannot advance, which ends row production
      bool everyColumnAdvanced = cursors.All(e => e.MoveNext());
      if (!everyColumnAdvanced)
      {
         break;
      }

      _rows.Add(new Row(cursors.Select(e => e.Current)));
   }
}

/// <summary>
/// Appends rows assembled from flat column value lists. Each row is shaped by
/// <see cref="CreateRow"/> using <c>Schema.Elements</c>, and rows are produced while
/// every column enumerator can still advance.
/// </summary>
/// <param name="columnValues">Flat column value lists, one list per column.</param>
internal void AddFromFlatColumns(IEnumerable<IList> columnValues)
{
   IEnumerator[] cursors = columnValues.Select(column => column.GetEnumerator()).ToArray();

   foreach (IEnumerator cursor in cursors)
   {
      cursor.Reset();
   }

   while (cursors.All(e => e.MoveNext()))
   {
      // the flat column index restarts from zero for every new row
      int flatIndex = 0;
      _rows.Add(CreateRow(Schema.Elements, cursors, ref flatIndex));
   }
}

/// <summary>
/// Builds a single row for the given schema elements out of the current values of the flat enumerators.
/// An element that has children becomes a nested row built recursively from those children;
/// any other element takes the current value of the next flat enumerator.
/// </summary>
/// <param name="schema">Schema elements describing the cells of the row being built.</param>
/// <param name="flatValues">Flat column enumerators whose <c>Current</c> values are read.</param>
/// <param name="vi">Index of the next flat enumerator to read; advanced for every childless element consumed.</param>
/// <returns>A row holding one value per element of <paramref name="schema"/>.</returns>
internal Row CreateRow(IList<SchemaElement> schema, IEnumerator[] flatValues, ref int vi)
{
   var cells = new List<object>();

   foreach (SchemaElement element in schema)
   {
      if (element.Children.Count > 0)
      {
         // nested element: its children consume the following flat enumerators
         cells.Add(CreateRow(element.Children, flatValues, ref vi));
         continue;
      }

      IEnumerator leaf = flatValues[vi++];
      cells.Add(leaf.Current);
   }

   return new Row(cells);
}

private void Validate(Row row)
{
if (row == null)
Oops, something went wrong.

0 comments on commit 4404b45

Please sign in to comment.