Background and motivation
Currently, there is no built-in way to read or parse a BSON file, and there are no libraries that convert BSON files to JSON. Newtonsoft has a BSON writer and reader, but the reader allocates. Utf8JsonReader is a robust, well-designed low-level API that could be adapted for reading BSON.
I would propose adapting the Utf8JsonReader to the bson file format.
API Proposal
This is a rough starting point. I have a working version that I wrote myself and am currently using in production. Native support from the BCL, however, would be great.
namespace System.Formats.Bson;
/// <summary>
/// Opaque snapshot of a <see cref="Utf8BsonReader"/>'s progress. The usage sample obtains it from
/// <c>Utf8BsonReader.CurrentState</c> and passes it to the next reader's constructor so that
/// parsing can continue across separate buffers.
/// </summary>
public struct BsonReaderState
{
// NOTE(review): presumably true while the reader is positioned inside a document/object - confirm.
internal bool inObject;
// NOTE(review): presumably the byte length of a binary payload still to be read - confirm.
internal int binaryLength;
// Element type recorded when the snapshot was taken.
internal BsonElementType elementType;
// Element type recorded before the current one.
internal BsonElementType previousElementType;
// NOTE(review): presumably tracks object-vs-array nesting per depth level, as in Utf8JsonReader - confirm.
internal BitStack bitStack;
}
/// <summary>
/// Forward-only reader over BSON data, modelled on <c>Utf8JsonReader</c>.
/// Each successful <see cref="Read"/> positions the reader on one element and exposes its key and
/// value either as a contiguous span or, when <see cref="HasKeySequence"/> /
/// <see cref="HasValueSequence"/> is true, as a sequence.
/// </summary>
public ref partial struct Utf8BsonReader
{
    /// <summary>Creates a reader over a contiguous block of BSON data.</summary>
    /// <param name="bsonData">The BSON bytes to read.</param>
    /// <param name="isFinalBlock">True when no further data will follow <paramref name="bsonData"/>.</param>
    /// <param name="state">State captured from a previous reader, or a default instance to start fresh.</param>
    public Utf8BsonReader(ReadOnlySpan<byte> bsonData, bool isFinalBlock, BsonReaderState state)
    { }

    /// <summary>Creates a reader over a (possibly multi-segment) sequence of BSON data.</summary>
    /// <param name="bsonData">The BSON bytes to read.</param>
    /// <param name="isFinalBlock">True when no further data will follow <paramref name="bsonData"/>.</param>
    /// <param name="state">State captured from a previous reader, or a default instance to start fresh.</param>
    public Utf8BsonReader(ReadOnlySequence<byte> bsonData, bool isFinalBlock, BsonReaderState state)
    { }

    /// <summary>When true, the current key must be read from <see cref="KeySequence"/> rather than <see cref="KeySpan"/>.</summary>
    public bool HasKeySequence { get; private set; }

    /// <summary>When true, the current value must be read from <see cref="ValueSequence"/> rather than <see cref="ValueSpan"/>.</summary>
    public bool HasValueSequence { get; private set; }

    /// <summary>The <c>isFinalBlock</c> flag the reader was constructed with.</summary>
    public readonly bool IsFinalBlock { get; }

    /// <summary>Raw bytes of the current element's key when <see cref="HasKeySequence"/> is false.</summary>
    public ReadOnlySpan<byte> KeySpan { get; private set; }

    /// <summary>Raw bytes of the current element's value when <see cref="HasValueSequence"/> is false.</summary>
    public ReadOnlySpan<byte> ValueSpan { get; private set; }

    /// <summary>Number of input bytes consumed so far.</summary>
    public readonly long BytesConsumed { get; }

    // NOTE(review): presumably the offset at which the current element begins - confirm.
    public long ElementStartIndex { get; private set; }

    /// <summary>Current nesting depth.</summary>
    public readonly int CurrentDepth { get; }

    /// <summary>Type of the element the reader is currently positioned on.</summary>
    public readonly BsonElementType ElementType { get; }

    /// <summary>Whether the reader currently considers itself inside an array.</summary>
    public readonly bool IsInArray { get; }

    /// <summary>Bytes of the current element's key when <see cref="HasKeySequence"/> is true.</summary>
    public ReadOnlySequence<byte> KeySequence { get; private set; }

    /// <summary>Bytes of the current element's value when <see cref="HasValueSequence"/> is true.</summary>
    public ReadOnlySequence<byte> ValueSequence { get; private set; }

    /// <summary>Position within the input sequence; the usage sample hands it to <c>PipeReader.AdvanceTo</c>.</summary>
    public readonly SequencePosition Position { get; }

    /// <summary>Snapshot that can be passed to a new reader's constructor to resume reading.</summary>
    public readonly BsonReaderState CurrentState { get; }

    private bool IsLastSpan { get; }

    /// <summary>Advances to the next element.</summary>
    /// <returns>True when an element was read; false when no further element is available in the supplied data.</returns>
    // Declaration only: the implementation is intentionally omitted from this API sketch.
    public bool Read();
}
/// <summary>
/// Kinds of element a <c>Utf8BsonReader</c> can be positioned on. Values are assigned
/// sequentially from zero in declaration order.
/// </summary>
public enum BsonElementType : byte
{
    // No element has been read.
    None = 0,

    // Structural markers.
    RootObject = 1,
    StartObject = 2,
    StartArray = 3,
    EndOfObject = 4,
    EndOfArray = 5,

    // In the usage sample, Binary carries only the key; the payload arrives as a separate BinaryData element.
    Binary = 6,
    BinaryData = 7,

    // Special-purpose element kinds.
    Undefined = 8,
    ObjectId = 9,
    Regex = 10,
    Reference = 11,
    Code = 12,
    CodeWScope = 13,
    Symbol = 14,
    MinKey = 15,
    MaxKey = 16,

    // Scalar values. The usage sample decodes Integer as a 32-bit integer.
    String = 17,
    Number = 18,
    Decimal = 19,
    Boolean = 20,
    Integer = 21,
    Long = 22,
    Date = 23,
    TimeStamp = 24,
    Null = 25,
}
API Usage
using System.Buffers;
using System.IO.Pipelines;
using System.Text;
using System.Text.Json;
using ASGmbH.Lib.Formats.Bson;
using ASGmbH.Lib.Formats.Bson.TestConsole;
// Converts test.bson to indented JSON in test-out.json, streaming the input through a PipeReader
// so the whole file never has to be held in memory at once.
Console.WriteLine("Hello, World!");

using var outputFileStream = new FileStream("test-out.json", FileMode.Create, FileAccess.Write);

// `await using` flushes and disposes the writer asynchronously exactly once, before the
// underlying output stream (declared earlier) is disposed.
await using var jsonWriter = new Utf8JsonWriter(outputFileStream, new JsonWriterOptions
{
    Indented = true,
    Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
});

using var fileStream = File.OpenRead("test.bson");
var pipeReader = PipeReader.Create(fileStream);

// Carried from one buffer to the next so the reader can resume mid-document.
var state = new BsonReaderState();

while (true)
{
    var result = await pipeReader.ReadAsync();
    var buffer = result.Buffer;

    if (result.IsCompleted && buffer.IsEmpty)
    {
        break;
    }

    var position = ParseSequence(ref buffer, result.IsCompleted, ref state, jsonWriter); //, cancellationToken);

    // Everything up to `position` was consumed; the rest was examined but needs more data.
    pipeReader.AdvanceTo(position, buffer.End);

    // Once the final block has been offered to the parser no more data can arrive. Without this
    // exit, leftover bytes the parser could not consume would be returned again forever.
    if (result.IsCompleted)
    {
        break;
    }
}

await pipeReader.CompleteAsync();
await jsonWriter.FlushAsync();
// Reads every element currently available in `buffer`, forwards each one to the JSON writer, and
// reports how far the reader got. `state` is replaced with the reader's state on exit so the next
// call can pick up where this one stopped.
static SequencePosition ParseSequence(ref ReadOnlySequence<byte> buffer, bool isFinalBlock, ref BsonReaderState state, Utf8JsonWriter writer) //, CancellationToken cancellationToken)
{
    Utf8BsonReader bsonReader = new(buffer, isFinalBlock, state);

    for (bool hasElement = bsonReader.Read(); hasElement; hasElement = bsonReader.Read()) // && cancellationToken.IsCancellationRequested is false)
    {
#if DEBUG
        DebugWriteToken(ref bsonReader);
#endif
        WriteJsonToken(ref bsonReader, writer);
    }

    // Hand the resume state back to the caller before reporting the consumed position.
    state = bsonReader.CurrentState;
    return bsonReader.Position;
}
#if DEBUG
// Debug aid: prints the current element's type, its key decoded as UTF-8, and its value
// (decoded as UTF-8 for strings, hex-encoded for everything else) to the console.
static void DebugWriteToken(ref Utf8BsonReader reader)
{
    string key = reader.HasKeySequence
        ? Encoding.UTF8.GetString(reader.KeySequence)
        : Encoding.UTF8.GetString(reader.KeySpan);

    string value = (reader.ElementType, reader.HasValueSequence) switch
    {
        (BsonElementType.String, true) => Encoding.UTF8.GetString(reader.ValueSequence),
        (BsonElementType.String, false) => Encoding.UTF8.GetString(reader.ValueSpan),
        (_, true) => ValueSequenceToHexString(reader.ValueSequence),
        (_, false) => Convert.ToHexString(reader.ValueSpan)
    };

    Console.WriteLine($"Element: {reader.ElementType}, Key: {key}, Value: {value}");

    // Copies to a heap array rather than the stack: the value length is unbounded, and an
    // allocation is acceptable on this debug-only path.
    static string ValueSequenceToHexString(ReadOnlySequence<byte> sequence)
        => Convert.ToHexString(sequence.ToArray());
}
#endif
// Translates the BSON element the reader is positioned on into the corresponding JSON token(s).
// Only object/array structure, Integer, String and Binary/BinaryData values are emitted.
// NOTE(review): for any other element type the property name is written but no value follows.
static void WriteJsonToken(ref Utf8BsonReader reader, Utf8JsonWriter writer)
{
    // Multi-segment keys/values up to this many bytes are flattened on the stack; larger ones
    // use a pooled array so an oversized value cannot overflow the stack.
    const int StackScratchSize = 256;

    BsonElementType elementType = reader.ElementType;

    switch (elementType)
    {
        case BsonElementType.None:
            return;
        // The enum declares distinct end markers, so the closing token follows directly from the
        // element type instead of being inferred from IsInArray.
        case BsonElementType.EndOfObject:
            writer.WriteEndObject();
            return;
        case BsonElementType.EndOfArray:
            writer.WriteEndArray();
            return;
        case BsonElementType.RootObject:
            writer.WriteStartObject();
            return;
    }

    Span<byte> scratch = stackalloc byte[StackScratchSize];

    // BinaryData continues a preceding Binary element whose key was already written.
    // Inside an array only a StartArray element gets a property name.
    bool writesPropertyName = elementType != BsonElementType.BinaryData
        && (!reader.IsInArray || elementType == BsonElementType.StartArray);

    if (writesPropertyName)
    {
        if (reader.HasKeySequence)
        {
            // Utf8JsonWriter only accepts contiguous UTF-8, so flatten the segments first.
            ReadOnlySpan<byte> key = Flatten(reader.KeySequence, scratch, out byte[]? rentedKey);
            try
            {
                writer.WritePropertyName(key);
            }
            finally
            {
                Release(rentedKey);
            }
        }
        else
        {
            writer.WritePropertyName(reader.KeySpan);
        }
    }

    switch (elementType)
    {
        case BsonElementType.StartObject:
            writer.WriteStartObject();
            return;
        case BsonElementType.StartArray:
            writer.WriteStartArray();
            return;
        case BsonElementType.Integer:
        case BsonElementType.String:
        case BsonElementType.BinaryData:
            break;
        default:
            // Includes Binary: its payload arrives as a separate BinaryData element.
            return;
    }

    byte[]? rentedValue = null;
    try
    {
        ReadOnlySpan<byte> value = reader.HasValueSequence
            ? Flatten(reader.ValueSequence, scratch, out rentedValue)
            : reader.ValueSpan;

        switch (elementType)
        {
            case BsonElementType.Integer:
                // BSON stores int32 little-endian regardless of the host's byte order.
                writer.WriteNumberValue(System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(value));
                break;
            case BsonElementType.String:
                writer.WriteStringValue(value);
                break;
            case BsonElementType.BinaryData:
                writer.WriteStringValue(Convert.ToBase64String(value));
                break;
        }
    }
    finally
    {
        Release(rentedValue);
    }

    // Copies `sequence` into contiguous memory: into `scratch` when it fits, otherwise into an
    // array rented from the shared pool (returned through `rented`, which the caller must release).
    static ReadOnlySpan<byte> Flatten(ReadOnlySequence<byte> sequence, Span<byte> scratch, out byte[]? rented)
    {
        rented = null;
        int length = checked((int)sequence.Length);

        Span<byte> target = scratch;
        if (length > scratch.Length)
        {
            rented = ArrayPool<byte>.Shared.Rent(length);
            target = rented;
        }

        // Rented arrays may be longer than requested; expose only the copied bytes.
        target = target.Slice(0, length);
        sequence.CopyTo(target);
        return target;
    }

    // Returns a pooled array obtained by Flatten, if one was rented.
    static void Release(byte[]? rented)
    {
        if (rented is not null)
        {
            ArrayPool<byte>.Shared.Return(rented);
        }
    }
}
Alternative Designs
No response
Risks
No response
Background and motivation
Currently, there is no built-in way to read or parse a BSON file, and there are no libraries that convert BSON files to JSON. Newtonsoft has a BSON writer and reader, but the reader allocates. Utf8JsonReader is a robust, well-designed low-level API that could be adapted for reading BSON.
I would propose adapting the Utf8JsonReader to the bson file format.
API Proposal
This is a rough starting point. I have a working version that I wrote myself and am currently using in production. Native support from the BCL, however, would be great.
API Usage
Alternative Designs
No response
Risks
No response