Skip to content

[API Proposal]: Add bson format to .net #88422

@msschl

Description

@msschl

Background and motivation

Currently, there is no built-in way to read or parse a BSON file. There are also no libraries to convert BSON files to JSON. Newtonsoft has a BSON writer and reader, but the reader allocates. The Utf8JsonReader is a robust, well-designed low-level API that can be adapted for reading BSON files.

I propose adapting the design of the Utf8JsonReader to the BSON file format.

API Proposal

This is a rough starting point. I have written a working version myself and am currently using it in production. Native support in the BCL, however, would be great.

namespace System.Formats.Bson;

/// <summary>
/// Snapshot of a <see cref="Utf8BsonReader"/>'s progress. It is passed to the reader's
/// constructor and read back from the reader's <c>CurrentState</c> property, so that parsing
/// can continue with a new reader over the next buffer.
/// </summary>
/// <remarks>
/// All fields are internal: callers only create, store and hand back the value.
/// </remarks>
public struct BsonReaderState
{
    // NOTE(review): presumably true while the reader is positioned inside an object
    // (as opposed to an array) — confirm against the reader implementation.
    internal bool inObject;

    // NOTE(review): presumably the length of a binary payload that is still being read
    // across buffers — the proposal does not define it; confirm.
    internal int binaryLength;

    // Element type recorded in the state (the reader exposes an ElementType property of the same type).
    internal BsonElementType elementType;

    // NOTE(review): presumably the type of the element read before `elementType` — confirm.
    internal BsonElementType previousElementType;

    // NOTE(review): presumably tracks the object/array nesting, like the BitStack used by
    // Utf8JsonReader — the BitStack type is not shown in this proposal; confirm.
    internal BitStack bitStack;
}

/// <summary>
/// Forward-only reader over BSON bytes, proposed as an adaptation of <c>Utf8JsonReader</c>.
/// Each successful <see cref="Read"/> positions the reader on one element and exposes its
/// type, key and value.
/// </summary>
/// <remarks>
/// API shape only: constructor bodies are empty and <see cref="Read"/> has no body.
/// </remarks>
public ref partial struct Utf8BsonReader
{
    /// <summary>Creates a reader over a contiguous block of BSON bytes.</summary>
    /// <param name="bsonData">The bytes to read.</param>
    /// <param name="isFinalBlock">Whether <paramref name="bsonData"/> is the last block of the input.</param>
    /// <param name="state">State captured from a previous reader, or a default value when starting.</param>
    public Utf8BsonReader(ReadOnlySpan<byte> bsonData, bool isFinalBlock, BsonReaderState state)
    { }

    /// <summary>Creates a reader over a possibly multi-segment sequence of BSON bytes.</summary>
    /// <param name="bsonData">The bytes to read.</param>
    /// <param name="isFinalBlock">Whether <paramref name="bsonData"/> is the last block of the input.</param>
    /// <param name="state">State captured from a previous reader, or a default value when starting.</param>
    public Utf8BsonReader(ReadOnlySequence<byte> bsonData, bool isFinalBlock, BsonReaderState state)
    { }

    /// <summary>
    /// When <see langword="true"/>, the current key is read from <see cref="KeySequence"/>;
    /// otherwise it is read from <see cref="KeySpan"/>.
    /// </summary>
    public bool HasKeySequence { get; private set; }

    /// <summary>
    /// When <see langword="true"/>, the current value is read from <see cref="ValueSequence"/>;
    /// otherwise it is read from <see cref="ValueSpan"/>.
    /// </summary>
    public bool HasValueSequence { get; private set; }

    /// <summary>Whether the reader treats its input as the final block (presumably the constructor argument).</summary>
    public readonly bool IsFinalBlock { get; }

    /// <summary>Bytes of the current element's key; used when <see cref="HasKeySequence"/> is <see langword="false"/>.</summary>
    public ReadOnlySpan<byte> KeySpan { get; private set; }

    /// <summary>Raw bytes of the current element's value; used when <see cref="HasValueSequence"/> is <see langword="false"/>.</summary>
    public ReadOnlySpan<byte> ValueSpan { get; private set; }

    // NOTE(review): presumably the counterpart of Utf8JsonReader.BytesConsumed — not defined further in the proposal.
    public readonly long BytesConsumed { get; }

    // NOTE(review): presumably the counterpart of Utf8JsonReader.TokenStartIndex — not defined further in the proposal.
    public long ElementStartIndex { get; private set; }

    // NOTE(review): presumably the counterpart of Utf8JsonReader.CurrentDepth — not defined further in the proposal.
    public readonly int CurrentDepth { get; }

    /// <summary>Type of the element the reader is currently positioned on.</summary>
    public readonly BsonElementType ElementType { get; }

    /// <summary>Whether the reader is currently inside an array.</summary>
    public readonly bool IsInArray { get; }

    /// <summary>Bytes of the current element's key; used when <see cref="HasKeySequence"/> is <see langword="true"/>.</summary>
    public ReadOnlySequence<byte> KeySequence { get; private set; }

    /// <summary>Raw bytes of the current element's value; used when <see cref="HasValueSequence"/> is <see langword="true"/>.</summary>
    public ReadOnlySequence<byte> ValueSequence { get; private set; }

    /// <summary>
    /// Position within the input sequence reached by the reader; the usage sample passes it
    /// to <c>PipeReader.AdvanceTo</c> as the consumed position.
    /// </summary>
    public readonly SequencePosition Position { get; }

    /// <summary>
    /// State to hand to the constructor of the next reader so that reading can continue
    /// on the following buffer.
    /// </summary>
    public readonly BsonReaderState CurrentState { get; }

    // NOTE(review): presumably true when the current span is the last segment of the input — confirm.
    private bool IsLastSpan { get; }

    /// <summary>Advances to the next element.</summary>
    /// <returns>
    /// <see langword="true"/> when an element was read; <see langword="false"/> when no
    /// further element can be read from the current input.
    /// </returns>
    public bool Read();
}

/// <summary>
/// Kind of element a <see cref="Utf8BsonReader"/> is positioned on. The numeric values are
/// the declaration order starting at zero; they are spelled out so that inserting a member
/// cannot silently renumber the others.
/// </summary>
public enum BsonElementType : byte
{
    // No element.
    None = 0,

    // Container start/end markers.
    RootObject = 1,
    StartObject = 2,
    StartArray = 3,
    EndOfObject = 4,
    EndOfArray = 5,

    // Binary elements: the usage sample treats Binary as the keyed element and
    // BinaryData as the payload that follows it.
    Binary = 6,
    BinaryData = 7,

    // Remaining element kinds.
    Undefined = 8,
    ObjectId = 9,
    Regex = 10,
    Reference = 11,
    Code = 12,
    CodeWScope = 13,
    Symbol = 14,
    MinKey = 15,
    MaxKey = 16,
    String = 17,
    Number = 18,
    Decimal = 19,
    Boolean = 20,
    Integer = 21,
    Long = 22,
    Date = 23,
    TimeStamp = 24,
    Null = 25,
}

API Usage

using System.Buffers;
using System.IO.Pipelines;
using System.Text;
using System.Text.Json;

using ASGmbH.Lib.Formats.Bson;
using ASGmbH.Lib.Formats.Bson.TestConsole;

Console.WriteLine("Hello, World!");

// Converts test.bson to indented JSON in test-out.json, streaming the input through a PipeReader.
// The disposables are released asynchronously when the program ends, in reverse declaration
// order, so the JSON writer is disposed before the stream it writes to.
await using var outputFileStream = new FileStream("test-out.json", FileMode.Create, FileAccess.Write);
await using var jsonWriter = new Utf8JsonWriter(outputFileStream, new JsonWriterOptions()
{
    Indented = true,
    Encoder = System.Text.Encodings.Web.JavaScriptEncoder.UnsafeRelaxedJsonEscaping
});

await using var fileStream = File.OpenRead("test.bson");
var pipeReader = PipeReader.Create(fileStream);

// Carries the reader's progress from one buffer to the next.
var state = new BsonReaderState();

while (true)
{
    var result = await pipeReader.ReadAsync();
    var buffer = result.Buffer;

    if (result.IsCompleted && buffer.IsEmpty)
    {
        break;
    }

    var position = ParseSequence(ref buffer, result.IsCompleted, ref state, jsonWriter); //, cancellationToken);

    // Everything up to `position` is consumed; the rest of the buffer is marked as examined
    // so the next ReadAsync only returns once more data is available.
    pipeReader.AdvanceTo(position, buffer.End);

    // Once the final block has been processed no more data will arrive. Without this exit,
    // unconsumed trailing bytes would make ReadAsync return the same completed, non-empty
    // result forever.
    if (result.IsCompleted)
    {
        break;
    }
}

await pipeReader.CompleteAsync();
await jsonWriter.FlushAsync();



// Reads every element currently available in `buffer` and forwards each one to the JSON writer.
// `state` is used to construct the reader and is overwritten with the reader's state afterwards,
// so the next call can continue on the following buffer. Returns the reader's final position.
// (A CancellationToken parameter is sketched in the trailing comments but not wired up.)
static SequencePosition ParseSequence(ref ReadOnlySequence<byte> buffer, bool isFinalBlock, ref BsonReaderState state, Utf8JsonWriter writer) //, CancellationToken cancellationToken)
{
    var bsonReader = new Utf8BsonReader(buffer, isFinalBlock, state);

    // Loop for as long as Read() keeps reporting another element.
    // (&& cancellationToken.IsCancellationRequested is false)
    for (var hasElement = bsonReader.Read(); hasElement; hasElement = bsonReader.Read())
    {
#if DEBUG
        DebugWriteToken(ref bsonReader);
#endif

        WriteJsonToken(ref bsonReader, writer);
    }

    // Hand the reader's progress back to the caller before reporting the position.
    state = bsonReader.CurrentState;

    var reachedPosition = bsonReader.Position;
    return reachedPosition;
}



#if DEBUG
// Debug-only helper: prints the current element's type, key and value to the console.
// Keys and String values are decoded as UTF-8; every other value is printed as a hex string.
static void DebugWriteToken(ref Utf8BsonReader reader)
{
    var key = reader.HasKeySequence
        ? Encoding.UTF8.GetString(reader.KeySequence)
        : Encoding.UTF8.GetString(reader.KeySpan);

    var value = (reader.ElementType, reader.HasValueSequence) switch
    {
        (BsonElementType.String, true) => Encoding.UTF8.GetString(reader.ValueSequence),
        (BsonElementType.String, false) => Encoding.UTF8.GetString(reader.ValueSpan),
        (_, true) => ValueSequenceToHexString(reader.ValueSequence),
        (_, false) => Convert.ToHexString(reader.ValueSpan)
    };

    Console.WriteLine($"Element: {reader.ElementType}, Key: {key}, Value: {value}");

    // Copies the segments into one heap array before formatting. The length comes from the
    // input, so a stackalloc of that size could overflow the stack on a large value.
    static string ValueSequenceToHexString(ReadOnlySequence<byte> sequence)
    {
        return Convert.ToHexString(sequence.ToArray());
    }
}
#endif

// Writes the element the reader is currently positioned on to the JSON writer.
//
// - None writes nothing.
// - RootObject / StartObject / StartArray open a JSON object or array; EndOfObject / EndOfArray
//   close it. The end markers are matched by element type, because BsonElementType declares
//   EndOfObject and EndOfArray and has no EndOfDocument member.
// - Binary only contributes its property name; the payload arrives as a separate BinaryData
//   element and is written as a base64 string.
// - Integer is written as a JSON number, String as a JSON string.
//
// NOTE(review): element types not listed above (Boolean, Long, Null, ...) get a property name
// but no value, which leaves the JSON output incomplete. Add cases once their value encoding
// is defined by the reader.
static void WriteJsonToken(ref Utf8BsonReader reader, Utf8JsonWriter writer)
{
    var elementType = reader.ElementType;

    // Elements that never carry a property name or a value.
    switch (elementType)
    {
        case BsonElementType.None:
            return;

        case BsonElementType.EndOfObject:
            writer.WriteEndObject();
            return;

        case BsonElementType.EndOfArray:
            writer.WriteEndArray();
            return;

        case BsonElementType.RootObject:
            writer.WriteStartObject();
            return;
    }

    // A BinaryData payload belongs to the property already written for its Binary element.
    // Array items have no property name, except for StartArray.
    // NOTE(review): the StartArray exception presumably exists because IsInArray already
    // reflects the array that was just opened — confirm how nested arrays behave.
    var needsPropertyName = elementType != BsonElementType.BinaryData
        && (reader.IsInArray is false || elementType == BsonElementType.StartArray);

    if (needsPropertyName)
    {
        if (reader.HasKeySequence)
        {
            // Utf8JsonWriter.WritePropertyName has no ReadOnlySequence<byte> overload,
            // so a key that spans segments is copied into one array first.
            writer.WritePropertyName(reader.KeySequence.ToArray());
        }
        else
        {
            writer.WritePropertyName(reader.KeySpan);
        }
    }

    switch (elementType)
    {
        case BsonElementType.Binary:
            // Property name only; the value follows as a BinaryData element.
            break;

        case BsonElementType.StartObject:
            writer.WriteStartObject();
            break;

        case BsonElementType.StartArray:
            writer.WriteStartArray();
            break;

        case BsonElementType.Integer:
            // BSON stores int32 little-endian; BitConverter would follow the host byte order.
            writer.WriteNumberValue(System.Buffers.Binary.BinaryPrimitives.ReadInt32LittleEndian(ContiguousValue(ref reader)));
            break;

        case BsonElementType.String:
            writer.WriteStringValue(ContiguousValue(ref reader));
            break;

        case BsonElementType.BinaryData:
            writer.WriteStringValue(Convert.ToBase64String(ContiguousValue(ref reader)));
            break;
    }

    // Returns the current value as one contiguous span. A value that spans segments is copied
    // to a heap array: its length comes from the input, so an unbounded stackalloc could
    // overflow the stack.
    static ReadOnlySpan<byte> ContiguousValue(ref Utf8BsonReader source)
    {
        if (source.HasValueSequence)
        {
            return source.ValueSequence.ToArray();
        }

        return source.ValueSpan;
    }
}

Alternative Designs

No response

Risks

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions