# Minimal Mermaid Parser

## Token Types

In [1]:
/// <summary>
/// String identifiers for the kinds of token the tokenizer can emit.
/// Each constant's value is the text stored in <c>Token.Type</c>.
/// </summary>
static class TokenType
{
    /// <summary>The diagram keyword (the tokenizer recognizes <c>flowchart</c>).</summary>
    public const string DIAGRAM_TYPE_TOKEN = "DiagramTypeToken";
    /// <summary>A layout direction keyword: TB, TD, BT, RL or LR.</summary>
    public const string DIRECTION_TOKEN = "DirectionToken";
    /// <summary>A node name: a run of word characters.</summary>
    public const string NAME_TOKEN = "NameToken";
    /// <summary>An opening or closing delimiter around node text: <c>(</c>, <c>)</c>, <c>[</c> or <c>]</c>.</summary>
    public const string DELIMITER_TOKEN = "DelimiterToken";
    /// <summary>A link between two nodes, such as <c>--&gt;</c>.</summary>
    public const string LINK_TOKEN = "LinkToken";
    /// <summary>The free text found between an opening and a closing delimiter.</summary>
    public const string TEXT_TOKEN = "TextToken";
}

/// <summary>
/// A single lexical token: its kind plus the exact source text that was matched.
/// </summary>
record Token
{
    /// <summary>The token kind; one of the string constants declared on <c>TokenType</c>.</summary>
    public string Type { get; init; }
    /// <summary>The matched source text, exactly as it appeared in the input.</summary>
    public string Value { get; init; }
}

## Tokenizer

In [2]:
using System.Text.RegularExpressions;
using static TokenType;

In [4]:
/// <summary>
/// Splits a minimal Mermaid flowchart definition into <c>Token</c>s.
/// The tokenizer is a small two-state machine: outside of delimiters it
/// recognizes keywords, names, opening delimiters and links; after an opening
/// delimiter it reads free text up to the matching closing delimiter.
/// </summary>
class Tokenizer
{
    /// <summary>Lexical context that decides which patterns are tried next.</summary>
    enum Where
    {
        IDK,
        InDelimitedText
    }

    // Every pattern is anchored with \G so it can only match exactly at the
    // cursor position handed to Regex.Match(input, startat). The patterns are
    // built once and shared instead of being re-created on every match attempt.
    static Regex Anchored(string pattern) => new Regex(@"\G" + pattern, RegexOptions.IgnoreCase);

    static readonly Regex WhitespacePattern = Anchored(@"\s+");

    // The trailing \b keeps keywords from swallowing the start of a longer
    // name: without it "LRU" would become a direction "LR" plus a name "U".
    static readonly Regex DiagramTypePattern = Anchored(@"flowchart\b");
    static readonly Regex DirectionPattern = Anchored(@"(TB|TD|BT|RL|LR)\b");

    static readonly Regex NamePattern = Anchored(@"\w+");
    static readonly Regex OpeningDelimiterPattern = Anchored(@"[(\[]");

    // Two or more dashes followed by a mandatory '>' (-->, --->, ...).
    static readonly Regex LinkPattern = Anchored(@"--+>");

    // Anything up to (but excluding) the next delimiter character.
    static readonly Regex TextPattern = Anchored(@"[^[()\[\]]+");
    static readonly Regex ClosingDelimiterPattern = Anchored(@"[)\]]");

    private readonly string _str;
    private int cur;
    private Where where = Where.IDK;

    /// <summary>Creates a tokenizer positioned at the start of <paramref name="str"/>.</summary>
    /// <param name="str">The source text to tokenize.</param>
    /// <exception cref="ArgumentNullException"><paramref name="str"/> is null.</exception>
    public Tokenizer(string str) => _str = str ?? throw new ArgumentNullException(nameof(str));

    bool HasMoreChars() => cur < _str.Length;

    /// <summary>
    /// Tries <paramref name="pattern"/> at the cursor. On success the cursor
    /// moves past the match and <paramref name="match"/> holds the matched text;
    /// otherwise the cursor is left untouched and <paramref name="match"/> is null.
    /// </summary>
    bool MatchAndAdvance(Regex pattern, out string match)
    {
        var m = pattern.Match(_str, cur);

        if (!m.Success)
        {
            match = null;
            return false;
        }

        cur += m.Length;
        match = m.Value;
        return true;
    }

    /// <summary>Returns the next token, or null once the input is exhausted.</summary>
    /// <exception cref="Exception">
    /// The text at the cursor is not recognized in the current context.
    /// </exception>
    public Token GetNextToken()
    {
        // Whitespace is insignificant between tokens, but it is part of the
        // text while inside delimiters, so it is only skipped in the IDK state.
        if (where == Where.IDK)
            MatchAndAdvance(WhitespacePattern, out _);

        if (!HasMoreChars())
            return null;

        string match;

        if (where == Where.IDK)
        {
            // Keywords are tried before names because a keyword also satisfies
            // the name pattern.
            if (MatchAndAdvance(DiagramTypePattern, out match))
                return new Token { Type = DIAGRAM_TYPE_TOKEN, Value = match };

            if (MatchAndAdvance(DirectionPattern, out match))
                return new Token { Type = DIRECTION_TOKEN, Value = match };

            if (MatchAndAdvance(NamePattern, out match))
                return new Token { Type = NAME_TOKEN, Value = match };

            if (MatchAndAdvance(OpeningDelimiterPattern, out match))
            {
                // Everything up to the closing delimiter is free text.
                where = Where.InDelimitedText;
                return new Token { Type = DELIMITER_TOKEN, Value = match };
            }

            if (MatchAndAdvance(LinkPattern, out match))
                return new Token { Type = LINK_TOKEN, Value = match };
        }
        else
        {
            if (MatchAndAdvance(TextPattern, out match))
                return new Token { Type = TEXT_TOKEN, Value = match };

            if (MatchAndAdvance(ClosingDelimiterPattern, out match))
            {
                where = Where.IDK;
                return new Token { Type = DELIMITER_TOKEN, Value = match };
            }
        }

        // No pattern matched, so the cursor has not moved and still points at
        // the offending character.
        throw new Exception($"Unknown token: {_str[cur]} at {cur}");
    }

    /// <summary>Lazily yields every token of <paramref name="str"/> in source order.</summary>
    /// <param name="str">The source text to tokenize.</param>
    /// <returns>
    /// A deferred sequence; tokenizing (and any resulting exception) happens
    /// while the sequence is enumerated.
    /// </returns>
    public static IEnumerable<Token> Tokenize(string str)
    {
        var tokenizer = new Tokenizer(str);

        Token t;
        while ((t = tokenizer.GetNextToken()) != null)
            yield return t;
    }
}

In [5]:
// Demo: tokenize a small flowchart that has one simple link and one chained
// link. The call is the cell's final expression, so its token sequence is
// what the notebook renders below.
Tokenizer.Tokenize(
    @"flowchart TD
        Hello --> World(The Earth)
        Children[Crumb Crunchers] --> Parents --> Grandparents
    "
)

index,Type,Value
0,DiagramTypeToken,flowchart
1,DirectionToken,TD
2,NameToken,Hello
3,LinkToken,-->
4,NameToken,World
5,DelimiterToken,(
6,TextToken,The Earth
7,DelimiterToken,)
8,NameToken,Children
9,DelimiterToken,[


## Parser Types

In [6]:
/// <summary>
/// Common shape of every node in the parsed expression tree.
/// </summary>
public interface IExpression
{
    /// <summary>
    /// Name of the concrete expression kind; the implementations in this file
    /// set it to their own class name.
    /// </summary>
    string Type { get; }
}

/// <summary>
/// The shape of a flowchart node.
/// </summary>
public enum NodeKind
{
    /// <summary>Used by the parser for nodes without text and for nodes whose text is not opened by <c>(</c>.</summary>
    Rectangle,
    /// <summary>Used by the parser for nodes whose text is opened by <c>(</c>.</summary>
    Rounded
}

/// <summary>
/// A single flowchart node: its name, its shape and its optional display text.
/// </summary>
public class NodeExpression : IExpression
{
    /// <summary>Always <c>"NodeExpression"</c>.</summary>
    public string Type { get; } = nameof(NodeExpression);
    /// <summary>The node's name as written in the source.</summary>
    public string Name { get; init; }
    /// <summary>The node's shape.</summary>
    public NodeKind NodeKind { get; init; }
    /// <summary>The delimited text that followed the name; the parser leaves it null when there was none.</summary>
    public string Text { get; init; }
}

/// <summary>
/// A link joining two expressions.
/// </summary>
public class LinkExpression : IExpression
{
    /// <summary>Always <c>"LinkExpression"</c>.</summary>
    public string Type { get; } = nameof(LinkExpression);
    /// <summary>The link length; the parser sets it to the link token's character count minus two.</summary>
    public int Length { get; init; }
    /// <summary>The left-hand side: a node, or an earlier link when links are chained.</summary>
    public IExpression Left { get; init; }
    /// <summary>The right-hand side of the link.</summary>
    public IExpression Right { get; init; }
}

/// <summary>
/// Root of the parse result: the diagram header plus its expressions.
/// </summary>
public class Diagram
{
    /// <summary>The diagram type keyword as written in the source.</summary>
    public string DiagramType { get; init; }
    /// <summary>The direction keyword as written in the source.</summary>
    public string Direction { get; init; }
    /// <summary>The parsed expressions, in source order.</summary>
    public List<IExpression> Expressions { get; init; }
}

## Parser

In [7]:
using static TokenType;

In [8]:
/// <summary>
/// Recursive-descent parser with one token of lookahead that turns a minimal
/// Mermaid flowchart definition into a <c>Diagram</c>.
/// </summary>
/// <remarks>
/// Grammar:
/// <code>
/// Diagram        := DIAGRAM_TYPE DIRECTION ExpressionList
/// ExpressionList := LinkExpression*
/// LinkExpression := NodeExpression (LINK NodeExpression)*
/// NodeExpression := NAME Text?
/// Text           := "(" TEXT ")" | "[" TEXT "]"
/// </code>
/// </remarks>
class Parser
{
    private readonly IEnumerator<Token> _tokens;

    // The next unconsumed token, or null once the input is exhausted.
    private Token lookahead;

    /// <summary>Creates a parser over <paramref name="str"/> and primes the lookahead token.</summary>
    /// <param name="str">The source text to parse.</param>
    public Parser(string str)
    {
        _tokens = Tokenizer.Tokenize(str).GetEnumerator();
        Advance();
    }

    /// <summary>Moves the lookahead to the next token, or to null at the end of the input.</summary>
    void Advance() => lookahead = _tokens.MoveNext() ? _tokens.Current : null;

    /// <summary>Consumes and returns the lookahead token, which must be of the given type.</summary>
    /// <exception cref="Exception">The input has ended or the lookahead has a different type.</exception>
    Token Consume(string type)
    {
        var token = lookahead;

        if (token == null)
            throw new Exception($"Reached the end of the string, which was unexpected. Expected a token of type: {type}");

        if (token.Type != type)
            throw new Exception($"Was expecting a token of type {type}, but found a token of type {token.Type} instead ({token.Value}).");

        Advance();

        return token;
    }

    /// <summary>Consumes the lookahead token, which must have both the given type and the given value.</summary>
    /// <exception cref="Exception">The type or the value does not match, or the input has ended.</exception>
    Token Consume(string type, string value)
    {
        var token = Consume(type);

        if (token.Value != value)
            throw new Exception($"Expecting token value of {value} for token type {type}, but found a value of {token.Value}");

        return token;
    }

    /// <summary>Parses delimited node text and returns the text between the delimiters.</summary>
    /// <exception cref="Exception">
    /// The opening delimiter is unknown, the text is missing, or the closing
    /// delimiter does not pair with the opening one.
    /// </exception>
    string Text()
    {
        var opening = Consume(DELIMITER_TOKEN).Value;

        // Settle the required closing delimiter before reading any further so
        // an unknown opener is reported at the point where it occurs.
        var closing = opening switch
        {
            "(" => ")",
            "[" => "]",
            _ => throw new Exception($"Unrecognized delimiter {opening}")
        };

        var text = Consume(TEXT_TOKEN).Value;
        Consume(DELIMITER_TOKEN, closing);

        return text;
    }

    /// <summary>Parses a node: a name optionally followed by delimited text.</summary>
    NodeExpression NodeExpression()
    {
        var name = Consume(NAME_TOKEN).Value;

        var hasText = lookahead?.Type == DELIMITER_TOKEN;

        // "(" selects the rounded shape; "[" and a bare name are rectangles.
        // The shape is read off the lookahead before Text() consumes it.
        var kind = hasText && lookahead.Value == "("
                   ? NodeKind.Rounded
                   : NodeKind.Rectangle;

        var text = hasText ? Text() : null;

        return new NodeExpression { Name = name, NodeKind = kind, Text = text };
    }

    /// <summary>
    /// Parses a node followed by any number of <c>link node</c> pairs. Chains
    /// fold to the left: <c>A --&gt; B --&gt; C</c> becomes <c>(A --&gt; B) --&gt; C</c>.
    /// </summary>
    IExpression LinkExpression()
    {
        IExpression left = NodeExpression();

        while (lookahead?.Type == LINK_TOKEN)
        {
            // A link's length is its character count minus two, so "-->" has length 1.
            var length = Consume(LINK_TOKEN).Value.Length - 2;

            var right = NodeExpression();

            left = new LinkExpression { Length = length, Left = left, Right = right };
        }

        return left;
    }

    /// <summary>Parses consecutive expressions for as long as the lookahead is a name.</summary>
    List<IExpression> ExpressionList()
    {
        var list = new List<IExpression>();

        while (lookahead?.Type == NAME_TOKEN)
            list.Add(LinkExpression());

        return list;
    }

    /// <summary>Parses a complete diagram: header, direction and all expressions.</summary>
    /// <exception cref="Exception">Tokens remain after the last expression.</exception>
    Diagram Diagram()
    {
        var type = Consume(DIAGRAM_TYPE_TOKEN).Value;
        var direction = Consume(DIRECTION_TOKEN).Value;
        var expressions = ExpressionList();

        // The expression list stops at the first token that cannot start an
        // expression. Anything left over is a syntax error; returning here
        // without checking would silently discard the rest of the input.
        if (lookahead != null)
            throw new Exception($"Was expecting a token of type {NAME_TOKEN} or the end of the string, but found a token of type {lookahead.Type} instead ({lookahead.Value}).");

        return new Diagram { DiagramType = type, Direction = direction, Expressions = expressions };
    }

    /// <summary>Parses <paramref name="str"/> into a <c>Diagram</c>.</summary>
    /// <param name="str">The flowchart source text.</param>
    /// <returns>The parsed diagram.</returns>
    /// <exception cref="Exception">The text cannot be tokenized or does not follow the grammar.</exception>
    public static Diagram Parse(string str)
    {
        var parser = new Parser(str);

        try
        {
            return parser.Diagram();
        }
        finally
        {
            // Release the token enumerator whether parsing succeeded or not.
            parser._tokens.Dispose();
        }
    }
}

In [9]:
// Demo: parse the same sample flowchart into a Diagram. The result is kept in
// `tree` because the JSON cell further down serializes it.
var tree = Parser.Parse(
    @"flowchart TD
        Hello --> World(The Earth)
        Children[Crumb Crunchers] --> Parents --> Grandparents
    "
);

// Final expression of the cell: the notebook renders the parsed diagram.
tree

DiagramType,Direction,Expressions
flowchart,TD,"[ { Submission#7+LinkExpression: Type: LinkExpression, Length: 1, Left: { Submission#7+NodeExpression: Type: NodeExpression, Name: Hello, NodeKind: Rectangle, Text: <null> }, Right: { Submission#7+NodeExpression: Type: NodeExpression, Name: World, NodeKind: Rounded, Text: The Earth } }, { Submission#7+LinkExpression: Type: LinkExpression, Length: 1, Left: { Submission#7+LinkExpression: Type: LinkExpression, Length: 1, Left: { Submission#7+NodeExpression: Type: NodeExpression, Name: Children, NodeKind: Rectangle, Text: Crumb Crunchers }, Right: { Submission#7+NodeExpression: Type: NodeExpression, Name: Parents, NodeKind: Rectangle, Text: <null> } }, Right: { Submission#7+NodeExpression: Type: NodeExpression, Name: Grandparents, NodeKind: Rectangle, Text: <null> } } ]"


### Display As JSON

In [10]:
#r "nuget:Newtonsoft.Json"

In [11]:
using Newtonsoft.Json;

// Raise the notebook formatter's list expansion limit to its maximum
// (presumably so that long lists are no longer cut short when rendered).
Microsoft.DotNet.Interactive.Formatting.Formatter.ListExpansionLimit = Int32.MaxValue;

// Final expression of the cell: the parsed diagram serialized as indented JSON.
JsonConvert.SerializeObject(tree, new JsonSerializerSettings { Formatting = Formatting.Indented })

{
  "DiagramType": "flowchart",
  "Direction": "TD",
  "Expressions": [
    {
      "Type": "LinkExpression",
      "Length": 1,
      "Left": {
        "Type": "NodeExpression",
        "Name": "Hello",
        "NodeKind": 0,
        "Text": null
      },
      "Right": {
        "Type": "NodeExpression",
        "Name": "World",
        "NodeKind": 1,
        "Text": "The Earth"
      }
    },
    {
      "Type": "LinkExpression",
      "Length": 1,
      "Left": {
        "Type": "LinkExpression",
        "Length": 1,
        "Left": {
          "Type": "NodeExpression",
          "Name": "Children",
          "NodeKind": 0,
          "Text": "Crumb Crunchers"
        },
        "Right": {
          "Type": "NodeExpression",
          "Name": "Parents",
          "NodeKind": 0,
          "Text": null
        }
      },
      "Right": {
        "Type": "NodeExpression",
        "Name": "Grandparents",
        "NodeKind": 0,
        "Text": n