In [2]:
from lark import Tree, Lark

In [3]:
def unwrap(tree):
    """Render ``str(tree)`` as an indented listing with one node per line.

    The textual repr is scanned left to right and cut at three markers:

    * ``), [`` -- a node header followed by its child list: the header is
      emitted and the lines that follow are indented one level deeper;
    * ``]``    -- end of a child list: whatever precedes it is emitted and
      the indent goes back one level;
    * ``), ``  -- separator between siblings: the finished sibling is
      emitted at the current level.

    Args:
        tree: Any object; only ``str(tree)`` is inspected.

    Returns:
        The reformatted text, lines joined by a newline, two spaces per
        indent level. Text that contains none of the markers is returned
        stripped, as a single line.

    Note:
        The scan is purely textual: a token whose value itself contains
        ``]`` or ``), `` is split as if it were structure.
    """
    text = str(tree)
    lines = []
    last_line_end = 0
    indent = 0

    for i, c in enumerate(text):
        # startswith(prefix, i) tests in place; slicing text[i:] would copy
        # the remaining tail for every character (quadratic on big trees).
        if text.startswith('), [', i):
            lines.append((indent * '  ') + text[last_line_end:i + 4].strip())
            last_line_end = i + 4
            indent += 1

        elif c == ']':
            lines.append((indent * '  ') + text[last_line_end:i].strip())
            last_line_end = i
            indent -= 1

        elif text.startswith('), ', i):
            lines.append((indent * '  ') + text[last_line_end:i + 2].strip())
            last_line_end = i + 2

    # Flush what is left after the last marker. Without this the closing
    # "])" of the outermost node (and any marker-free input) was dropped.
    tail = text[last_line_end:].strip()
    if tail:
        lines.append((indent * '  ') + tail)

    return '\n'.join(lines)

In [4]:
# Lark grammar (kept in a raw string so the regex backslashes reach Lark
# unchanged) for a plain-text markup language. Entry rule: `start`.
#
# Document level: headers introduced by "!" .. "!!!!", "@"-tagged blocks,
# list items ("-" / "-{...}"), tables delimited by "|", centered regions
# between ">" and "<", "##"-delimited comments, "\image[...]{...}" and free
# text. Backtick-delimited math blocks are parsed by the rules below the
# "MATH CONTENT" banner (terms, sub/superscripts, fractions, roots, arrays).
#
# NOTE(review): `_MATH_BLOCK_DELIMITER` contains `(?!<`)`, which is a negative
#   lookahead for the two characters "<`", not a lookbehind. Other terminals
#   here (`OPEN_BRACKET`, `ARRAY_COLUMN_DIVIDER`) use `(?<!...)`, so this looks
#   like a typo for `(?<!`)` -- confirm before changing, since switching to a
#   lookbehind would alter which backticks can close a math block.
# NOTE(review): `TABLE_SEPPARATOR` uses the class `[^(\n)]`, which also
#   excludes literal "(" and ")"; presumably only the newline was meant.
# NOTE(review): `indented_comment`, `OPEN_BRACKET_EXPLICIT` and
#   `CLOSE_BRACKET_EXPLICIT` are not referenced by any other rule in this
#   string.
grammar = r'''
// sectioning

_CHAPTER_TAG: "!"
_SECTION_TAG: "!!"
_SUBSECTION_TAG: "!!!"
_SUBSUBSECTION_TAG: "!!!!"

TITLE: /[^\n`]+/

// block

_BLOCK_TAG: "@"
BLOCK_TYPE_TAG: /[A-Za-z0-9]+/
_BLOCK_TAG_SEPPARATOR: ":"
_BLOCK_END: /\n[ \t]*(?=\n[ \t]*\n)/

// whitespace

SPACE: " "
DOUBLE_SPACE: "  "
TRIPLE_SPACE: "   "
TAB: /\t/

INLINE_WHITESPACE: _INLINE_WHITESPACE
inline_whitespace: (TRIPLE_SPACE | DOUBLE_SPACE | SPACE | TAB)+
_INLINE_WHITESPACE: (TRIPLE_SPACE | DOUBLE_SPACE | SPACE | TAB)+

NEWLINE: _NEWLINE
_NEWLINE: /\n/

// center

CENTER_OPEN: ">"
CENTER_CLOSE: "<" 

// comment

COMMENT_DELIMITER: "##"
COMMENT_TEXT: /([^#]|((?<!#)#(?!#)))+/

// math block

MATH_BLOCK_DELIMITER: _MATH_BLOCK_DELIMITER
_MATH_BLOCK_DELIMITER: /(?!<`)`(?!`)/

// table

TABLE_DELIMITER: "||" | "|"
TABLE_SEPPARATOR: "|--" /[^(\n)]*/

// text

SPECIAL_LETTER: /[ěščřžýáíéďňťóúůöĚŠČŘŽÝÁÍÉĎŇŤÓÚŮÖ]/
PASSIVE_CHARACTER: /[a-zA-Z0-9\(\)\[\]\.,~\+\*\/:;'"_\?&\%]|((?<!#)#(?!#))/ | SPECIAL_LETTER
WORD: (LETTER | NUMBER | SPECIAL_LETTER | /[\(\)\[\]\.,\!@~\-\+\*\/:;'"_\?&\%]|((?<!#)#(?!#))/  )+
FIRST_WORD: PASSIVE_CHARACTER WORD?

text: INLINE_WHITESPACE? WORD (INLINE_WHITESPACE WORD)* INLINE_WHITESPACE?
sanitized_text: INLINE_WHITESPACE? FIRST_WORD (INLINE_WHITESPACE WORD)* INLINE_WHITESPACE?

// list

SIMPLE_LIST_BULLET: /\-(?!{)/
COMPOSITE_LIST_BULLET: "-{"

// image

IMAGE: "\\image" /\[[^\]]*\]/? "{" /[^}]+/ "}"


////////////////////////////////////////////////////////////////////////

start: (_INLINE_WHITESPACE | _NEWLINE)* (_object NEWLINE+)* _object (_INLINE_WHITESPACE | _NEWLINE)*

title: (TITLE | INLINE_WHITESPACE | math_block)+

_object: (chapter_header | section_header | subsection_header | subsubsection_header | list_item | block | table | center | content)
_inline_object: (chapter_header | section_header | subsection_header | subsubsection_header | list_item | block | table | center | inline_content)

content: INLINE_WHITESPACE? (image | comment | sanitized_text | math_block) (image | comment | text | math_block | INLINE_WHITESPACE)* INLINE_WHITESPACE?
inline_content: INLINE_WHITESPACE? (image | comment | text | math_block) (image | comment | text | math_block | INLINE_WHITESPACE)* INLINE_WHITESPACE?

// lists

list_bullet: SIMPLE_LIST_BULLET (/[^ `]/ | math_block)* | COMPOSITE_LIST_BULLET (/[^`}]/ | math_block)* "}" 
list_item: INLINE_WHITESPACE? list_bullet (INLINE_WHITESPACE inline_content)? _INLINE_WHITESPACE?

// block

block: block_title _block_structure _BLOCK_END
block_title: _BLOCK_TAG (_INLINE_WHITESPACE BLOCK_TYPE_TAG)? _BLOCK_TAG_SEPPARATOR (_INLINE_WHITESPACE title)? _INLINE_WHITESPACE?

_block_structure: (NEWLINE (_INLINE_WHITESPACE? _NEWLINE)? _block_object _INLINE_WHITESPACE?)*
_block_object: (block | list_item | table | center | content)

// center

center: INLINE_WHITESPACE? CENTER_OPEN (_INLINE_WHITESPACE | NEWLINE)* (_inline_object (INLINE_WHITESPACE | NEWLINE)*)+ CENTER_CLOSE _INLINE_WHITESPACE?

// table

table: table_row+
table_row: TABLE_DELIMITER (table_content _INLINE_WHITESPACE? TABLE_DELIMITER)+ _INLINE_WHITESPACE? _NEWLINE
         | TABLE_SEPPARATOR _NEWLINE
table_content: (comment | text | math_block | _INLINE_WHITESPACE)*

// math block

math_block: MATH_BLOCK_DELIMITER math_content MATH_BLOCK_DELIMITER

// comment

comment: COMMENT_DELIMITER COMMENT_TEXT COMMENT_DELIMITER
indented_comment: INLINE_WHITESPACE? comment

// headers

chapter_header: _CHAPTER_TAG _INLINE_WHITESPACE title _INLINE_WHITESPACE?
section_header: _SECTION_TAG _INLINE_WHITESPACE title _INLINE_WHITESPACE?
subsection_header: _SUBSECTION_TAG _INLINE_WHITESPACE title _INLINE_WHITESPACE?
subsubsection_header: _SUBSUBSECTION_TAG _INLINE_WHITESPACE title _INLINE_WHITESPACE?

// image

image: IMAGE



//////////////// MATH CONTENT ////////////////////////////////////////////////////////////////////////////////////////////////



NUMBER: /[0-9\.]+/
LETTER: /[a-zA-Z]/

PARENTHESES: "(" | ")" | "[" | "]" | /\|(?!\-)/ | "{{" | "}}"
PUNCTUATION: "," | "'" | ":" | ";" | "?" | "\""  | " /" | "/ "

SHORTCUT_OPERATOR: "<=>" | "<=" | "=>" | "|->" | "->" | "<-" | "<->" | ">>" | "<<" | "\\Σ" | "\\π" | "\\∪" | "\\∩" | "\\∫" | /\\(?![\\\]\[\)\(}{\|)])/
UNICODE_OPERATOR: /[∧∨∩∪∀∃∄∂∫≤≥<>\!\|∈∉∙≆≃≅×±¬∤⊍≋\~≡≢≈≉⊂⊆⊃⊇≠\=\+\-\*]/
KEYWORD_OPERATOR:  "limsup" | "liminf" | "arccos" | "arcsin"  | "argmax" | "argmin" | "mod" | "min" | "max" | "sin" | "cos" | "tan" | "exp" | "log" | "ln" | "sup" | "inf" | "sgn" | "lim" | "diag" | "span" |  "conv" |  "cone" 

MATH_SYMBOLS: /[∞⟨⟩⋱︙⠇∇⊥∅↾◦○￮∝¶✓≼≺⇳⇓⇑↓↑⊄⊅⋡∋⊴⊵⊲⊳⋪⋫⋬⋭∦⌈⌉⌊𝨨⌋⊊⊋]/
MATH_GREEK_SYMBOLS: /[αβγδεφηικλμν𝛝ωπΘψρστυχξζΩΨΣΔΦΓΛΞ]/
MATH_BB_SYMBOLS: /[𝔸𝔹ℂ𝔻𝔼𝔽𝔾ℍ𝕀𝕁𝕂𝕃𝕄ℕ𝕆ℙℚℝ𝕊𝕋𝕌𝕍𝕎𝕏𝕐ℤ]/
MATH_CAL_SYMBOLS: /[𝓐𝓑𝓒𝓓𝓔𝓕𝓖𝓗𝓘𝓙𝓚𝓛𝓜𝓝𝓞𝓟𝓠𝓡𝓢𝓣𝓤𝓥𝓦𝓧𝓨𝓩]/

OPEN_BRACKET: /(?<!{){(?!{)/
CLOSE_BRACKET: /(?<!})}(?!})/

OPEN_BRACKET_EXPLICIT: "{{"
CLOSE_BRACKET_EXPLICIT: "}}"

DIVISION_OPERATOR: "/"
ROOT_OPERATOR: "√"

_TEXT_BLOCK_DELIMITER: "``"
MATH_TEXT: /[^`]+/

// array

ARRAY_OPEN: /(\\[\(\[\{\\\|])/
ARRAY_CLOSE: /(\\[\)\]}\\\|])/
ARRAY_COLUMN_DIVIDER: /(?<!&)&(?!&)/
_ARRAY_ROW_DIVIDER: "&&"

// over/underline/sub/superscript

OVERLINE_OPERATOR: /\^(?=[ }`])/
UNDERLINE_OPERATOR: /_(?=[ }`])/
SUPERSCRIPT_OPERATOR: /\^(?![ }`])/
SUBSCRIPT_OPERATOR: /_(?![ }`])/



//////////////////////////////////////////////////////////////////


math_content: _INLINE_WHITESPACE? (expression | array | _structural | inline_whitespace | NEWLINE | comment)+ _INLINE_WHITESPACE?

expression: (_math_object inline_whitespace)* _math_object
sub_expression: OPEN_BRACKET _INLINE_WHITESPACE? expression _INLINE_WHITESPACE? CLOSE_BRACKET
              | symbol
          
_math_object: term | sub_super_script_term | underline_overline | fraction | root | text_block | array

// atomic

_structural: PUNCTUATION | PARENTHESES

term: (symbol | _structural)+
symbol: SHORTCUT_OPERATOR | UNICODE_OPERATOR | KEYWORD_OPERATOR | NUMBER | LETTER | MATH_GREEK_SYMBOLS | MATH_BB_SYMBOLS | MATH_CAL_SYMBOLS | MATH_SYMBOLS

text_block: _TEXT_BLOCK_DELIMITER MATH_TEXT _TEXT_BLOCK_DELIMITER

// operations

underline_overline: (OPEN_BRACKET | inline_whitespace) sub_expression (OVERLINE_OPERATOR | UNDERLINE_OPERATOR) (CLOSE_BRACKET | inline_whitespace)

sub_super_script_term: sub_expression SUBSCRIPT_OPERATOR sub_expression SUPERSCRIPT_OPERATOR sub_expression
                     | sub_expression SUPERSCRIPT_OPERATOR sub_expression SUBSCRIPT_OPERATOR sub_expression
                     | sub_expression (SUBSCRIPT_OPERATOR | SUPERSCRIPT_OPERATOR) sub_expression
                
                
fraction: sub_expression DIVISION_OPERATOR sub_expression
root: sub_expression? ROOT_OPERATOR sub_expression

// array

array: ARRAY_OPEN (_INLINE_WHITESPACE? _NEWLINE)? (array_row ((_ARRAY_ROW_DIVIDER | _NEWLINE) array_row)+) _INLINE_WHITESPACE? _NEWLINE? _INLINE_WHITESPACE? ARRAY_CLOSE
array_row: _array_item (_INLINE_WHITESPACE? ARRAY_COLUMN_DIVIDER _array_item)*
_array_item: _INLINE_WHITESPACE? (expression | comment | text_block | inline_whitespace)+ _INLINE_WHITESPACE? 
'''

In [12]:
# Minimal test input: a single backtick-delimited math block holding "aa^33".
sample = r'''`aa^33`'''

In [13]:
# Build the parser from the grammar string; no options are passed, so
# Lark's defaults apply.
p = Lark(grammar)

In [14]:
# Parse the sample text into a tree.
t = p.parse(sample)

In [15]:
# Show the parse tree one node per line via the unwrap() helper.
print(unwrap(t))

Tree(Token('RULE', 'start'), [
  Tree(Token('RULE', 'content'), [
    Tree(Token('RULE', 'math_block'), [
      Token('MATH_BLOCK_DELIMITER', '`'),
      Tree(Token('RULE', 'math_content'), [
        Tree(Token('RULE', 'expression'), [
          Tree(Token('RULE', 'term'), [
            Tree(Token('RULE', 'symbol'), [
              Token('LETTER', 'a')
            ])
          ])
        ]),
        Tree(Token('RULE', 'expression'), [
          Tree(Token('RULE', 'sub_super_script_term'), [
            Tree(Token('RULE', 'sub_expression'), [
              Tree(Token('RULE', 'symbol'), [
                Token('LETTER', 'a')
              ])
            ]),
            Token('SUPERSCRIPT_OPERATOR', '^'),
            Tree(Token('RULE', 'sub_expression'), [
              Tree(Token('RULE', 'symbol'), [
                Token('NUMBER', '33')
              ])
            ])
          ])
        ])
      ]),
      Token('MATH_BLOCK_DELIMITER', '`')
    ])
  ])
