In [13]:
text = """
## heading 4
- **list1**: item 1
- **list2**: _item 2_ good
  1. subitem 1
    - sub-sub item 1
    - sub-sub item 2
  2. _subitem 2_
- item 3

## heading 5
1. _new-list_ 1
2. _new-list_ 2

## heading 6
<figure>
  <img src="https://example.com/fig1.png">
</figure>

## heading 7
|A|B|__C__|
|-|-|-|
|0|**0.5**|1.0|
|1|1.5|**2.0**|

## heading 8
<figure>
  <img src="https://example.com/fig2.png">
  <figcaption>fig 2: image</figcaption>
</figure>
"""

In [14]:
import re

In [15]:
code_block_pattern = re.compile(r'```(\w+)?\n([\s\S]*?)\n```',re.DOTALL)
latex_block_pattern = re.compile(r'\$\$(.*?)\$\$', re.DOTALL)
image_xml_pattern = re.compile(r'<figure>\n([\s\S]*?)\n</figure>',re.DOTALL)
numbered_list_pattern = r'^( *)(\d+)\. '
unordered_list_pattern = r'^( *)(\-|\*) '
heading_pattern = r'^(#+)'
quote_pattern = r'> ([\s\S]+)'
table_header_pattern = r'(\|(.*)\|)'
image_pattern = r'!\[(.*?)\]\((.*?)\)'

In [16]:
code_blocks = {}
latex_blocks = {}
image_xml_blocks = {}

def replace_code_blocks(match):
    index = len(code_blocks)
    language, content = match.group(1), match.group(2)
    code_blocks[index] = ((language or 'plain text').strip(), content.strip())
    return f'CODEBLOCK_{index}'

text = code_block_pattern.sub(replace_code_blocks, text)

def replace_latex_blocks(match):
    index = len(latex_blocks)
    print(match.group(0), match.group(1))
    content = match.group(1)
    latex_blocks[index] = content.strip()
    return f'LATEXBLOCK_{index}'

text = latex_block_pattern.sub(replace_latex_blocks, text)

def replace_image_xml_blocks(match):
    index = len(image_xml_blocks)
    content = match.group(1)
    src = re.findall(r'<img src="([\s\S]*?)">',content)
    figcaption = re.findall(r'<figcaption>([\s\S]*?)</figcaption>',content)
    src = "" if not src else src[0]
    figcaption = "" if not figcaption else figcaption[0]
    image_xml_blocks[index] = (src[0], figcaption)
    return f'IMAGEXMLBLOCK_{index}'

text = image_xml_pattern.sub(replace_image_xml_blocks, text)

code_blocks, latex_blocks, image_xml_blocks

({}, {}, {0: ('h', ''), 1: ('h', 'fig 2: image')})

In [17]:
text

'\n## heading 4\n- **list1**: item 1\n- **list2**: _item 2_ good\n  1. subitem 1\n    - sub-sub item 1\n    - sub-sub item 2\n  2. _subitem 2_\n- item 3\n\n## heading 5\n1. _new-list_ 1\n2. _new-list_ 2\n\n## heading 6\nIMAGEXMLBLOCK_0\n\n## heading 7\n|A|B|__C__|\n|-|-|-|\n|0|**0.5**|1.0|\n|1|1.5|**2.0**|\n\n## heading 8\nIMAGEXMLBLOCK_1\n'

In [18]:
def text_with_style(text):
    STYLE_MAP = {
        "**": "bold",
        "_": "bold",
        "*": "italic",
        "__": "italic",
        "~~": "strikethrough",
        "`": "code",
    }

    tokens = re.split(r"(\*\*|__|\*|_|~~|`)", text)
    active_styles = set()
    spans = []

    for token in tokens:
        if not token:
            continue
        
        if token in STYLE_MAP:
            if token in active_styles:
                active_styles.remove(token)
            else:
                active_styles.add(token)
        else:
            spans.append({
                "text": {"content": token},
                "annotations": {STYLE_MAP[style]: True for style in active_styles}
            })
    
    return spans

text_with_style("~~**_item_**?!~~: item2 / item 3")

[{'text': {'content': 'item'},
  'annotations': {'bold': True, 'strikethrough': True}},
 {'text': {'content': '?!'}, 'annotations': {'strikethrough': True}},
 {'text': {'content': ': item2 / item 3'}, 'annotations': {}}]

In [19]:
def make_block(type, text):
    annotations = text_with_style(text)

    block = {
        "object": "block",
        "type": type,
        type: {
            "rich_text": annotations
        }
    }
    return block

def make_image_block(url):
    block = {
        "object": "block",
        "type": "image",
        "image": {
            "external": {
                "url": url
            }
        }
    }
    return block

def make_table_row_block(row):
    block = {
        "object": "block",
        "type": "table_row",
        "table_row": {
            "cells": [text_with_style(data) for data in row]
        }
    }
    return block

def make_table_block(row):
    block = {
        "object": "block",
        "type": "table",
        "table": {
            "has_column_header": True,
            "has_row_header": False,
            "table_width": len(row),
            "children": [make_table_row_block(row)]
        }
    }
    return block

def make_code_block(language, code):
    block = {
        "object": "block",
        "type": "code",
        "code": {
            "language": language,
            "rich_text": [{"type": "text", "text": {"content": code}}]
        }
    }
    return block

def make_latex_bloock(latex):
    block = {
        "type": "equation",
        "equation": {
            "expression": latex
        }
    }
    return block

In [20]:
blocks = []
list_stack = []
for line in text.split('\n'):
    if not line:
        continue
    
    heading_match = re.match(heading_pattern, line)
    unordered_list_match = re.match(unordered_list_pattern, line)
    numbered_list_match = re.match(numbered_list_pattern, line)
    image_match = re.match(image_pattern, line)
    table_match = re.match(table_header_pattern, line)
    quote_match = re.match(quote_pattern, line)
    horizon_rule_match = re.match("---", line)
    
    if heading_match:
        line = re.sub(heading_pattern, '', line.strip()).strip()
        block = make_block(type=f'heading_{heading_match.group(1).count("#")}', text=line)
        blocks.append(block)

    elif unordered_list_match:
        indent = len(unordered_list_match.group(1))
        if indent == 0:
            root, list_stack = [], []
        
        line = re.sub(unordered_list_pattern, '', line)
        block = make_block('bulleted_list_item', text=line)

        while list_stack and list_stack[-1][1] >= indent:
            list_stack.pop()

        if list_stack:
            parent_element = list_stack[-1][0]
            if 'children' not in parent_element[parent_element['type']]:
                parent_element[parent_element['type']]['children'] = [block]
            else:
                parent_element[parent_element['type']]['children'].append(block)
        else:
            blocks.append(block)
            
        list_stack.append((block, indent))

    elif numbered_list_match:
        indent = len(numbered_list_match.group(1))
        if indent == 0:
            root, list_stack = [], []
        
        line = re.sub(numbered_list_pattern, '', line)
        block = make_block('numbered_list_item', text=line)

        while list_stack and list_stack[-1][1] >= indent:
            list_stack.pop()

        if list_stack:
            parent_element = list_stack[-1][0]
            if 'children' not in parent_element[parent_element['type']]:
                parent_element[parent_element['type']]['children'] = [block]
            else:
                parent_element[parent_element['type']]['children'].append(block)
        else:
            blocks.append(block)
        
        list_stack.append((block, indent))

    elif image_match:
        alt_text, url = image_match.groups()
        if url.split('.')[-1] in ('bmp', 'gif', 'heic', 'jpeg', 'png', 'svg', 'tif', 'tiff'):
            block = make_image_block(url)
        else:
            block = make_block(type="paragraph", text=url)
        blocks.append(block)

    elif line.startswith("IMAGEXMLBLOCK"):
        idx = int(line.split('_')[-1])
        src, caption = image_xml_blocks[idx]
        block = make_image_block(src)
        blocks.append(block)
        block = make_block('paragraph',f'_{caption}_')
        blocks.append(block)

    elif table_match:
        table_delimeter = re.match(r'(\|[-| ]+\|)',line.strip())
        if table_delimeter:
            continue
        
        row = re.match(r'(\|(.*)\|)',line.strip()).group(2).split('|')
        row = list(map(lambda s: s.strip(), row))
        if blocks and blocks[-1]['type'] == 'table':
            block = blocks.pop()
            block['table']['children'].append(make_table_row_block(row))
        else:
            block = make_table_block(row)
        blocks.append(block)

    elif quote_match:
        line = quote_match.group(1)
        block = make_block(type='quote', text=line.strip())
        blocks.append(block)

    elif line.startswith("CODEBLOCK"):
        idx = int(line.split('_')[-1])
        language, code = code_blocks[idx]
        block = make_code_block(language, code)
        blocks.append(block)

    elif line.startswith("LATEXBLOCK"):
        idx = int(line.split('_')[-1])
        latex = latex_blocks[idx]
        block = make_latex_bloock(latex)
        blocks.append(block)
    
    elif horizon_rule_match:
        block = {'divider': {}, 'type': 'divider'}
        blocks.append(block)

    else:
        block = make_block(type='paragraph', text=line.strip())
        blocks.append(block)

blocks

[{'object': 'block',
  'type': 'heading_2',
  'heading_2': {'rich_text': [{'text': {'content': 'heading 4'},
     'annotations': {}}]}},
 {'object': 'block',
  'type': 'bulleted_list_item',
  'bulleted_list_item': {'rich_text': [{'text': {'content': 'list1'},
     'annotations': {'bold': True}},
    {'text': {'content': ': item 1'}, 'annotations': {}}]}},
 {'object': 'block',
  'type': 'bulleted_list_item',
  'bulleted_list_item': {'rich_text': [{'text': {'content': 'list2'},
     'annotations': {'bold': True}},
    {'text': {'content': ': '}, 'annotations': {}},
    {'text': {'content': 'item 2'}, 'annotations': {'bold': True}},
    {'text': {'content': ' good'}, 'annotations': {}}],
   'children': [{'object': 'block',
     'type': 'numbered_list_item',
     'numbered_list_item': {'rich_text': [{'text': {'content': 'subitem 1'},
        'annotations': {}}],
      'children': [{'object': 'block',
        'type': 'bulleted_list_item',
        'bulleted_list_item': {'rich_text': [{'text'