In [1]:
import json
import re
from collections import defaultdict
from dataclasses import dataclass
from enum import Enum
from typing import Optional

import requests

In [2]:
# The dump is read as JSON Lines: one JSON object per line. Read it inside a
# `with` block so the handle is closed, and decode explicitly as UTF-8 rather
# than relying on the platform default (the pages contain non-ASCII text).
with open('../wiki_pages000000000000.json', 'r', encoding='utf-8') as wiki_pages_file:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    wiki_pages = [json.loads(line) for line in wiki_pages_file if line.strip()]

# Index the pages by title; if two pages share a title the later one wins.
wiki_pages_by_title = {page['title']: page for page in wiki_pages}

In [3]:
# Fetch the tag list from the local service. Without a timeout `requests`
# waits indefinitely, which would hang the notebook if the service stalls.
response = requests.get('http://localhost:8086/tags', timeout=10)
response.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing an error body
tags = response.json()

In [4]:
# Keep only the tags for which a wiki page with exactly the same title exists.
tag_names = [tag['name'] for tag in tags]
tag_pages = {
    tag_name: wiki_pages_by_title[tag_name]
    for tag_name in tag_names
    if tag_name in wiki_pages_by_title
}

In [7]:
class TagType(Enum):
    """Kinds of node produced by the DText parser."""

    bold = 1
    italic = 2
    link = 3
    header = 4
    text = 5
    br = 6


@dataclass
class Tag:
    """One parsed node: its kind, its text content and an optional argument."""

    type: TagType
    content: str
    arg: Optional[str] = None

    def to_json(self):
        """Return a JSON-serialisable dict, with the type given by its enum name."""
        return dict(type=self.type.name, content=self.content, arg=self.arg)

def parse_helper(dtext: str, tag: str):
    """Split a leading ``[tag]...[/tag]`` span off the front of ``dtext``.

    ``dtext`` is assumed to start with the opening tag (``[`` + tag + ``]``);
    that is the caller's responsibility and is not checked here.

    The closing tag is matched on the original string, ignoring ASCII case.
    Searching ``dtext.lower()`` and reusing the index is unsafe: some
    characters change length when lowercased, which shifts every offset
    after them.

    Args:
        dtext: Remaining input, beginning with the opening tag.
        tag: Tag name without brackets, e.g. ``'b'``.

    Returns:
        A ``(content, rest)`` tuple: the text between the opening and the
        first closing tag, and everything after that closing tag.

    Raises:
        ValueError: If no closing tag is present.
    """
    closing = re.search(re.escape(f'[/{tag}]'), dtext, re.IGNORECASE | re.ASCII)
    if closing is None:
        raise ValueError(f'Missing closing tag for {tag}')

    # The opening tag is '[' + tag + ']', i.e. len(tag) + 2 characters.
    content = dtext[len(tag) + 2:closing.start()]
    return content, dtext[closing.end():]

def parse_helper2(text: str, new_tag: Tag, result: list[Tag]):
    """Flush pending plain text, then append ``new_tag`` to ``result``.

    If ``text`` is non-empty it is appended first as a ``TagType.text`` node.
    ``result`` is mutated in place. Always returns ``''`` so the caller can
    reset its text buffer with ``text = parse_helper2(...)``.
    """
    flushed = [Tag(TagType.text, text)] if text else []
    result.extend(flushed + [new_tag])
    return ''

def parse_dtext(dtext: str) -> list[Tag]:
    """Parse a DText string into a flat list of ``Tag`` nodes.

    Recognised constructs:
      * ``[b]...[/b]`` and ``[i]...[/i]`` (opening tag in either case)
        become ``bold`` / ``italic`` nodes.
      * ``[[target]]`` and ``[[target|title]]`` become ``link`` nodes with
        ``content`` = title and ``arg`` = target. Only the first ``|``
        separates target from title.
      * ``hN. Title`` at the start of a line (N an ASCII digit) becomes a
        ``header`` node with ``arg`` = ``str(N)``. The rest of the line is
        the content, and the line's newline is consumed.
      * A newline becomes a ``br`` node.
      * Everything else is accumulated into ``text`` nodes.

    CRLF line endings are normalised to LF before parsing.

    Args:
        dtext: Raw DText markup.

    Returns:
        The parsed nodes in document order.

    Raises:
        ValueError: If a ``[b]``, ``[i]`` or ``[[`` construct is never closed.
    """
    result = []
    text = ''
    # Headers are a line-level construct: only accept them when nothing has
    # been consumed on the current line yet, so "graph3. foo" stays text.
    at_line_start = True

    # Normalise Windows line endings so only '\n' has to be handled below.
    dtext = dtext.replace('\r\n', '\n')

    while len(dtext) > 0:
        if dtext.startswith(('[b]', '[B]')):
            content, dtext = parse_helper(dtext, 'b')
            text = parse_helper2(text, Tag(TagType.bold, content), result)
            at_line_start = False
        elif dtext.startswith(('[i]', '[I]')):
            content, dtext = parse_helper(dtext, 'i')
            text = parse_helper2(text, Tag(TagType.italic, content), result)
            at_line_start = False
        elif dtext.startswith('[['):
            closingtag = dtext.find(']]')
            if closingtag == -1:
                raise ValueError('Missing closing tag for [[')

            content = dtext[2:closingtag]
            dtext = dtext[closingtag + 2:]

            # Split on the first '|' only, so a title that itself contains
            # '|' does not break the two-way unpacking.
            link, separator, title = content.partition('|')
            if not separator:
                title = link

            text = parse_helper2(text, Tag(TagType.link, title, link), result)
            at_line_start = False
        elif (
            at_line_start
            # Three characters ('h', digit, '.') are inspected, so at least
            # three must be present before indexing dtext[2].
            and len(dtext) >= 3
            and dtext[0] in 'hH'
            # ASCII digits only: str.isdigit() also accepts characters such
            # as superscripts that int() cannot convert.
            and dtext[1] in '0123456789'
            and dtext[2] == '.'
        ):
            header = int(dtext[1])
            # Look for the end of the line
            end = dtext.find('\n')
            if end == -1:
                end = len(dtext)

            content = dtext[3:end]
            # Skip the newline too; the header node stands for the whole line.
            dtext = dtext[end + 1:]

            text = parse_helper2(text, Tag(TagType.header, content, str(header)), result)
            at_line_start = True
        elif dtext[0] == '\n':
            dtext = dtext[1:]
            text = parse_helper2(text, Tag(TagType.br, ''), result)
            at_line_start = True
        else:
            text += dtext[0]
            dtext = dtext[1:]
            at_line_start = False

    # Flush any trailing plain text.
    if len(text) > 0:
        result.append(Tag(TagType.text, text))

    return result

# Parse the DText body of every tag's wiki page. Pages that fail with a
# ValueError are reported and left out of the result instead of aborting the run.
parsed_tag_pages = {}

for tag, page in tag_pages.items():
    try:
        parsed = parse_dtext(page['body'])
    except ValueError as e:
        print(f'Error parsing {tag}: {e}')
    else:
        parsed_tag_pages[tag] = parsed

Error parsing nanao_naru: Missing closing tag for b


In [8]:
# Write inside a `with` block so the file is flushed and closed
# deterministically; a bare open() handle passed to json.dump is never closed.
with open('src/parsed_tag_pages.json', 'w', encoding='utf-8') as parsed_pages_file:
    json.dump(
        {k: [x.to_json() for x in v] for k, v in parsed_tag_pages.items()},
        parsed_pages_file,
        indent=4,
    )

In [7]:
# Spot-check one raw wiki page record (unparsed) to inspect its fields, including the DText 'body'.
tag_pages['lips']

{'is_deleted': False,
 'other_names': ['唇'],
 'is_locked': False,
 'body': 'The part of the anatomy most anime-style artists seem to forget. You use them to talk, to eat, to hold things, to kiss, and most importantly, to tease sexual organs. Images tagged "lips" should probably be centered around them, or at least have lips prominently displayed.\r\n\r\nh4. Colors\r\n\r\n* [[Aqua lips]] [tn]← [[aqua lipstick]][/tn]\r\n* [[Black lips]]\r\n* [[Blue lips]]\r\n* [[Grey lips]] [tn]← [[grey_lipstick]][/tn]\r\n* [[Green lips]] [tn]← [[green lipstick]][/tn]\r\n* [[Orange lips]]\r\n* [[Pink lips]]\r\n* [[Purple lips]]\r\n* [[Red lips]]\r\n* [[Shiny lips]]\r\n* [[Yellow lips]]\r\n[tn]See also the [[lipstick]] colors[/tn]\r\n\r\nh4. Actions/Positions\r\n\r\n* [[Closed mouth]]\r\n* [[Cum on lips]]\r\n* [[Licking lips]]\r\n* [[Lip biting]]\r\n* [[Open mouth]]\r\n* [[Parted lips]]\r\n* [[Puckered lips]]\r\n* [[Pursed lips]]\r\n* [[Spread lips]]\r\n\r\nh4. See also\r\n\r\n* [[Dry lips]]\r\n* [[Lip ba