In [None]:
import re
import json
import requests
import pandas
import pathlib
from collections import OrderedDict
from markdown2 import markdown
import numpy as np
from typing import *

Get raw data

In [None]:
def crawl_items(directory: str, items: List[str], timeout: float = 30.0) -> Dict[str, str]:
    """Download raw markdown pages of the Hoon reference from GitHub.

    Each page is fetched from the ``urbit/developers.urbit.org`` repository at
    ``content/reference/hoon/<directory>/<item>.md``.

    Args:
        directory: Sub-directory of the Hoon reference (e.g. ``'rune'``).
        items: Page names, without the ``.md`` extension.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A dict mapping each page name to the raw text of that page.

    Raises:
        Exception: If a page answers with any status other than 200. The
            message carries the URL, the status code and the response body.
        requests.exceptions.RequestException: On connection errors or timeouts.
    """
    url_template = (
        "https://raw.githubusercontent.com/urbit/developers.urbit.org/main/content/reference/hoon/{}/{}.md"
    )
    groups = dict()
    # A single session reuses the connection across all page downloads.
    with requests.Session() as session:
        for item in items:
            if item in groups:
                # The same page was requested twice; one download is enough.
                continue
            url = url_template.format(directory, item)
            # Without a timeout a stalled connection would block the notebook forever.
            result = session.get(url, timeout=timeout)
            if result.status_code != 200:
                raise Exception(
                    "GET {} returned HTTP {}: {}".format(url, result.status_code, result.text)
                )
            groups[item] = result.text
    return groups

In [None]:
# Page names of the rune reference, downloaded from the 'rune' directory.
rune_pages = ["bar","buc","cen","col","dot","fas","ket","lus","mic","sig","tis","wut","zap"]
rune_groups = crawl_items('rune', rune_pages)

# Page names of the standard-library reference, downloaded from the 'stdlib' directory.
# Each page is listed once (a repeated "2e" used to trigger a redundant download).
# NOTE(review): there is no "3e" entry between "3d" and "3f" — confirm that is intentional.
stdlib_pages = ["1a", "1b", "1c", "2a", "2b", "2c", "2d", "2e", "2f", "2g", "2h", "2i", "2j",
     "2k", "2l", "2m", "2n", "2o", "2p", "2q", "3a", "3b", "3c", "3d", "3f", "3g", "4a", "4b",
     "4c", "4d", "4e", "4f", "4g", "4h", "4i", "4j", "4k", "4l", "4m", "4n", "4o", "5a", "5b", "5c",
     "5d", "5e", "5f"]
stdlib_groups = crawl_items('stdlib', stdlib_pages)

Convert tables to markdown

In [None]:
# Sanity check: show the names of the rune pages that were downloaded.
rune_groups.keys()

In [None]:
def trim_page_index(groups: Dict[str, str]) -> str:
    """Strip the leading index from every page and glue the remainders together.

    Each page is cut at the first occurrence of the marker "## `"; everything
    before it is discarded. The remaining text of each page is prefixed with a
    newline and the pieces are concatenated in the dict's iteration order.

    Raises:
        ValueError: If a page does not contain the marker.
    """
    trimmed_pages = []
    for page in groups.values():
        first_heading = page.index("## `")
        trimmed_pages.append("\n" + page[first_heading:])
    return "".join(trimmed_pages)

In [None]:
# Collapse the downloaded pages of each group into one string, with the
# per-page index that precedes the first "## `" heading removed.
runes_trimmed = trim_page_index(rune_groups)
stdlib_trimmed = trim_page_index(stdlib_groups)

In [None]:
def string_to_dict(raw: str) -> Dict[str, str]:
    """Split the concatenated documentation into one entry per ``## `` section.

    The text is cut at every "\\n## " boundary and empty pieces are dropped.
    Each section is keyed by the first backtick-delimited span found on its
    heading line (for example ``|$`` for a heading that starts with "`|$`").
    If the heading line carries no such span, the stripped heading text is
    used as the key instead.

    Args:
        raw: Concatenated markdown in which sections start with "\\n## ".

    Returns:
        A dict mapping each key to its section text, with the "\\n## " prefix
        restored. When two sections share a key, the later section wins.
    """
    splits = {}
    for section in raw.split("\n## "):
        if not section:
            continue
        # Only the heading line is searched, so backticks in the body can
        # never leak into the key.
        heading = section.split("\n", 1)[0]
        opening = heading.find("`")
        # Look for the closing backtick after the opening one, wherever the
        # opening one is, instead of assuming it sits at index 0.
        closing = heading.find("`", opening + 1) if opening != -1 else -1
        if closing != -1:
            key = heading[opening + 1:closing]
        else:
            key = heading.strip()
        splits[key] = "\n## " + section  # add back markdown highlight
    return splits

In [None]:
# Break the concatenated text of each group into one dictionary entry per "## " section.
rune_dict = string_to_dict(runes_trimmed)
stdlib_dict = string_to_dict(stdlib_trimmed)

In [None]:
# Peek at the first 100 characters of the concatenated rune text.
runes_trimmed[:100]

In [None]:
# List the keys that were extracted from the rune section headings.
print(rune_dict.keys())
# print(rune_dict)

In [None]:
# Show the raw markdown stored for the '!<' entry.
print(rune_dict['!<'])

In [None]:
def _render_table_rows(table_body):
    """Render the body of one ``{% table %}`` block as markdown table rows."""
    table_string = ""
    separator_written = False
    # Rows are separated by "---" lines; cells inside a row start with "- ".
    for chunk in table_body.split("---\n"):
        cells = chunk.split("\n- ")[1:]
        if not cells:
            continue
        fence_parts = cells[1].split("```")
        if len(fence_parts) == 3:
            # The second cell is a single fenced code block: keep only the code.
            code = fence_parts[1]
            if code.lower().startswith("hoon"):
                # Drop the "hoon" language tag and the character after it.
                code = code[5:]
            # A table row must stay on one line, and a literal "|" would be
            # read as a column separator, so both are replaced.
            second = "<pre>{}</pre>".format(
                code.replace("\n", "<p>").replace("|", "&verbar;")
            )
        else:
            second = cells[1].replace("\n", "")
        first = cells[0].replace("\n", "")
        table_string += "| {} | {} |\n".format(first, second)
        if not separator_written:
            # The alignment row follows the first row actually emitted, even
            # when empty chunks were skipped before it.
            table_string += "| :----: | :---: |\n"
            separator_written = True
    return table_string


def convert_to_vs_markdown(a):
    """Convert ``{% table %}`` blocks to markdown tables and absolutize links.

    Every ``{% table %} ... {% /table %}`` block in ``a`` is replaced by a
    two-column markdown table. Fenced code in the second column is wrapped in
    ``<pre>``, with newlines turned into ``<p>`` and ``|`` into ``&verbar;``.
    Site-relative ``/reference/`` paths are then prefixed with
    ``https://developers.urbit.org``.

    Args:
        a: Markdown text of one documentation section.

    Returns:
        The converted markdown text.

    Raises:
        IndexError: If a table row holds fewer than two cells.
    """
    a = re.sub(
        r"{% table %}\n([\s\S]*?){% /table %}",
        lambda match: _render_table_rows(match.group(1)),
        a,
    )
    # Only rewrite site-relative paths. A "/reference/" that follows a word
    # character or a dot is part of a longer path or an already absolute URL,
    # and prefixing it again would corrupt the link.
    return re.sub(
        r"(?<![\w.])/reference/",
        "https://developers.urbit.org/reference/",
        a,
    )

In [None]:
# Try the conversion on a single entry ('!<') before applying it to everything.
print(convert_to_vs_markdown(rune_dict['!<']))

In [None]:
# Run the table/link conversion over every section, keeping the original keys and order.
parsed_runes = dict(zip(rune_dict.keys(), map(convert_to_vs_markdown, rune_dict.values())))
parsed_stdlib = dict(zip(stdlib_dict.keys(), map(convert_to_vs_markdown, stdlib_dict.values())))

In [None]:
# Inspect the converted markdown of the '%^' entry.
print(parsed_runes['%^'])

In [None]:
def parse_forms(string: str) -> Tuple[Union[str, float], Union[str, float], Union[str, float]]:
    """Extract the tall, wide and irregular forms from a converted rune section.

    The section is searched for table rows labelled ``Tall``, ``Wide`` and
    ``Irregular`` whose second cell starts with ``<pre>``. ``&verbar;``
    entities are turned back into ``|`` before matching.

    Args:
        string: Converted markdown of one rune section.

    Returns:
        A ``(tall, wide, irregular)`` tuple. Each element is the matched text,
        or ``np.nan`` when the section has no such row.
    """
    text = string.replace("&verbar;", "|")
    # Tall: the first whitespace-free token after the cell opening.
    tall = re.search(r'(?<=\| Tall \| <pre>  )([^\s]+)', text)
    # Wide: the leading characters up to the first lowercase letter or whitespace.
    wide = re.search(r'(?<=\| Wide \| <pre>  )([^a-z\s]+)', text)
    # Irregular: same idea, after optional whitespace and an optional "<p>".
    irregular = re.search(r'(?<=\| Irregular \| <pre>)(?:\s*(?:<p>)?\s+)([^a-z\s<]+)', text)
    return (tall.group() if tall else np.nan,
            wide.group() if wide else np.nan,
            irregular.groups()[0] if irregular else np.nan)

In [None]:
# Extract the (tall, wide, irregular) forms of every converted rune section.
rune_forms = {k: parse_forms(v) for k, v in parsed_runes.items()}
# Spot-check two runes.
print(rune_forms['%:'])
print(rune_forms['!<'])

In [None]:
# Render every converted section to HTML, then turn the "&verbar;" placeholders
# back into literal "|" characters.
html_runes = dict(
    (name, markdown(section, extras=['tables', 'fenced-code-blocks']).replace("&verbar;", "|"))
    for name, section in parsed_runes.items()
)
html_stdlib = dict(
    (name, markdown(section, extras=['tables', 'fenced-code-blocks']).replace("&verbar;", "|"))
    for name, section in parsed_stdlib.items()
)

In [None]:
# Inspect the rendered HTML of the '%~' entry.
print(html_runes['%~'])

In [None]:
# One row per rune, indexed by the rune key, holding its rendered HTML in 'doc'.
runes = pandas.DataFrame(index=html_runes.keys(), data=html_runes.values(), columns=['doc'])
runes['group'] = ['runes'] * len(html_runes)

# stdlib needs ++ trimmed from the front so that it can be matched within code
# (the first two characters of every key are dropped; presumably each key starts with "++" — verify).
stdlib = pandas.DataFrame(index=map(lambda x: x[2:], html_stdlib.keys()), data=html_stdlib.values(), columns=['doc'])
stdlib['group'] = ['stdlib'] * len(html_stdlib)

# Tall / wide / irregular forms per rune, as returned by parse_forms.
wide = pandas.DataFrame(index=rune_forms.keys(), data=rune_forms.values(), columns=['tall', 'wide', 'irregular'])

# Stack the rune and stdlib rows, then attach the form columns side by side, aligned on the index.
# NOTE(review): rows without a matching entry in `wide` should end up with NaN forms — confirm.
items = pandas.concat([pandas.concat([runes, stdlib]), wide], axis=1)
items['index'] = items.index
# Bundle each row's own name with its wide and irregular forms into one tuple.
items.loc[:,"keys"] = items[['index', 'wide', 'irregular']].apply(tuple, axis=1)
items

In [None]:
# Inspect the assembled row for the '%~' entry.
items.loc['%~']

In [None]:
# Keep only the documentation column and attach the lookup keys, with every
# non-string member (the NaN placeholders of missing forms) filtered out.
# `.assign` builds a new frame, so nothing is written into a selection of
# `items` and no SettingWithCopyWarning is raised.
out = items[["doc"]].assign(
    keys=items["keys"].apply(lambda xs: tuple(i for i in xs if isinstance(i, str)))
)
out

In [None]:
# Turn every row into a record whose fields are in reverse alphabetical order
# of their names, i.e. "keys" first and "doc" second.
finale = [
    OrderedDict(sorted(record.items(), reverse=True))
    for record in out.set_index('keys', drop=False).to_dict("records")
]
finale

In [None]:
# Serialise the records to hoon-dictionary.json in the working directory.
# pathlib is already imported in the first cell, so it is not imported again here.
# The encoding is set explicitly so the file does not depend on the platform default.
pathlib.Path("hoon-dictionary.json").write_text(json.dumps(finale, indent=4), encoding="utf-8")

In [None]:
# Final check: print the HTML documentation stored for the '%~' entry.
print(out.loc['%~','doc'])