etc/scripts/docenizers/docenizer-amd64.py

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import re
import sys
import tarfile
import urllib
from urllib import request
from urllib import parse

# BeautifulSoup is a third-party package; when it cannot be imported, re-raise
# with a message that tells the user how to install it.
try:
    from bs4 import BeautifulSoup
except ImportError:
    raise ImportError("Please install BeautifulSoup (apt-get install python3-bs4 or pip install beautifulsoup4 should do it)")

# Command-line interface: the folder holding the HTML input, the path of the
# generated TypeScript file, and the folder used for the downloaded archive.
parser = argparse.ArgumentParser(
    description='Docenizes HTML version of the official Intel Asm PDFs',
)
parser.add_argument(
    '-i', '--inputfolder',
    type=str,
    default='asm-docs',
    help='Folder where the input files reside as .html. Default is ./asm-docs/',
)
parser.add_argument(
    '-o', '--outputpath',
    type=str,
    default='./asm-docs-amd64.ts',
    help='Final path of the .ts file. Default is ./asm-docs-amd64.ts',
)
parser.add_argument(
    '-d', '--downloadfolder',
    type=str,
    default='asm-docs',
    help='Folder where the archive will be downloaded and extracted',
)

# The maximum number of paragraphs from the description to copy.
MAX_DESC_PARAS = 5
# Matches the run of encoding tokens that may precede a mnemonic at the start
# of a line: two-digit hex bytes, `m64`, `NP`, `REX` / `VEX.` / `EVEX.`
# prefixes, `/`-introduced modifiers such as `/r`, and lower-case words, each
# optionally followed by whitespace.
STRIP_PREFIX = re.compile(r'^(([0-9a-fA-F]{2}|m64|NP|(REX|E?VEX\.)[.0-9A-Z]*|/[0-9a-z]+|[a-z]+)\b\s*)*')
# Matches a mnemonic at the start of a string: an upper-case letter followed by
# at least one upper-case letter or digit, an optional `*`, then whitespace or
# the end of the string. Group 1 is the mnemonic.
INSTRUCTION_RE = re.compile(r'^([A-Z][A-Z0-9]+)\*?(\s+|$)')
# Some instructions are so broken we just take their names from the filename
# (the file name is split on ':' when no name could be read from the table).
UNPARSEABLE_INSTR_NAMES = ['PSRLW:PSRLD:PSRLQ', 'PSLLW:PSLLD:PSLLQ', 'MOVBE']
# Some files contain instructions which cannot be parsed and which compilers are unlikely to emit.
# Entries are compared against the file name without its extension.
IGNORED_FILE_NAMES = [
    # SGX pseudo-instructions
    "EADD",
    "EACCEPT",
    "EAUG",
    "EACCEPTCOPY",
    "EDECVIRTCHILD",
    "EINCVIRTCHILD",
    "EINIT",
    "ELDB:ELDU:ELDBC:ELBUC",
    "EMODPE",
    "EMODPR",
    "EMODT",
    "ERDINFO",
    "ESETCONTEXT",
    "ETRACKC",
    "EBLOCK",
    "ECREATE",
    "EDBGRD",
    "EDBGWR",
    "EENTER",
    "EEXIT",
    "EEXTEND",
    "EGETKEY",
    "ELDB",
    "ELDU",
    "ENCLS",
    "ENCLU",
    "EPA",
    "EREMOVE",
    "EREPORT",
    "ERESUME",
    "ETRACK",
    "EWB",
    # VMX instructions
    "INVEPT",
    "INVVPID",
    "VMCALL",
    "VMCLEAR",
    "VMFUNC",
    "VMLAUNCH",
    "VMLAUNCH:VMRESUME",
    "VMPTRLD",
    "VMPTRST",
    "VMREAD",
    "VMRESUME",
    "VMWRITE",
    "VMXOFF",
    "VMXON",
    # Other instructions
    "INVLPG",
    "LAHF",
    "RDMSR",
    "SGDT",
    # Unparsable instructions
    # These instructions should be supported in the future
    "MONITOR",
    "MOVDQ2Q",
    "MFENCE",
]
# Some instructions are defined in multiple files. We ignore a specific set of the
# duplicates here. Entries are compared against the file name without its extension.
IGNORED_DUPLICATES = [
    'MOV-1',  # move to control reg
    'MOV-2',  # move to debug reg
    'CMPSD',  # compare doubleword (defined in CMPS:CMPSB:CMPSW:CMPSD:CMPSQ)
    'MOVQ',  # defined in MOVD:MOVQ
    'MOVSD',  # defined in MOVS:MOVSB:MOVSW:MOVSD:MOVSQ
    'VPBROADCASTB:VPBROADCASTW:VPBROADCASTD:VPBROADCASTQ',  # defined in VPBROADCAST
    "VGATHERDPS:VGATHERDPD",
    "VGATHERQPS:VGATHERQPD",
    "VPGATHERDD:VPGATHERQD",
    "VPGATHERDQ:VPGATHERQQ",
]
# Where to extract the asmdoc archive.
# NOTE(review): not referenced anywhere in the visible code; the folders
# actually used come from the command-line arguments - confirm before removing.
ASMDOC_DIR = "asm-docs"
# Location of the documentation archive and the file name it is stored under
# inside the download folder.
ARCHIVE_URL = "https://www.felixcloutier.com/x86/x86.tbz2"
ARCHIVE_NAME = "x86.tbz2"


class Instruction:
    """Documentation record for one instruction page.

    Attributes:
        name: Identifier the record was created with.
        names: Collection of mnemonics covered by the record.
        tooltip: Short description, with any trailing colons, commas and
            spaces removed.
        body: Longer description text.
    """

    def __init__(self, name, names, tooltip, body):
        self.name, self.names = name, names
        # Trim whatever mix of ':', ',' and ' ' ends the tooltip.
        self.tooltip = tooltip.rstrip(" ,:")
        self.body = body

    def __str__(self):
        return "{} = {}\n{}".format(self.name, self.tooltip, self.body)


def get_url_for_instruction(instr):
    """Return the felixcloutier.com page URL for *instr*.

    The instruction's ``name`` is percent-encoded before it is placed in the
    URL.
    """
    quoted_name = urllib.parse.quote(instr.name)
    return "https://www.felixcloutier.com/x86/" + quoted_name + ".html"


def download_asm_doc_archive(downloadfolder):
    """Download the documentation archive into *downloadfolder*.

    The folder is created when it does not exist yet. When the path exists but
    is not a directory, an error is printed and the process exits with
    status 1. The archive is fetched from ARCHIVE_URL and saved inside the
    folder under ARCHIVE_NAME.
    """
    if os.path.exists(downloadfolder):
        if not os.path.isdir(downloadfolder):
            print(f"Error: download folder {downloadfolder} is not a directory")
            sys.exit(1)
    else:
        print(f"Creating {downloadfolder} as download folder")
        os.makedirs(downloadfolder)
    destination = os.path.join(downloadfolder, ARCHIVE_NAME)
    print("Downloading archive...")
    urllib.request.urlretrieve(ARCHIVE_URL, destination)


def extract_asm_doc_archive(downloadfolder, inputfolder):
    """Extract the downloaded archive into *inputfolder*.

    Every ``.html`` file already present under ``<inputfolder>/html`` is
    deleted before the archive is unpacked.

    Args:
        downloadfolder: Folder containing the archive, stored as ARCHIVE_NAME.
        inputfolder: Folder the archive contents are extracted into.
    """
    print("Extracting file...")
    html_dir = os.path.join(inputfolder, "html")
    if os.path.isdir(html_dir):
        for root, _dirs, files in os.walk(html_dir):
            for file in files:
                if os.path.splitext(file)[1] == ".html":
                    os.remove(os.path.join(root, file))
    # The context manager closes the archive handle even when extraction fails.
    with tarfile.open(os.path.join(downloadfolder, ARCHIVE_NAME)) as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # the destination folder.
            tar.extractall(path=inputfolder, filter="data")
        else:
            tar.extractall(path=inputfolder)


def strip_non_instr(i):
    """Return *i* without the encoding tokens that precede the mnemonic.

    Some encodings place the opcode in the middle of prefix material, e.g.
    ``66 0f 38 30 /r PMOVZXBW xmm1, xmm2/m64``; whatever STRIP_PREFIX matches
    at the start of the string is removed.
    """
    without_prefix = STRIP_PREFIX.sub('', i)
    return without_prefix


def instr_name(i):
    """Return the mnemonic found at the start of *i*, or None.

    Leading encoding tokens are stripped first; the remainder has to begin
    with something INSTRUCTION_RE recognises as a mnemonic.
    """
    found = INSTRUCTION_RE.match(strip_non_instr(i))
    return found.group(1) if found else None


def _two_siblings_forward(node):
    """Return the sibling two positions after *node*, or None if there is none.

    The sibling directly after a node is expected to be the line feed between
    elements, so the interesting node is the one after that.
    """
    line_feed = node.next_sibling
    if line_feed is None:
        return None
    return line_feed.next_sibling


def get_description_paragraphs(document_soup):
    """Collect the leading ``<p>`` nodes of the page's description section.

    Starting after the element whose id is ``description``, every second
    sibling is visited. Visiting stops once MAX_DESC_PARAS paragraphs have been
    collected, when a visited node holds 20 characters of text or fewer, or
    when there are no more siblings. Visited nodes that are not ``<p>`` are
    skipped without ending the walk.

    Args:
        document_soup: Parsed document to search.

    Returns:
        A list of the collected ``<p>`` nodes; empty when the document has no
        element with id ``description``.
    """
    header = document_soup.find(id="description")
    if header is None:
        return []
    paragraphs = []
    node = _two_siblings_forward(header)
    while (node is not None
           and len(paragraphs) < MAX_DESC_PARAS
           and len(node.text) > 20):
        if node.name == "p":
            paragraphs.append(node)
        node = _two_siblings_forward(node)
    return paragraphs


def parse(filename, f):
    """Parse one instruction page into an Instruction.

    Mnemonics are read from the page's first table (and the tables directly
    following it); the tooltip and body come from the description paragraphs,
    whose links are rewritten to absolute URLs that open in a new tab.

    Args:
        filename: Page name without extension; becomes ``Instruction.name``
            and is the fallback source of mnemonics for pages listed in
            UNPARSEABLE_INSTR_NAMES.
        f: Markup handed to BeautifulSoup.

    Returns:
        The parsed Instruction, or None (after printing the reason) when the
        page has no table, no mnemonic can be determined, or no description
        paragraph is found.
    """
    doc = BeautifulSoup(f, 'html.parser')
    if doc.table is None:
        print(f"{filename}: Failed to find table")
        return None
    table = read_table(doc.table)
    names = set()

    def add_all(instrs):
        for i in instrs:
            instruction_name = instr_name(i)
            if instruction_name:
                names.add(instruction_name)

    # Column titles under which the combined opcode/instruction cell has been
    # seen, in the order they are tried.
    opcode_headers = (
        'Opcode/Instruction',
        'OpcodeInstruction',
        'Opcode Instruction',
        'Opcode*/Instruction',
        'Opcode / Instruction',
    )
    for inst in table:
        opcode_header = next((key for key in opcode_headers if key in inst), None)
        if opcode_header is not None:
            add_all(inst[opcode_header].split("\n"))
        elif 'Instruction' in inst:
            instruction_name = instr_name(inst['Instruction'])
            if not instruction_name:
                print(f"Unable to get instruction from: {inst['Instruction']}")
            else:
                names.add(instruction_name)
        # else, skip the line
    if not names:
        if filename in UNPARSEABLE_INSTR_NAMES:
            names.update(filename.split(":"))
        else:
            print(f"{filename}: Failed to read instruction table")
            return None

    description_paragraphs = get_description_paragraphs(doc)
    if not description_paragraphs:
        # Without at least one paragraph there is nothing to use as tooltip.
        print(f"{filename}: Failed to find description")
        return None

    for para in description_paragraphs:
        for link in para.find_all('a'):
            href = link.get('href')
            if href is None:
                # Anchors without a target have nothing to rewrite.
                continue
            # this urljoin will only ensure relative urls are prefixed
            # if a url is already absolute it does nothing
            link['href'] = urllib.parse.urljoin('https://www.felixcloutier.com/x86/', href)
            link['target'] = '_blank'
            link['rel'] = 'noreferrer noopener'

    return Instruction(
        filename,
        names,
        description_paragraphs[0].text.strip(),
        ''.join(map(str, description_paragraphs)).strip())


def read_table(start_table):
    """Read the instruction table starting at *start_table* into a list of dicts.

    Tables on felixcloutier may be split in half, e.g. on
    https://www.felixcloutier.com/x86/sal:sar:shl:shr, so the tables that
    directly follow *start_table* (separated only by whitespace) are read too.

    Returns:
        One dict per table row that has ``td`` cells, mapping header text to
        cell text. For header-less tables (cases like BEXTR and BZHI) one dict
        per table, keyed by the ``<strong>`` text of each cell; if such a
        table does not have exactly one row, an empty list is returned.
    """
    # Gather the start table and the tables that immediately follow it.
    tables = []
    node = start_table
    while node:
        if node.name == 'table':
            tables.append(node)
        elif node.name is not None:
            # Any other tag ends the run; whitespace between the tables,
            # i.e. the \n, has no tag name.
            break
        node = node.next_sibling

    # Finding all 'th' is not enough, since some headers are 'td'. Take every
    # child of the first 'tr' that is not just whitespace.
    headers = [cell.get_text() for cell in tables[0].tr.children if str(cell).strip()]

    if not headers:
        # Cases like BEXTR and BZHI
        result = []
        for table in tables:
            rows = table.find_all('tr')
            if len(rows) != 1:
                return []
            entry = {}
            for td in rows[0].find_all('td'):
                title = td.p.strong
                header = title.get_text()
                # Drop the title so that only the value is left in the cell.
                title.decompose()
                entry[header] = td.get_text()
            result.append(entry)
        return result

    # Common case. Header names that contain '\n's get them removed.
    keys = [header.replace('\n', '') for header in headers]
    result = []
    for table in tables:
        for row in table.find_all('tr'):
            entry = {key: column.get_text() for column, key in zip(row.find_all('td'), keys)}
            if entry:
                result.append(entry)
    return result


def parse_html(directory):
    """Parse every instruction page found under *directory*.

    The directory is walked recursively. Every ``.html`` file other than
    ``index.html`` is parsed, unless its name (without extension) is listed in
    IGNORED_DUPLICATES or IGNORED_FILE_NAMES. A page that raises while being
    parsed is reported and skipped.

    Args:
        directory: Root folder to search for pages.

    Returns:
        A list with the successfully parsed (and patched) Instruction objects.
    """
    print("Parsing instructions...")
    instructions = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if not file.endswith(".html") or file == 'index.html':
                continue
            name = os.path.splitext(file)[0]
            # Decide before opening, so ignored pages are never touched.
            if name in IGNORED_DUPLICATES or name in IGNORED_FILE_NAMES:
                continue
            with open(os.path.join(root, file), encoding='utf-8') as f2:
                try:
                    instruction = parse(name, f2)
                    if not instruction:
                        continue
                    patch_instruction(instruction)
                    instructions.append(instruction)
                except Exception as e:
                    # Broad catch: a page that fails to parse is reported and
                    # the remaining pages are still processed.
                    print(f"Error parsing {name}:\n{e}")
    return instructions


def self_test(instructions, directory):
    """Check that every instruction has a page in ``<directory>/html``.

    A warning is printed for each instruction whose ``<name>.html`` file is
    missing.

    Returns:
        True when no page is missing, False otherwise.
    """
    html_dir = os.path.join(directory, "html")
    missing = [
        inst.name
        for inst in instructions
        if not os.path.isfile(os.path.join(html_dir, inst.name + ".html"))
    ]
    for name in missing:
        print(f"Warning: {name} has not file associated")
    return not missing


def patch_instruction(instruction):
    """Apply hand-written corrections to a parsed instruction, in place.

    Only ADDSS is patched: "stores the double-precision" is replaced with
    "stores the single-precision" in both its body and its tooltip. Every
    other instruction is left untouched.
    """
    if instruction.name != "ADDSS":
        return

    print("\nPatching ADDSS")
    print("REMINDER: Check if https://github.com/compiler-explorer/compiler-explorer/issues/2380 is still relevant\n")

    wrong, right = "stores the double-precision", "stores the single-precision"
    body_before = instruction.body
    tooltip_before = instruction.tooltip
    instruction.body = body_before.replace(wrong, right)
    instruction.tooltip = tooltip_before.replace(wrong, right)


def main():
    """Generate the TypeScript opcode documentation file.

    Steps:
      1. Make sure ``<inputfolder>/html`` exists, extracting the archive from
         the download folder and downloading it first when it is missing.
      2. Parse all pages, sort the instructions by name and report mnemonics
         that appear in more than one instruction.
      3. Abort with exit status 3 if the self test fails; otherwise write the
         ``getAsmOpcode`` switch to the output path.

    Exits with status 1 when the download or the extraction that directly
    follows it fails with an IOError.
    """
    args = parser.parse_args()
    print(f"Called with: {args}")
    # If we don't have the html folder already...
    if not os.path.isdir(os.path.join(args.inputfolder, 'html')):
        # We don't, try with the compressed file
        if not os.path.isfile(os.path.join(args.downloadfolder, ARCHIVE_NAME)):
            # We can't find that either. Download it
            try:
                download_asm_doc_archive(args.downloadfolder)
                extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
            except IOError as e:
                print("Error when downloading archive:")
                print(e)
                sys.exit(1)
        else:
            # We have a file already downloaded
            extract_asm_doc_archive(args.downloadfolder, args.inputfolder)
    instructions = parse_html(args.inputfolder)
    instructions.sort(key=lambda b: b.name)
    all_inst = set()
    for inst in instructions:
        if not all_inst.isdisjoint(inst.names):
            print(f"Overlap in instruction names: {inst.names.intersection(all_inst)} for {inst.name}")
        all_inst.update(inst.names)
    # Run the self test exactly once, so each warning is printed a single time.
    if not self_test(instructions, args.inputfolder):
        print("Tests do not pass. Not writing output file. Aborting.")
        sys.exit(3)
    print(f"Writing {len(instructions)} instructions")
    with open(args.outputpath, 'w', encoding='utf-8') as f:
        f.write("""
import {AssemblyInstructionInfo} from '../base';

export function getAsmOpcode(opcode: string | undefined): AssemblyInstructionInfo | undefined {
    if (!opcode) return;
    switch (opcode.toUpperCase()) {
""".lstrip())
        for inst in instructions:
            for name in sorted(inst.names):
                f.write(f'        case "{name}":\n')
            # json.dumps ends with the closing brace; it is cut off and
            # re-added with the indentation and semicolon the switch needs.
            f.write('            return {}'.format(json.dumps({
                "tooltip": inst.tooltip,
                "html": inst.body,
                "url": get_url_for_instruction(inst)
            }, indent=16, separators=(',', ': '), sort_keys=True))[:-1] + '            };\n\n')
        f.write("""
    }
}
""")


# Run the generator only when this file is executed as a script.
if __name__ == '__main__':
    main()