In [3]:
import sys
sys.path.append('../../lib')

from readmdict import MDX
from bs4 import BeautifulSoup
import re
import json
import json
from jinja2 import Environment, FileSystemLoader

# Location of the LDOCE5 MDX dictionary file -- change to your own path.
MDX_PATH = './LDOCE5++ V 1-35.mdx'
mdx = MDX(MDX_PATH)

In [4]:
# Parser for LDOCE5
# NOTE(review): `entries` is not referenced by any cell visible here -- confirm
# whether something else still needs it before removing.
entries = {}
def extract(dom):
    """Turn one parsed LDOCE5 page into a plain dict.

    Args:
        dom: parsed page (a BeautifulSoup tree) for a single MDX record.

    Returns:
        ``{'title': str, 'definitions': list, 'link': None}`` when the page has
        a ``span.lm5ppbody`` that contains a ``div.dictionary``; otherwise
        ``None``.

    Raises:
        Any exception raised while walking the page (for example the
        AttributeError produced when ``h1.pagetitle`` is missing) is re-raised
        after the offending page has been printed.
    """
    try:
        body = dom.find("span", {"class": "lm5ppbody"})
        if body is None:
            return None

        dictionary = body.find("div", {"class": "dictionary"})
        if dictionary is None:
            return None

        family_node = dictionary.find("div", {"class": "wordfams"})
        title = body.find("h1", {"class": "pagetitle"}).text
        entry_nodes = body.find_all("span", {"class": "dictentry"})

        # The word-family result is currently discarded; extract_family is a stub.
        if family_node is not None:
            extract_family(family_node)

        # find_all() always yields a (possibly empty) sequence, never None.
        return {
            'title': title,
            'definitions': [extract_def(node) for node in entry_nodes],
            'link': None,
        }
    except Exception:
        # Dump the page that failed so it can be inspected, then propagate.
        print(dom)
        raise

def extract_family(el_family):
    """Placeholder for word-family extraction.

    Currently ignores ``el_family`` and returns ``None``.
    """
    return None

def extract_def(d):
    """Extract one ``span.dictentry`` element into a plain dict.

    Args:
        d: the dictionary-entry element (a BeautifulSoup tag). Chinese
           translation spans (``cn_txt``) are detached from it while parsing,
           so the element is modified in place.

    Returns:
        A dict with the keys ``business``, ``pron``, ``level``, ``pos``,
        ``freq``, ``inflections`` and ``sense`` (in that insertion order).
        ``sense`` is a list with one dict per ``span.Sense`` found in ``d``.
    """
    definition = {
        'business': 'bussdict' in d['class'],
        'pron': _compact_text(d.find("span", {"class": "PRON"})),
        'level': _compact_text(d.find("span", {"class": "tooltip LEVEL"})),
        'pos': _compact_text(d.find('span', {'class': 'lm5pp_POS'})),
        'freq': [node.get_text(strip=True) for node in d.find_all("span", {"class": "FREQ"})],
        'inflections': [node.get_text() for node in d.find_all("span", {"class": "PTandPP"})],
    }

    # English example texts already emitted for this entry; shared by all of
    # its senses so the same sentence is only reported once.
    seen_examples = set()
    definition['sense'] = [
        _parse_sense(sense_node, seen_examples)
        for sense_node in d.find_all("span", {"class": "Sense"})
    ]
    return definition


def _compact_text(node):
    """Return ``node.get_text(strip=True)``, or None when ``node`` is None."""
    return None if node is None else node.get_text(strip=True)


def _trimmed_text(node):
    """Return ``node.get_text().strip()``, or None when ``node`` is None."""
    return None if node is None else node.get_text().strip()


def _detach(node):
    """Remove ``node`` from its tree and return it; pass None through."""
    return None if node is None else node.extract()


def _parse_sense(sense_node, seen_examples):
    """Build the dict for one ``span.Sense``; adds to ``seen_examples`` as it goes.

    The statements are order-sensitive: every ``cn_txt`` span is detached
    before the English text of its parent is read, and the EXAMPLE pass runs
    before the GramExa pass.

    TODO: an earlier commented-out draft walked ``span.BoxHide`` boxes
    (``lm5ppBoxHead`` title, EXPL/EXAMPLE children); that data is still not
    extracted.
    """
    sense = {'examples': [], 'gram_examples': []}

    signpost = sense_node.find("span", {"class": "SIGNPOST"})
    if signpost is None:
        sense['cn_sign'] = None
        sense['en_sign'] = None
    else:
        sense['cn_sign'] = _compact_text(_detach(signpost.find("span", {"class": "cn_txt"})))
        sense['en_sign'] = signpost.get_text(strip=True)

    sense['gram'] = _compact_text(sense_node.find("span", {"class": "GRAM"}))

    # First DEF span is taken as the English definition, the second (if any)
    # as the Chinese one.
    def_nodes = sense_node.find_all("span", {"class": "DEF"})
    sense['en_def'] = _trimmed_text(def_nodes[0]) if def_nodes else None
    sense['cn_def'] = _trimmed_text(def_nodes[1]) if len(def_nodes) > 1 else None

    for example_node in sense_node.find_all("span", {"class": "EXAMPLE"}):
        cn_text = _trimmed_text(_detach(example_node.find("span", {"class": "cn_txt"})))
        en_text = example_node.get_text().strip()
        if en_text in seen_examples:
            continue
        seen_examples.add(en_text)
        sense['examples'].append({'cn_exm': cn_text, 'en_exm': en_text})

    for gram_node in sense_node.find_all("span", {"class": "GramExa"}):
        form = _trimmed_text(gram_node.find("span", {"class": "PROPFORM"}))
        cn_text = _trimmed_text(_detach(gram_node.find("span", {"class": "cn_txt"})))
        sense['gram_examples'].append({
            'form': form,
            'cn_gram_exm': cn_text,
            'en_gram_exm': gram_node.get_text().strip(),
        })

    return sense



def next_expl_or_example(el):
    """Tell whether ``el`` carries the class ``EXAMPLE`` or ``EXPL``.

    Elements without a ``class`` attribute never match.
    """
    if not el.has_attr("class"):
        return False
    return "EXAMPLE" in el["class"] or "EXPL" in el["class"]

# Captures the target of a redirect record ("@@@LINK=<target>").
r = re.compile(r'@@@LINK=(.*)')

def parse(index ,content):
    """Parse one MDX record into either a redirect or a full entry.

    Args:
        index: the record's headword.
        content: the record's decoded body.

    Returns:
        * For a record containing ``@@@LINK``:
          ``{'index': index, 'definitions': None, 'link': <target>}``, or
          ``None`` when the body contains ``'ldoce'`` or no ``@@@LINK=<target>``
          can be matched in it.
        * For any other record: whatever ``extract`` returns for the parsed
          HTML (a dict, or ``None``).
    """
    if "@@@LINK" not in content:
        dom = BeautifulSoup(content, "html.parser")
        return extract(dom)

    # NOTE(review): redirects whose body mentions 'ldoce' are skipped --
    # presumably internal resource links; confirm against the MDX data.
    if 'ldoce' in content:
        return None

    matches = r.search(content)
    if matches is None:
        # "@@@LINK" without "=<target>": nothing to redirect to, so skip the
        # record instead of failing on matches.group().
        return None

    return {
        'index': index,
        'definitions': None,
        'link': matches.group(1).strip()
    }


In [None]:
# extract data from ldoce5.mdx and save to dict.json
#
# `dict` maps each headword to its parsed definitions plus every index term
# that should resolve to it; `links` maps a redirect headword to its target.
# NOTE: `dict` shadows the builtin for the rest of the kernel session; the name
# is kept so anything that already refers to it keeps working.
dict = {}
links = {}
for raw_key, raw_content in mdx.items():
    entry = raw_key.decode("utf-8")
    content = raw_content.decode("utf-8")
    info = parse(entry, content)
    if not info:
        continue

    index = entry
    link = info['link']
    definitions = info['definitions']

    if definitions:
        if index in dict:
            print("ignore duplicated index: " + index)
        else:
            dict[index] = {
                "definitions": definitions,
                "indices": [index]
            }
    elif link:
        # Only real redirects are remembered; records with neither
        # definitions nor a link have nothing to contribute.
        links[index] = link

# Attach every redirect headword to the entry it points at. `links` is keyed by
# the redirect (alias) with the target as its value, so the membership test
# must use the target and the appended index term must be the alias.
for alias, target in links.items():
    if target in dict:
        dict[target]["indices"].append(alias)


with open("./dict.json", "w", encoding="utf-8") as f:
    json.dump(dict, f, ensure_ascii=False)

In [None]:
# generate mac dictionary xml file

# dict.json is written as UTF-8 with non-ASCII text left unescaped, so it is
# read back with an explicit encoding instead of the platform default.
with open('./dict.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Spot-check a single entry; raises KeyError if 'make' is not in the data.
print(json.dumps(data['make']))

template_env = Environment(loader=FileSystemLoader('.'))
template = template_env.get_template('template.html.jinja')
rendered = template.render({"entries":data})
# The rendered entries carry the same non-ASCII text, so the output encoding
# is pinned as well.
with open("./LDOCE5.xml", "w", encoding="utf-8") as f:
    f.write(rendered)