In [1]:
import argparse
import glob
import sys
import os
from d2lbook import config, markdown, utils, common
import logging
import re
import glob
import ipdb

In [2]:
class MarkdownCleaning(object):
    def __init__(self):
        self.math_mapping = []
        self.math_wholeline_mapping = []
        self.backtick_mapping = []
        self.english_no_ref_no_math = []
        self.math_patterns = rf'\$.*?\$'
        self.math_wholeline_patterns = r'\$\$[^$]*\$\$'
        self.ref_pattern = r':[a-z]*:`[a-z_]*`'
        self.html_pattern = r'(https[\S]*)'
        self.html_wrong_pattern = r'（https[\S]*）'
        self.backtick_pattern = r'`[^`]*`' # anything enclosed in two backticks
        
        
    def remove_space(self, text:str) -> str:
        text = text.replace(' ', '')
        text = text.replace("(", '（') # 半角到全角 half-width to full-width
        text = text.replace(")", '）') # 半角到全角 half-width to full-width
        return text
 
    def record_math(self, text:str) -> str:  
        """
        Example:
        input text = "$(\mathbf{X} \in \mathbb{R}^{n \times d}$ "
        self.math_mapping = (("$(\mathbf{X} \in \mathbb{R}^{n \times d}$ ", 
                              "$(\mathbf{X}\in\mathbb{R}^{n\timesd}$")) 
        """
        # find inline latex math match, record their original 
        # and "removed space" format in the mapping
         
        matched = set(re.findall(self.math_patterns, text))
        for m in matched:
            m_no_space = self.remove_space(m)
            self.math_mapping.append((m, m_no_space))    
        return matched

    def record_wholeline_math(self, text:str) -> str:  
        """
        Example:
        input text = "(**$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$**)"
        
        without this function, the spaces will be removed, hence,
        
        
        self.math_mapping = (("(**$$\|\mathbf{x}\|_1 = \sum_{i=1}^n \left|x_i \right|.$$**)", 
                              "(**$$\|\mathbf{x}\|_1=\sum_{i=1}^n\left|x_i\right|.$$**)")) 
        """
        # find inline latex math match, record their original 
        # and "removed space" format in the mapping
         
        matched = set(re.findall(self.math_wholeline_patterns, text))
        for m in matched:
            m_no_space = self.remove_space(m)
            self.math_wholeline_mapping.append((m, m_no_space))    
        return matched
    
    def record_backtick(self, text:str) -> str:  
        """
        Example:
        input text = "`x.reshape（3,-1）"
        self.backtick_mapping = (("`x.reshape（3,-1）", 
                              "`x.reshape(3,-1)")) 
        """
        # find inline latex math match, record their original 
        # and "removed space" format in the mapping
         
        matched = set(re.findall(self.backtick_pattern, text))
        for m in matched:
            m_no_space = self.remove_space(m)
            self.backtick_mapping.append((m, m_no_space))    
        return matched
        
    def find_noref_nomath_english(self, text:str) -> str:
        """
        Example:
        input text 1 = "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。 
                        mds那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，
                        权重为 $\(mathbf{W} \in \mathbb{R}^{d \times q}$。 \n "
        output 1 = ['mdad da', 'mds ds', 'mds']
        
        input text 2: "方程之一：*定理* (Bayes' theorem)。"
        output text 2: ["Bayes' theorem"]
        """
        # find and replace inline references like
        # numref:`...`, :eqref:`...`, :cite:`...` etc
        text_ex_ref = re.sub(self.ref_pattern, '', text)
        
        # find and replace all inline math and references
        text_ex_ref_ex_math = re.sub(self.math_patterns, '', text_ex_ref)
        
        # find all english in text, exclude inline math and references
        list_of_english_no_ref_no_math = re.findall(r'[a-zA-Z][a-z A-Z\']+[a-zA-Z]', text_ex_ref_ex_math)
        
        # find all english phases with space, record them in a list mapping
        if len(list_of_english_no_ref_no_math)>0:
            for eng in list_of_english_no_ref_no_math:
                if " " in eng:
                    eng_no_space = eng.replace(' ', '')
                    self.english_no_ref_no_math.append((eng, eng_no_space))
        return list_of_english_no_ref_no_math
        
    def recover_html_parentheses(self, text:str) -> str:
        """
        Example:
        input text: 参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html）
        output text: 参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)
        """
        wrong_html_matched = set(re.findall(self.html_wrong_pattern, text))
        for html in wrong_html_matched:
            if html[0]=="（" and html[-1]=="）":
                corrected_html = "(" + html[1:-1] + ")"
                text = re.sub(html, corrected_html, text)
        return text
    
    def recover_slides_symbol(self, text:str) -> str:
        """
        The slides symbol in d2lbook needs to be half-width rather than full-width
        
        Example:
        input text:  "（**执行原地操作**）"
        output text: "(**执行原地操作**)"
        """       
        text = text.replace("（**", "(**")
        text = text.replace("**）", "**)")
        text = text.replace("（~~", "(~~")
        text = text.replace("~~）", "~~)")  
        
        text = text.replace("](~~", "] (~~")
        text = text.replace("](**", "] (**")
        
        
        return text
        
    def clean_and_recover(self, text:str) -> str:
        """
        Example:
        input text 1: "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。
                     mds那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，
                     分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-
                     for-deep-learning/distributions.html)）模型中。 \n "
        
        output text 1: "在（mdad da） :numref:`sec_linear_regression`中我们介绍了线性回归mds ds。
                      mds那么小批量特征为$(\\mathbf{X} \\in \\mathbb{R}^{n \times d}$，
                      分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-
                      for-deep-learning/distributions.html)）模型中。\n'

        
        """
        _ = self.record_math(text)
        _ = self.record_wholeline_math(text)
        _ = self.record_backtick(text)
        
        _ = self.find_noref_nomath_english(text)
        text = self.remove_space(text)
        
        # recover all the math with spaces
        for key, value in self.math_mapping:
            text = text.replace(value, key)
 
        # previous line failed to recover math in slides "(** ... **)",
        # recover by the following line
        for key, value in self.math_wholeline_mapping:
            text = text.replace(value, key)

        # recoverr backticks content
        for key, value in self.backtick_mapping:
            text = text.replace(value, key)  
    
        # recover all the reference (like :numref:`...`) with spaces
        for key, value in self.english_no_ref_no_math:
#             # add a space in front of reference key (like "numref", "eqref", "cite"),
#             # or the html won't compile
#             new_key = " "+key
#             text = text.replace(value, new_key)
            text = text.replace(value, key) 
    
        # recover htmls wrong parentheses to be [xxx](htmls:...)
        text = self.recover_html_parentheses(text)
            
        # recover some ref (or the html won't compile)
        text = text.replace(":numref:", " :numref:")
        text = text.replace(":eqref:", " :eqref:")
        text = text.replace(":cite:", " :cite:")
        
        # recover slides symbols
        text = self.recover_slides_symbol(text)
        
        return text
        

    
    
mc = MarkdownCleaning()

In [74]:
ss = "方程之一：*定理* (Bayes' theorem)。"
eng = re.findall(r'[a-zA-Z][a-z A-Z\']+[a-zA-Z]', ss)
eng

["Bayes' theorem"]

In [65]:
ee = "(**$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$**)"
p = r'\*\*\$\$[^$]*\$\$\*\*'
for n in re.findall(p, ee):
    print(n)

ee_out = mc.clean_and_recover(ee)
ee_out


**$$\|\mathbf{X}\|_F = \sqrt{\sum_{i=1}^m \sum_{j=1}^n x_{ij}^2}.$$**


'(**$$\\|\\mathbf{X}\\|_F = \\sqrt{\\sum_{i=1}^m \\sum_{j=1}^n x_{ij}^2}.$$**)'

In [66]:
eee = "$$\frac{d}{dx} [Cf(x)] = C \frac{d}{dx} f(x),$$"
eee_out = mc.clean_and_recover(eee)
eee_out

'$$\x0crac{d}{dx} [Cf(x)] = C \x0crac{d}{dx} f(x),$$'

In [94]:
mc = MarkdownCleaning()
ex_text = "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。 mds 那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中。 \n "
# ex_text_encode = mt.record_math(ex_text)
ex_text_nex = mc.clean_and_recover(ex_text)
ex_text_nex

'在（mdad da） :numref:`sec_linear_regression`中我们介绍了线性回归mds ds。mds那么小批量特征为$(\\mathbf{X} \\in \\mathbb{R}^{n \times d}$，分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中。\n'

In [68]:
out = mc.find_noref_nomath_english(ex_text)
# re.findall(r'[a-zA-Z][a-z A-Z]+', out)
out

['mdad da', 'mds ds', 'mds']

In [69]:
mc.english_no_ref_no_math

[('mdad da', 'mdadda'),
 ('mds ds', 'mdsds'),
 ('mdad da', 'mdadda'),
 ('mds ds', 'mdsds')]

In [91]:
sample = 'I am from 美国。We should be friends. 朋友softmax 。'
for n in re.findall(r'[a-zA-Z][a-z A-Z]*[a-zA-Z]', sample):
    print(n)

# for n in re.findall(r'[\u4e00-\u9fff]+', sample):
#     print(n)

I am from
We should be friends
softmax


In [39]:
for n in re.findall(r':[a-z]*:`[a-z_]*`', ex_text):
    print(n)

:numref:`sec_linear_regression`


In [77]:
ex_html = "分布（参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html））模型中"
for n in re.findall(r'（http[\S]*html）', ex_html):
    print(n)

（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html）


In [82]:
ex_html = "分布（参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html））模型中"
mc = MarkdownCleaning()
ex_html_nex = mc.clean_and_recover(ex_html)
ex_html_nex

'分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中'

In [35]:
example = "在上面的例子中，我们可以用`x.reshape(-1,4)`或`x.reshape(3,-1)`来取代`x.reshape(3,4)`。"

for n in re.findall(r'`[^`]*`', example):
    print(n)

`x.reshape(-1,4)`
`x.reshape(3,-1)`
`x.reshape(3,4)`


In [36]:
out = mc.clean_and_recover(example)
out

'在上面的例子中，我们可以用`x.reshape(-1,4)`或`x.reshape(3,-1)`来取代`x.reshape(3,4)`。'

In [45]:
example2 = "（**首先，我们导入`torch`"
out = mc.clean_and_recover(example2)
out

'(**首先，我们导入`torch`'

In [43]:
example2 = example2.replace("（**", "(**")
example2

'(**首先，我们导入`torch`'

In [3]:
class Translator():
    def _translate_markdown(self, text):
            cells = markdown.split_markdown(text)
            for cell in cells:
                if cell['type'] == 'markdown':
                    if 'class' in cell and cell['class']:
                        # it may have nested code blocks
                        cell['source'] = self._translate_markdown(cell['source'])
                    else:
                        text_cells = markdown.split_text(cell['source'])
                        for t_cell in text_cells:
#                             ipdb.set_trace()
                            if t_cell['source'] and (
                                t_cell['type'] in ['text', 'list']):
                                text = t_cell['source']
                                markdown_cleaning = MarkdownCleaning()
                                t_cell['source'] = markdown_cleaning.clean_and_recover(text)
#                                 if text.endswith('\n'):
#                                     t_cell['source'] += '\n'
                        cell['source'] = markdown.join_text(text_cells)
            return markdown.join_markdown_cells(cells)

    def translate_markdown(self, source_file: str, target_file: str):
        with open(source_file, 'r') as r:
            with open(target_file, 'w') as w:
                w.write(self._translate_markdown(r.read()))

In [7]:
chapter_name = "chapter_multilayer-perceptrons" # "chapter_linear-networks"
filename = "kaggle-house-price"

src = "/Users/rlhu/git_goldpiggy/parser/origin/{}/{}.md".format(chapter_name, filename)
tgt = "/Users/rlhu/git_goldpiggy/d2l-zh/{}/{}.md".format(chapter_name, filename)

translator = Translator()
translator.translate_markdown(src, tgt)

In [5]:
!pwd

/Users/rlhu/git_goldpiggy/parser


{'type': 'title', 'prefix': '# ', 'source': 'softmax回归', 'mark': ':label:`sec_softmax`\n'}

{'type': 'text', 'source': '在 :numref:`sec_linear_regression` 中我们介绍了线性回归。那么小批量特征为 $\mathbf{X} \in \mathbb{R}^{n \times d}$ ，权重为 $\mathbf{W} \in \mathbb{R}^{d \times q}$。\n'}