In [1]:
import argparse
import glob
import sys
import os
from d2lbook import config, markdown, utils, common
import logging
import re
import glob
import ipdb

In [93]:
class MarkdownCleaning(object):
    def __init__(self):
        self.math_mapping = []
        self.english_no_ref_no_math = []
        self.math_patterns = rf'\$.*?\$'
        self.ref_pattern = r':[a-z]*:`[a-z_]*`'
        self.html_pattern = r'http[\S]*html'
        self.html_wrong_pattern = r'（http[\S]*html）'
        
        
    def remove_space(self, text:str) -> str:
        text = text.replace(' ', '')
        text = text.replace("(", '（') # 半角到全角 half-width to full-width
        text = text.replace(")", '）') # 半角到全角 half-width to full-width
        return text
 
    def record_math(self, text:str) -> str:  
        """
        Example:
        input text = "$(\mathbf{X} \in \mathbb{R}^{n \times d}$ "
        self.math_mapping = (("$(\mathbf{X} \in \mathbb{R}^{n \times d}$ ", 
                              "$(\mathbf{X}\in\mathbb{R}^{n\timesd}$")) 
        """
        # find inline latex math match, record their original 
        # and "removed space" format in the mapping
         
        matched = set(re.findall(self.math_patterns, text))
        for m in matched:
            m_no_space = self.remove_space(m)
            self.math_mapping.append((m, m_no_space))    
        return matched
    
        
    def find_noref_nomath_english(self, text:str) -> str:
        """
        Example:
        input text = "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。 
                        mds那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，
                        权重为 $\(mathbf{W} \in \mathbb{R}^{d \times q}$。 \n "
        output = ['mdad da', 'mds ds', 'mds']
        """
        # find and replace inline references like
        # numref:`...`, :eqref:`...`, :cite:`...` etc
        text_ex_ref = re.sub(self.ref_pattern, '', text)
        
        # find and replace all inline math and references
        text_ex_ref_ex_math = re.sub(self.math_patterns, '', text_ex_ref)
        
        # find all english in text, exclude inline math and references
        list_of_english_no_ref_no_math = re.findall(r'[a-zA-Z][a-z A-Z]+[a-zA-Z]', text_ex_ref_ex_math)
        
        # find all english phases with space, record them in a list mapping
        if len(list_of_english_no_ref_no_math)>0:
            for eng in list_of_english_no_ref_no_math:
                if " " in eng:
                    eng_no_space = eng.replace(' ', '')
                    self.english_no_ref_no_math.append((eng, eng_no_space))
        return list_of_english_no_ref_no_math
        
    def recover_html_parentheses(self, text:str) -> str:
        """
        Example:
        input text: 参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html）
        output text: 参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)
        """
        wrong_html_matched = set(re.findall(self.html_wrong_pattern, text))
        for html in wrong_html_matched:
            if html[0]=="（" and html[-1]=="）":
                corrected_html = "(" + html[1:-1] + ")"
                text = re.sub(html, corrected_html, text)
        return text
                
        
    def clean_and_recover(self, text:str) -> str:
        """
        Example:
        input text: "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。
                     mds那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，
                     分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-
                     for-deep-learning/distributions.html)）模型中。 \n "
        
        output text: "在（mdad da） :numref:`sec_linear_regression`中我们介绍了线性回归mds ds。
                      mds那么小批量特征为$(\\mathbf{X} \\in \\mathbb{R}^{n \times d}$，
                      分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-
                      for-deep-learning/distributions.html)）模型中。\n'
        """
        _ = self.record_math(text)
        _ = self.find_noref_nomath_english(text)
        text = self.remove_space(text)
        # recover all the math with spaces
        for key, value in self.math_mapping:
            text = text.replace(value, key)
            
        # recover all the reference (like :numref:`...`) with spaces
        for key, value in self.english_no_ref_no_math:
#             # add a space in front of reference key (like "numref", "eqref", "cite"),
#             # or the html won't compile
#             new_key = " "+key
#             text = text.replace(value, new_key)
            text = text.replace(value, key) 
    
        # recover htmls wrong parentheses to be [xxx](htmls:...)
        text = self.recover_html_parentheses(text)
            
        # recover some ref (or the html won't compile)
        text = text.replace(":numref:", " :numref:")
        text = text.replace(":eqref:", " :eqref:")
        text = text.replace(":cite:", " :cite:")
        return text

In [94]:
mc = MarkdownCleaning()
ex_text = "在(mdad da) :numref:`sec_linear_regression` 中我们介绍了线性回归mds ds。 mds 那么小批量特征为 $(\mathbf{X} \in \mathbb{R}^{n \times d}$ ，分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中。 \n "
# ex_text_encode = mt.record_math(ex_text)
ex_text_nex = mc.clean_and_recover(ex_text)
ex_text_nex

'在（mdad da） :numref:`sec_linear_regression`中我们介绍了线性回归mds ds。mds那么小批量特征为$(\\mathbf{X} \\in \\mathbb{R}^{n \times d}$，分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中。\n'

In [68]:
out = mc.find_noref_nomath_english(ex_text)
# re.findall(r'[a-zA-Z][a-z A-Z]+', out)
out

['mdad da', 'mds ds', 'mds']

In [69]:
mc.english_no_ref_no_math

[('mdad da', 'mdadda'),
 ('mds ds', 'mdsds'),
 ('mdad da', 'mdadda'),
 ('mds ds', 'mdsds')]

In [91]:
sample = 'I am from 美国。We should be friends. 朋友softmax 。'
for n in re.findall(r'[a-zA-Z][a-z A-Z]*[a-zA-Z]', sample):
    print(n)

# for n in re.findall(r'[\u4e00-\u9fff]+', sample):
#     print(n)

I am from
We should be friends
softmax


In [39]:
for n in re.findall(r':[a-z]*:`[a-z_]*`', ex_text):
    print(n)

:numref:`sec_linear_regression`


In [77]:
ex_html = "分布（参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html））模型中"
for n in re.findall(r'（http[\S]*html）', ex_html):
    print(n)

（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html）


In [82]:
ex_html = "分布（参见[关于分布的在线附录]（https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html））模型中"
mc = MarkdownCleaning()
ex_html_nex = mc.clean_and_recover(ex_html)
ex_html_nex

'分布（参见[关于分布的在线附录](https://d2l.ai/chapter_appendix-mathematics-for-deep-learning/distributions.html)）模型中'

In [88]:
class Translator():
    def _translate_markdown(self, text):
            cells = markdown.split_markdown(text)
            for cell in cells:
                if cell['type'] == 'markdown':
                    if 'class' in cell and cell['class']:
                        # it may have nested code blocks
                        cell['source'] = self._translate_markdown(cell['source'])
                    else:
                        text_cells = markdown.split_text(cell['source'])
                        for t_cell in text_cells:
#                             ipdb.set_trace()
                            if t_cell['source'] and (
                                t_cell['type'] in ['text', 'list']):
                                text = t_cell['source']
                                markdown_cleaning = MarkdownCleaning()
                                t_cell['source'] = markdown_cleaning.clean_and_recover(text)
#                                 if text.endswith('\n'):
#                                     t_cell['source'] += '\n'
                        cell['source'] = markdown.join_text(text_cells)
            return markdown.join_markdown_cells(cells)

    def translate_markdown(self, source_file: str, target_file: str):
        with open(source_file, 'r') as r:
            with open(target_file, 'w') as w:
                w.write(self._translate_markdown(r.read()))

In [95]:
src = "softmax-regression.md"
tgt = "softmax-regression_new_new.md"

translator = Translator()
translator.translate_markdown(src, tgt)

## 全连接层的参数开销
:label:`subsec_parameterization-cost-fc-layers`
正如我们将在后续章节中看到的，在深度学习中，全连接层无处不在。
然而，顾名思义，全连接层是“完全”连接的，可能有很多可学习的参数。
具体来说，对于任何具有$d$个输入和$q$个输出的全连接层，参数开销为$\mathcal{O}(dq)$，在实践中可能高得令人望而却步。
幸运的是，将$d$个输入转换为$q$个输出的成本可以减少到$\mathcal{O}(\frac{dq}{n})$，其中超参数$n$可以由我们灵活指定，以在实际应用中平衡参数节约和模型有效性 :cite:`Zhang.Tay.Zhang.ea.2021` 。



{'type': 'title', 'prefix': '# ', 'source': 'softmax回归', 'mark': ':label:`sec_softmax`\n'}

{'type': 'text', 'source': '在 :numref:`sec_linear_regression` 中我们介绍了线性回归。那么小批量特征为 $\mathbf{X} \in \mathbb{R}^{n \times d}$ ，权重为 $\mathbf{W} \in \mathbb{R}^{d \times q}$。\n'}