In [1]:
import codecs
import json
import re
import yaml
import os
from tqdm.notebook import tqdm
from PIL import Image
from base64 import b64decode
from io import BytesIO
import numpy as np
import pykakasi

In [2]:
class AutoIndexing:
    r"""Build LaTeX ``\textbf`` + ``\index`` markup for a bold markdown term.

    Terms that contain Japanese characters are emitted as
    ``\index{reading@term}``, where ``reading`` is the hiragana reading
    returned by pykakasi; all other terms are emitted as ``\index{term}``.
    """

    def __init__(self):
        # pykakasi converter used by ``converter`` to obtain hiragana readings.
        self.kks = pykakasi.kakasi()

    def japanese_check(self, s):
        """Return True when ``s`` contains hiragana, katakana or kanji.

        s: string
        """
        japanese_run = re.search(r'[ぁ-ん]+|[ァ-ヴー]+|[一-龠]+', s)
        return japanese_run is not None

    def converter(self, s):
        r"""Return ``\textbf{term}\index{...}`` for the bold term ``s``.

        Every ``**`` in ``s`` is removed first, so the raw regex match
        (including the surrounding asterisks) can be passed in directly.
        """
        term = s.replace(r"**", "")
        bold = r'\textbf{' + term + r'}'

        # Non-Japanese terms are indexed under their own spelling.
        if not self.japanese_check(term):
            return bold + r'\index{' + term + r'}'

        # Japanese terms: concatenate the hiragana reading of every chunk
        # pykakasi returns and use it as the part before '@'.
        reading = "".join(piece["hira"] for piece in self.kks.convert(term))
        return bold + r'\index{' + reading + r"@" + term + r'}'

In [3]:
# Module-level converter instance; markdown2latex looks up this global
# whenever it rewrites a **bold** span.
auto_indexing = AutoIndexing()

同じ記号が連続するパターン（例：`####`，`###`，`##`，`#`）は，長い方から順に処理する．
- ToDo: citationをどうにかする．

In [4]:
# Sample markdown containing headings, inline code, a code fence, bold spans
# (ASCII and Japanese), an italic span, a citation and an angle-bracket URL.
s = '# aa\n ## aaa\n `this` is ``` **sample string** **漢字です** for *extracting substring*. {cite:p}`Echeveste2020-sh` <a>'

In [5]:
# Quick check: a Japanese term gets its hiragana reading before '@' in \index.
auto_indexing.converter("漢字です")

'\\textbf{漢字です}\\index{かんじです@漢字です}'

In [6]:
def markdown2latex(s):
    r"""Convert a markdown string into LaTeX source.

    Rewrites, in this order: ``#``-style headings, ``**bold**`` spans (via
    the module-level ``auto_indexing`` converter), ``{note}`` fences,
    ``<...>`` URLs, ``{cite:p}`` citations, inline code, ``:=`` / ``=:``,
    full-width parentheses and ``$$`` delimiters.

    s: string
    Returns the converted string with ``\r\n`` line endings normalised
    to ``\n``.

    Note: a heading is only recognised when it is followed by a newline.
    """
    # Normalise line endings before any pattern that anchors on '\n'.
    # Otherwise '.' swallows the '\r' of a CRLF file and headings come out
    # as '\section{Title\r}'.
    s = s.replace("\r\n", "\n")

    # Headings: longest marker first, so '####' is not consumed by '#'.
    heading_rules = (
        (r'#### (.+?)\n', r'\\paragraph{\1}\n'),       # paragraph
        (r'### (.+?)\n', r'\\subsubsection{\1}\n'),    # subsubsection
        (r'## (.+?)\n', r'\\subsection{\1}\n'),        # subsection
        (r'# (.+?)\n', r'\\section{\1}\n'),            # section
    )
    for pattern, replacement in heading_rules:
        s = re.sub(pattern, replacement, s)

    # Bold: the whole match (asterisks included) is handed to the converter,
    # which strips the '**' itself and appends the \index entry.
    s = re.sub(r'\*\*(.+?)\*\*', lambda m: auto_indexing.converter(m.group()), s)

    # The {note} fence must be replaced before the bare fence below.
    s = s.replace(r"```{note}", "\\footnote{")  # note to footnote
    s = s.replace(r"```", "}")
    s = re.sub(r'<(.+?)>', r'\\url{\1}', s)  # url

    # Citations before generic inline code: both are delimited by backticks.
    s = re.sub(r'{cite:p}`(.+?)`', r'\\citep{\1}', s)
    # inline code with \newcommand{\jl}{\lstinline[language=julia]}
    s = re.sub(r'`(.+?)`', r'\\jl{\1}', s)

    s = s.replace(r":=", r"\coloneqq ")
    s = s.replace(r"=:", r"\eqqcolon ")
    s = s.replace(r"（", " (")
    s = s.replace(r"）", ") ")
    s = s.replace(r"$$", "")
    return s

In [7]:
def _wrap_list_runs(lines, marker, environment):
    r"""Turn each run of consecutive ``marker`` lines into a LaTeX list.

    Every line starting with ``marker`` has that first marker replaced by
    ``\item ``; each maximal run of such lines is enclosed in
    ``\begin{environment}`` / ``\end{environment}`` lines. A run may start
    on the first line or end on the last line.
    """
    wrapped = []
    inside_run = False
    for line in lines:
        is_item = line.startswith(marker)
        if is_item and not inside_run:
            wrapped.append("\\begin{" + environment + "}")
        elif inside_run and not is_item:
            wrapped.append("\\end{" + environment + "}")
        wrapped.append(line.replace(marker, r'\item ', 1) if is_item else line)
        inside_run = is_item
    if inside_run:
        # The run reached the end of the text: close it explicitly.
        wrapped.append("\\end{" + environment + "}")
    return wrapped


def latex_itemized(text):
    r"""Convert markdown list lines into ``itemize`` / ``enumerate`` blocks.

    text: iterable of lines (strings); empty entries are dropped.
    Lines starting with ``- `` become ``\item`` lines inside ``itemize``;
    lines starting with ``1. `` become ``\item`` lines inside ``enumerate``.

    Returns a new list of lines, each terminated by ``\n``.
    """
    lines = [line for line in text if line]

    # Bullets first, then numbered lists; the inserted \begin / \end lines
    # start with neither marker, so the second pass leaves them untouched.
    lines = _wrap_list_runs(lines, "- ", "itemize")
    lines = _wrap_list_runs(lines, "1. ", "enumerate")

    return [line if line.endswith("\n") else line + "\n" for line in lines]

In [8]:
# Run the converter on the sample string to inspect the generated LaTeX.
markdown2latex(s)

'\\section{aa}\n \\subsection{aaa}\n \\jl{this} is } \\textbf{sample string}\\index{sample string} \\textbf{漢字です}\\index{かんじです@漢字です} for *extracting substring*. \\citep{Echeveste2020-sh} \\url{a}'

変換

In [8]:
def all_remove(xlist, remove):
    """Return a new list with every element equal to ``remove`` left out.

    The input ``xlist`` is not modified; element order is preserved.
    """
    kept = []
    for element in xlist:
        if element != remove:
            kept.append(element)
    return kept

In [14]:
# Convert one markdown chapter to LaTeX and write it into the tex directory.
save_dir = "../tex/"
filename = "01_introduction"

# Read inside a context manager so the input handle is closed after reading.
# codecs.open does not translate newlines, so the text reaches
# markdown2latex exactly as stored on disk.
with codecs.open(f"../markdowns/{filename}.md", 'r', encoding="utf8") as f:
    md = f.read()

# convert
text = markdown2latex(md)
text = text.split('\n')
text = latex_itemized(text)  # list of lines, each ending in "\n"

with open(f"{save_dir}{filename}.tex", 'w', encoding='UTF-8') as f:
    f.writelines(text)