# Resources
- Tutorial used to create dictionary: https://jakemccrary.com/blog/2020/11/11/creating-a-custom-kindle-dictionary/
- Kanji Koohii: https://kanji.koohii.com/
- Anki "Heisig's RTK 6th Edition- Stories, Stroke diagrams, Readings" Deck used to mine data
- AnkiPandas, used to mine the Anki deck

In [1]:
from ankipandas import Collection

# Open the Anki collection. No path is passed, so ankipandas searches for the
# database itself (see the INFO log below); passing a search path or the
# database path directly would skip that search.
col = Collection()

INFO: Searching for database. This might take some time. You can speed this up by specifying a search path or directly entering the path to your database.


# 1. Get Heisig Kanji Dataset

In [3]:
# Keep only the notes whose note-model name contains 'Heisig', with each note
# field expanded into its own column.
rtk = col.notes.loc[col.notes.nmodel.str.contains('Heisig')].fields_as_columns()

# Discard the note-metadata columns; only the field columns are needed below.
# (presumably these are ankipandas' per-note bookkeeping columns - confirm)
rtk = rtk[[name for name in rtk.columns
           if name not in ('nguid', 'nmod', 'nusn', 'ntags', 'nmodel')]]

# Rename every column to the text after its last underscore.
rtk = rtk.rename(columns={name: name.rsplit('_', 1)[-1] for name in rtk.columns})

# 2. Create content.html (Main Dictionary)

In [5]:
# Dictionary metadata used by the later cells.
TITLE = 'Kanji Koohi (RTK) Dictionary'  # passed to get_cover() and get_opf()
AUTHOR = 'Stephen R. Thompson'  # passed to get_cover() and get_opf()
LANGUAGE = 'ja'  # passed to get_opf() as its `language` argument

In [8]:
def add_word(r, include_heisig=False):
    """Render one kanji row as an ``<idx:entry>`` block for content.html.

    Args:
        r: Mapping-like row (a dict or a pandas Series) with the string
            fields 'kanji', 'keyword', 'constituent', 'koohiiStory1' and
            'koohiiStory2'. 'heisigStory' and 'heisigComment' are read only
            when *include_heisig* is true.
        include_heisig: When true, append the Heisig story and comment
            sections to the entry. Defaults to False, which matches the
            previous hard-coded behaviour of leaving them out.

    Returns:
        The entry markup followed by an ``<hr/>`` separator. Field text is
        inserted verbatim (it is not HTML-escaped), so any markup stored in
        the fields is passed through to the output.
    """
    def format_line(title, value):
        """Return a titled ``<h5>``/``<dd>`` pair, or '' when *value* is blank."""
        value = value.strip()
        if not value:
            return ''
        return f'\n<h5>{title}</h5>\n<dd>{value}</dd>\n'

    kanji = r['kanji']
    keyword = r['keyword'].strip()
    constituents = r['constituent'].strip()
    koohi1 = format_line('Koohi Story 1', r['koohiiStory1'])
    koohi2 = format_line('Koohi Story 2', r['koohiiStory2'])

    # Heisig's own text is opt-in; the columns are only touched when requested.
    heisig = ''
    comment = ''
    if include_heisig:
        heisig = format_line('Heisig Story', r['heisigStory'])
        comment = format_line('Heisig Comment', r['heisigComment'])

    return \
    f"""<idx:entry name="default" scriptable="yes" spell="yes">
      <h5><dt><idx:orth>{kanji}</idx:orth></dt></h5>
      <dd>{keyword} [{constituents}]</dd>{koohi1}{koohi2}{heisig}{comment}
    </idx:entry>
    <hr/>"""

def add_html_wrapper(entry_list):
    """Wrap rendered dictionary entries in the content.html page skeleton.

    Args:
        entry_list: Iterable of entry markup strings; they are joined with
            newlines and placed inside the ``<mbp:frameset>`` element.

    Returns:
        The full HTML document as a single string.
    """
    # Doubled braces in the CSS are literal braces once .format() has run.
    page_template = \
    """<html xmlns:math="http://exslt.org/math" xmlns:svg="http://www.w3.org/2000/svg"
          xmlns:tl="https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf"
          xmlns:saxon="http://saxon.sf.net/" xmlns:xs="http://www.w3.org/2001/XMLSchema"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xmlns:cx="https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf"
          xmlns:dc="http://purl.org/dc/elements/1.1/"
          xmlns:mbp="https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf"
          xmlns:mmc="https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf"
          xmlns:idx="https://kindlegen.s3.amazonaws.com/AmazonKindlePublishingGuidelines.pdf">
      <head>
        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
        <style>
          h5 {{
              font-size: 1em;
              margin: 0;
          }}
          dt {{
              font-weight: bold;
          }}
          dd {{
              margin: 0;
              padding: 0 0 0.5em 0;
              display: block
          }}
        </style>
      </head>
      <body>
        <mbp:frameset>
          {entry_str}
        </mbp:frameset>
      </body>
    </html>
    """
    joined_entries = '\n'.join(entry_list)
    return page_template.format(entry_str=joined_entries)

In [9]:
# One rendered entry string per row of the kanji table, in row order.
entry_list = [add_word(r) for _, r in rtk.iterrows()]

In [10]:
# Assemble the full dictionary page from the rendered entries.
content = add_html_wrapper(entry_list)

# The context manager closes the file even if the write raises.
with open('content.html', 'w', encoding='UTF-8') as f:
    f.write(content)

# 3. Cover Page

In [11]:
def get_cover(dict_name, dict_auth):
    """Return the HTML for the dictionary's cover page.

    Args:
        dict_name: Dictionary title, shown in an ``<h1>``.
        dict_auth: Author name, shown in an ``<h3>``.

    Returns:
        The cover page markup as a string. Both values are HTML-escaped so
        that characters such as ``&`` or ``<`` cannot break the markup, and
        the page declares UTF-8 to match the encoding the file is written
        with.
    """
    import html

    # quote=False: the values land in element text, not attribute values.
    safe_name = html.escape(dict_name, quote=False)
    safe_auth = html.escape(dict_auth, quote=False)

    return \
    f"""<html>
      <head>
        <meta content="text/html; charset=utf-8" http-equiv="content-type">
      </head>
      <body>
        <h1>{safe_name}</h1>
        <h3>{safe_auth}</h3>
      </body>
    </html>
    """

# Render the cover page from the dictionary metadata.
cover = get_cover(TITLE, AUTHOR)

# The context manager closes the file even if the write raises.
with open('cover.html', 'w', encoding='UTF-8') as f:
    f.write(cover)

# 4. Copyright Page

In [12]:
# Licence and attribution text shown on the copyright page.
license_content = """
Licensed under CC BY-NC-SA 3.0 (Attribution-NonCommercial-ShareAlike 3.0 Unported).
All kanji mnemonics provided by the Kanji Koohi database created by Fabrice Denis.
"""

# Copyright page markup. The heading is a well-formed <h1> inside <body>, and
# the meta tag uses the real "content-type" http-equiv with the UTF-8 charset
# the file is written in.
copyright = \
f"""<html>
  <head>
    <meta content="text/html; charset=utf-8" http-equiv="content-type">
  </head>
  <body>
    <h1>COPYRIGHT</h1>
    {license_content}
  </body>
</html>
"""

# The context manager closes the file even if the write raises.
with open('copyright.html', 'w', encoding='UTF-8') as f:
    f.write(copyright)

# 5. Usage Page
Not required: the usage page is optional, so the cell below is left commented out.

In [13]:
# f = open('usage.html', 'w', encoding='UTF-8')
# f.write(usage)
# f.close()

# 6. Create an .opf XML file

In [14]:
def get_opf(title, author, language='en-us', lookup_idx='default', image='', usage=False,
            out_language=None):
    """Build the OPF package document that ties the dictionary files together.

    Args:
        title: Dictionary title for ``<dc:title>``.
        author: Author name for ``<dc:creator>``.
        language: Language code used for ``<dc:language>`` and
            ``<DictionaryInLanguage>``.
        lookup_idx: Value for ``<DefaultLookupIndex>``; add_word() names its
            entries "default".
        image: Optional cover image path. When empty, neither the cover
            ``<meta>`` nor its manifest item is written.
        usage: When true, usage.html is added to the manifest and the spine.
        out_language: Language code for ``<DictionaryOutLanguage>``. Defaults
            to *language*, which is what earlier versions always wrote.

    Returns:
        The OPF document as a string. Text values are XML-escaped.
    """
    import os
    from xml.sax.saxutils import escape, quoteattr

    if out_language is None:
        out_language = language

    cover_meta = ''
    cover_item = ''
    if image != '':
        # The registered media type for JPEG is image/jpeg (not image/jpg);
        # pick the type from the file extension and fall back to JPEG.
        media_types = {'.jpg': 'image/jpeg', '.jpeg': 'image/jpeg',
                       '.png': 'image/png', '.gif': 'image/gif'}
        extension = os.path.splitext(image)[1].lower()
        media_type = media_types.get(extension, 'image/jpeg')
        # The cover <meta> points at the manifest item below, so the two are
        # only ever emitted together.
        cover_meta = '\n        <meta name="cover" content="my-cover-image" />'
        cover_item = (
            '\n        <item id="my-cover-image"'
            f'\n              href={quoteattr(image)}'
            f'\n              media-type="{media_type}" />'
        )

    usage_str = ''
    usage_ref = ''
    if usage:
        usage_str = (
            '\n        <item id="usage"'
            '\n              href="usage.html"'
            '\n              media-type="application/xhtml+xml" />'
        )
        usage_ref = '\n        <itemref idref="usage" />'

    # Escape once so '&', '<' and '>' in metadata cannot break the XML.
    title = escape(title)
    author = escape(author)
    language = escape(language)
    out_language = escape(out_language)
    lookup_idx = escape(lookup_idx)

    return \
    f"""<?xml version="1.0"?>
    <package version="2.0" xmlns="http://www.idpf.org/2007/opf" unique-identifier="BookId">
      <metadata>
        <dc:title>{title}</dc:title>
        <dc:creator opf:role="aut">{author}</dc:creator>
        <dc:language>{language}</dc:language>{cover_meta}
        <x-metadata>
          <DictionaryInLanguage>{language}</DictionaryInLanguage>
          <DictionaryOutLanguage>{out_language}</DictionaryOutLanguage>
          <DefaultLookupIndex>{lookup_idx}</DefaultLookupIndex>
        </x-metadata>
      </metadata>
      <manifest>{cover_item}
        <item id="cover"
              href="cover.html"
              media-type="application/xhtml+xml" />{usage_str}
        <item id="copyright"
              href="copyright.html"
              media-type="application/xhtml+xml" />
        <item id="content"
              href="content.html"
              media-type="application/xhtml+xml" />
      </manifest>
      <spine>
        <itemref idref="cover" />{usage_ref}
        <itemref idref="copyright"/>
        <itemref idref="content"/>
      </spine>
      <guide>
        <reference type="index" title="IndexName" href="content.html"/>
      </guide>
    </package>
    """

In [15]:
# Build the package document for this dictionary.
opf = get_opf(TITLE, AUTHOR, language=LANGUAGE)

# The context manager closes the file even if the write raises.
with open('kanji-koohi-rtk_dict.opf', 'w', encoding='UTF-8') as f:
    f.write(opf)