# Define Input/Output/Global Variables

In [8]:
# global vars
from opencc import OpenCC

# Script converters shared by every cell below. The strings are OpenCC
# configuration names: per the OpenCC docs 's2tw' converts Simplified ->
# Traditional (Taiwan standard) and 't2s' converts Traditional -> Simplified.
toTrad = OpenCC('s2tw')
toSimp = OpenCC('t2s')

# input vars
# Text file read line by line when the wordlist dictionary is built.
wordlist_file = "hsk_wordlist_processing/final_formatted_wordlist.txt"

# Load wordlist

In [26]:
# define word entry class
from typing import List
import dataclasses

@dataclasses.dataclass
class WordEntry:
    """Bookkeeping for one word of the wordlist.

    The wordlist-loading cell keeps a single entry per converted word; when
    the same word appears again it appends to both lists, so the two lists
    stay index-aligned (one element per occurrence).
    """
    # 0-based position(s) of the word among the word lines of the wordlist file
    sort_numbers: List[int]
    # tag that was current at each occurrence (e.g. 'HSK_Level_1')
    tags: List[str]

In [27]:
# Build `wordlist`: converted (traditional) word -> WordEntry.
#
# The file is read line by line:
#   * a line containing "hsk_level" is a section marker; it becomes the current
#     tag (with "hsk_l" rewritten to "HSK_L") and is not counted as a word;
#   * every other non-blank line is a word. Its running index is recorded as a
#     sort number and the current tag is attached. A word seen again is counted
#     in `n_dup` and its extra position/tag are appended to the existing entry.
wordlist = {}
n_words_in_wordlist, n_dup = 0, 0
current_tag = ""
# `with` guarantees the file handle is closed (the bare open() in a for-loop leaked it).
with open(wordlist_file, encoding="utf-8") as wordlist_fh:
    for line in wordlist_fh:
        line = line.strip()
        if not line:
            # A blank line is not a word; without this guard it would be
            # registered as the empty string and shift later sort numbers.
            continue
        if "hsk_level" in line:
            current_tag = line.replace("hsk_l", "HSK_L")
            continue

        trad_char = toTrad.convert(line)
        entry = wordlist.get(trad_char)
        if entry is None:
            wordlist[trad_char] = WordEntry(sort_numbers=[n_words_in_wordlist], tags=[current_tag])
        else:
            n_dup += 1
            entry.sort_numbers.append(n_words_in_wordlist)
            entry.tags.append(current_tag)
        n_words_in_wordlist += 1

In [30]:
# Show the first ten (word, entry) pairs, then summary counts for the wordlist.
preview_rows = [str(item) for item in list(wordlist.items())[:10]]
print("\n".join(preview_rows), "\n...\n")

print(f"current wordlist is '{wordlist_file}'")
summary_rows = [
    ("number of words inside original wordlist:", n_words_in_wordlist),
    # dict keys are already unique, so the dict size is the unique-word count
    ("number of unique words:                  ", len(wordlist)),
    ("num_duplicates:                          ", n_dup),
]
for label, value in summary_rows:
    print(label, value)

('愛', WordEntry(sort_numbers=[0], tags=['HSK_Level_1']))
('愛好', WordEntry(sort_numbers=[1], tags=['HSK_Level_1']))
('八', WordEntry(sort_numbers=[2], tags=['HSK_Level_1']))
('爸爸', WordEntry(sort_numbers=[3], tags=['HSK_Level_1']))
('吧', WordEntry(sort_numbers=[4], tags=['HSK_Level_1']))
('白', WordEntry(sort_numbers=[5, 1275], tags=['HSK_Level_1', 'HSK_Level_3']))
('白天', WordEntry(sort_numbers=[6], tags=['HSK_Level_1']))
('百', WordEntry(sort_numbers=[7], tags=['HSK_Level_1']))
('班', WordEntry(sort_numbers=[8], tags=['HSK_Level_1']))
('半', WordEntry(sort_numbers=[9], tags=['HSK_Level_1'])) 
...

current wordlist is 'hsk_wordlist_processing/final_formatted_wordlist.txt'
number of words inside original wordlist: 11073
number of unique words:                   10921
num_duplicates:                           152


# Connect to SQLite DB

In [9]:
from zipfile import ZipFile
import shutil
import os

# Location of the Anki package and of the folder whose collection.anki2 is opened below.
# Built with os.path.join so the separator is right on every OS; the previous
# raw string r"apkgs\\..." actually contained two literal backslashes.
apkg_file = os.path.join("apkgs", "pleco_import_bkfill2.apkg")
experiement_folder = os.path.join("apkgs", "pleco_import_bkfill2_direct_sqlite_apkg")

In [10]:
# if os.path.isdir(experiement_folder):
#     shutil.rmtree(experiement_folder)
 

# with ZipFile(apkg_file) as zf:
#     zf.extractall(experiement_folder)

In [11]:
import sqlite3
import re
# Delimiter between the individual fields packed into the `flds` column of the
# `notes` table; used below both to split existing values and to join new ones.
SEPARATOR = "\x1f"

In [12]:
# Join the path portably instead of hard-coding a "\\" separator.
db_path = os.path.join(experiement_folder, "collection.anki2")
# sqlite3.connect() creates a new, empty database when the file does not exist,
# which would only surface later as "no such table: notes" -- fail early instead.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"Anki collection not found: {db_path}")
con = sqlite3.connect(db_path)

In [13]:
# Single cursor shared by every query, delete and update cell below.
cur = con.cursor()

In [14]:
def get_trad_word(simp_word):
    """Return the canonical traditional form of a word.

    The word is first passed through `toSimp` and the result through `toTrad`,
    so any input is normalised by the same two-step round trip.
    """
    as_simplified = toSimp.convert(simp_word)
    return toTrad.convert(as_simplified)

def get_trad_word_from_row(row):
    """Return the canonical traditional word for an ``(id, flds)`` row.

    Only the first field of `flds` (the text before the first SEPARATOR) is
    used; it is normalised with `get_trad_word`.
    """
    _id, flds = row  # unpacking also enforces the two-column row shape
    first_field = flds.split(SEPARATOR, 1)[0]
    return get_trad_word(first_field)

In [15]:
# List the tables present in the collection database.
table_rows = cur.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
print(table_rows)

[('col',), ('notes',), ('cards',), ('revlog',), ('sqlite_stat1',), ('sqlite_stat4',), ('graves',)]


# missing words

In [16]:
# Compare the words present as notes in the database with the words of the wordlist.
# `full_wordset` was never defined anywhere in the notebook (NameError on a fresh
# kernel); it is the set of keys of `wordlist`.
full_wordset = set(wordlist)
sqlite_wordset = {
    get_trad_word_from_row(row)
    for row in cur.execute('SELECT id, flds FROM notes;').fetchall()
}
misfits = sqlite_wordset - full_wordset   # notes whose word is not in the wordlist
missing = full_wordset - sqlite_wordset   # wordlist words that have no note
print(f"""
anki cards in sqlite: {len(sqlite_wordset)}
unique words in editted wordlist: {len(full_wordset)}
misfits (in anki but not wordlist): {len(misfits)}
missing (in wordlist, but not anki): {len(missing)}
""")


anki cards in sqlite: 10921
unique words in editted wordlist: 10921
misfits (in anki but not wordlist): 0
missing (in wordlist, but not anki): 0



In [17]:
# TO BE REMOVED
# Misfit words, shown in simplified form and sorted.
sorted(map(toSimp.convert, misfits))

['不难理会', '怀', '指著和尚骂秃子', '揹', '见过世面']

In [18]:
# Missing words, shown in simplified form and sorted.
sorted(toSimp.convert(word) for word in missing)

[]

In [21]:
# backfill_todo_wordlist_file = "backfill_todo_wordlist_file.txt"
# with open(backfill_todo_wordlist_file, "w", encoding="utf-8") as f:
#     f.write("\n".join(sorted(list(missing))))

# Delete Misfit Words (in anki but not wordlist)

In [19]:
def delete_words(cur, words):
    """Delete from `notes` the rows whose canonical traditional word is in `words`.

    All notes are read once to map word -> note id (if several notes share a
    word, the last one read wins), then one DELETE is issued per requested word.

    Raises:
        KeyError: if a requested word has no note in the table.
    """
    # NOTE(review): only `notes` rows are removed; the database also has a
    # `cards` table that is not touched here -- confirm whether rows there
    # need cleaning up as well.
    note_id_by_word = {}
    for note_row in cur.execute('SELECT id, flds FROM notes;').fetchall():
        note_word = get_trad_word_from_row(note_row)
        note_id_by_word[note_word] = note_row[0]

    for word in words:
        cur.execute("DELETE FROM notes WHERE id=?", (note_id_by_word[word],))

In [20]:
# Remove every note whose word is absent from the wordlist (takes effect on con.commit()).
delete_words(cur, misfits)

# Update New Fields for Cards

In [36]:
def extract_pinyin(back):
    """Pull the plain pinyin text out of a card's `back` HTML.

    The pinyin is looked for in the markup that follows the first ``<br>``:
    a span holding the ``PY `` label, then the syllable spans, closed by
    ``</span></p>``. Tags, the ``//`` syllable separator and the ``PY`` label
    are removed and surrounding whitespace is stripped. HTML character
    references (e.g. ``&#224;``) are left exactly as they appear.

    Args:
        back: HTML string of the card's back field.

    Returns:
        The pinyin text with the markup removed.

    Raises:
        ValueError: if `back` does not contain the expected pinyin markup
            (previously this surfaced as an AttributeError on ``None``).
    """
    found = re.match(r"""[\s\S]*?<br>([\s\S]+?>PY <[\s\S]+?<\/span>)<\/p>""", back)
    if found is None:
        raise ValueError(f"no pinyin markup found in back field: {back[:80]!r}")
    x = found.group(1)
    x = re.sub("<.*?>", '', x)   # drop every HTML tag
    x = re.sub("//", '', x)      # drop the syllable separator
    x = re.sub("PY", '', x)      # drop the label text
    return x.strip()
    

def process_flds(flds):
    """Build the new field string and tag string for one note.

    The first two fields of `flds` (front, back) are kept; six more are
    derived: traditional word, simplified word, definition HTML (the first
    ``<div align="left">`` block that follows a closing ``</div>`` in back),
    pinyin, learning order (the smallest sort number of the word) and the
    string form of the word's WordEntry.

    Returns:
        ``(new_flds, new_tags)``: the eight fields joined with SEPARATOR, and
        the entry's tags joined by spaces with one leading and trailing space.

    Raises:
        KeyError: if the word is not in `wordlist`.
        AttributeError: if the definition markup is not found in back.
    """
    fields = flds.split(SEPARATOR)
    front = fields[0]
    back = fields[1]

    traditional = get_trad_word(front)
    simplified = toSimp.convert(traditional)
    word_entry = wordlist[traditional]

    definition_match = re.match(r"""[\s\S]*?<\/div>(<div align="left">[\s\S]*?<\/div>)""", back)
    definition = definition_match.group(1)
    pinyin = extract_pinyin(back)
    learning_order = str(min(word_entry.sort_numbers))
    metadata = str(word_entry)

    new_tags = " " + " ".join(word_entry.tags) + " "
    new_fields = [front, back, traditional, simplified, definition, pinyin, learning_order, metadata]
    return SEPARATOR.join(new_fields), new_tags

def update(cur, _id, new_flds, new_tags):
    """Write `new_flds` and `new_tags` to the `notes` row whose id is `_id`."""
    statement = "UPDATE notes SET flds = ?, tags = ? WHERE id = ?"
    bound_values = (new_flds, new_tags, _id)
    cur.execute(statement, bound_values)

def process_row(cur, row):
    """Recompute the fields and tags of one ``(id, flds)`` row and store them.

    The rebuilt field string must contain as many SEPARATORs as the stored
    one; otherwise both counts are printed and an Exception is raised without
    writing anything.
    """
    _id, flds = row
    new_flds, new_tags = process_flds(flds)

    new_sep_count = new_flds.count(SEPARATOR)
    old_sep_count = flds.count(SEPARATOR)
    if new_sep_count == old_sep_count:
        update(cur, _id, new_flds, new_tags)
        return

    print(f"""
{new_sep_count}
{old_sep_count}
        """)
    raise Exception("uh oh")

In [37]:
def update_all(cur):
    """Run `process_row` on every row of the `notes` table."""
    # Fetch everything up front: process_row executes statements on this same
    # cursor, so the SELECT result cannot be iterated lazily.
    pending_rows = cur.execute('SELECT id, flds FROM notes;').fetchall()
    for note_row in pending_rows:
        process_row(cur, note_row)

In [38]:
# Rewrite fields and tags of every note; raises on the first row that cannot be processed.
update_all(cur)

# Commit to DB

In [42]:
# Persist the deletions and field/tag updates issued through `cur`.
con.commit()

In [43]:
# Release the database; `cur` and `con` cannot be used after this point.
con.close()

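# Scratch Paper (exploratory cells, run out of order; they need an open connection, so run them before `con.close()`)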

In [27]:
# Sample back-field HTML (cut off mid-tag at the end) used to try out the pinyin extraction below.
b = """<div align="left"><p><span style="font-size:32px";>&#25918;&#23416;&#12308;&#25918;&#23398;&#12309;</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#DF80FF;"><span style="font-weight:600;">f&#224;ng</span></span><span style="font-weight:600;">//</span><span style="font-weight:600;"><span style="color:#80FF80;">xu&#233;</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">VERB</span></span></b><br>\n<b>1&#9;</b>classes are over; school lets out<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">&#20182;&#20497;&#23416;&#26657;&#19979;&#21320;&#20116;&#40670;</span><span style="color:#00AAFF;"><b>&#25918;&#23416;</b></span><span style="color:#00AAFF;">&#12290;</sp"""

In [35]:
# These steps were a verbatim copy of extract_pinyin(); call the helper so the
# scratch check always exercises the code that update_all() really uses.
x = extract_pinyin(b)
print(x)

f&#224;ngxu&#233;


In [33]:
# scratch: print whatever intermediate value `x` currently holds, without surrounding whitespace
print(x.strip())

<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#DF80FF;"><span style="font-weight:600;">f&#224;ng</span></span><span style="font-weight:600;">//</span><span style="font-weight:600;"><span style="color:#80FF80;">xu&#233;</span></span>


'\nPY f&#224;ng//xu&#233;'

In [17]:
# scratch: look at the note at index 101 of the SELECT result
_note_rows = cur.execute('SELECT id, flds, tags FROM notes;').fetchall()
__id, _flds, _tags = _note_rows[101]
_flds, _tags

('飛\x1f<div align="left"><p><span style="font-size:32px" ;="">飛〔飞〕</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#FF8080;"><span style="font-weight:600;">fēi</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">VERB</span></span></b><br>\n<b>1\t</b>(of birds or insects) fly; flit<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><b><span style="color:#00AAFF;">飛</span></b><span style="color:#00AAFF;">鳥</span><br>\n<b>fēi</b><span style="font-weight:600;">niǎo</span><br>\nflying bird<br>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">老鷹</span><span style="color:#00AAFF;"><b>飛</b></span><span style="color:#00AAFF;">得高。</span><br>\n<span style="

In [15]:
# scratch: same inspection of the note at index 101 (repeated cell)
_sample_row = cur.execute('SELECT id, flds, tags FROM notes;').fetchall()[101]
__id, _flds, _tags = _sample_row
_flds, _tags

('飛\x1f<div align="left"><p><span style="font-size:32px" ;="">飛〔飞〕</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#FF8080;"><span style="font-weight:600;">fēi</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">VERB</span></span></b><br>\n<b>1\t</b>(of birds or insects) fly; flit<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><b><span style="color:#00AAFF;">飛</span></b><span style="color:#00AAFF;">鳥</span><br>\n<b>fēi</b><span style="font-weight:600;">niǎo</span><br>\nflying bird<br>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">老鷹</span><span style="color:#00AAFF;"><b>飛</b></span><span style="color:#00AAFF;">得高。</span><br>\n<span style="

In [22]:
# scratch: run the field builder on the row fetched above; the returned tuple is the cell output
process_flds(_flds)

('飛\x1f飞\x1f<div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">VERB</span></span></b><br>\n<b>1&#9;</b>(of birds or insects) fly; flit<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><b><span style="color:#00AAFF;">&#39131;</span></b><span style="color:#00AAFF;">&#40165;</span><br>\n<b>f&#275;i</b><span style="font-weight:600;">ni&#462;o</span><br>\nflying bird<br>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">&#32769;&#40441;</span><span style="color:#00AAFF;"><b>&#39131;</b></span><span style="color:#00AAFF;">&#24471;&#39640;&#12290;</span><br>\n<span style="font-weight:600;">L&#462;oy&#299;ng </span><b>f&#275;i</b><span style="font-weight:600;"> de g&#257;o.</span><br>\nEagles fly high.<br>\n</p>\n</blockquote>\n<blockquote sty

In [40]:
# scratch: look at the note at index 100 of the SELECT result
_note_rows = cur.execute('SELECT id, flds, tags FROM notes;').fetchall()
__id, _flds, _tags = _note_rows[100]
_flds, _tags

('放學\x1f<div align="left"><p><span style="font-size:32px";>&#25918;&#23416;&#12308;&#25918;&#23398;&#12309;</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#DF80FF;"><span style="font-weight:600;">f&#224;ng</span></span><span style="font-weight:600;">//</span><span style="font-weight:600;"><span style="color:#80FF80;">xu&#233;</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">VERB</span></span></b><br>\n<b>1&#9;</b>classes are over; school lets out<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">&#20182;&#20497;&#23416;&#26657;&#19979;&#21320;&#20116;&#40670;</span><span style="color:#00AAFF;"><b>&#25918;&#23416;</b></span><span style="color:#00AAFF;">&#12290;</span><br>\n<span style="font-weight:600;">T&#257;men xu&#233;xi&#224;o xi&#224;w&

In [41]:
# scratch: look at the note at index 5 of the SELECT result
_note_rows = cur.execute('SELECT id, flds, tags FROM notes;').fetchall()
__id, _flds, _tags = _note_rows[5]
_flds, _tags

('白\x1f<div align="left"><p><span style="font-size:32px" ;="">白</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#80FF80;"><span style="font-weight:600;">bái</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">ADJECTIVE</span></span></b><br>\n<b>1\t</b>white (opp. 黑)<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">幾根</span><span style="color:#00AAFF;"><b>白</b></span><span style="color:#00AAFF;">髮</span><br>\n<span style="font-weight:600;">Jǐ gēn </span><b>bái</b><span style="font-weight:600;">fà</span><br>\na few white (or grey) hairs<br>\n</p>\n</blockquote>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><span style="color:#00AAFF;">皮膚</span><span style="color:#

In [47]:
# scratch: close the connection once the inspection above is finished
con.close()