# Compares Two Output APKGS

In [68]:
# global vars ###################################
from opencc import OpenCC

# OpenCC converters used together by get_trad_word: a word is first passed
# through toSimp and the result is then passed through toTrad.
toTrad = OpenCC('s2tw')  # converter built from the OpenCC 's2tw' configuration
toSimp = OpenCC('t2s')  # converter built from the OpenCC 't2s' configuration
# Delimiter the row helpers split a note's `flds` string on to get its fields.
SEPARATOR = "\x1f"

# input vars #####################################

# folders (relative paths; nested folders are derived from their parent)
WORDLISTS = "wordlists"
APKGS = "apkgs"
EXPERIMENTS = "/".join((APKGS, "experiments"))
OUTPUTS = "outputs"

# files: the two .apkg packages being compared
output_apkg_file1 = OUTPUTS + "/old_final_output.apkg"
output_apkg_file2 = OUTPUTS + "/pleco_import_bkfill2_direct_sqlite.apkg"

# directories each package is extracted into
output_dir1 = EXPERIMENTS + "/output_dir1"
output_dir2 = EXPERIMENTS + "/output_dir2"

In [69]:
# set up the experiments folder (an .apkg is a zip archive containing a SQLite database; this just unzips it)

from zipfile import ZipFile
import shutil
import os

def load_apkg(apkg, output_dir):
    """Extract the zip archive ``apkg`` into a fresh ``output_dir``.

    Any existing ``output_dir`` is removed first so stale files from an
    earlier extraction cannot leak into the comparison.

    Args:
        apkg: Path to the .apkg (zip) file to extract.
        output_dir: Directory to extract into; replaced if it already exists.

    Raises:
        FileNotFoundError: If ``apkg`` does not exist.
        zipfile.BadZipFile: If ``apkg`` is not a valid zip archive.
    """
    # Open the archive *before* touching output_dir: if the package is missing
    # or corrupt we fail here and the previous extraction is left intact.
    with ZipFile(apkg) as zf:
        if os.path.isdir(output_dir):
            # delete the target experiment folder if it already exists
            shutil.rmtree(output_dir)
        # unzip apkg to target folder
        zf.extractall(output_dir)

# extract both packages, each into its own experiment directory
for apkg_path, target_dir in (
    (output_apkg_file1, output_dir1),
    (output_apkg_file2, output_dir2),
):
    load_apkg(apkg_path, target_dir)

In [70]:
# setup sqlite db connection
import sqlite3

def get_sqlite_con_cur(db_dir):
    """Open ``collection.anki2`` inside ``db_dir``.

    Returns:
        A ``(connection, cursor)`` pair for that SQLite database.
    """
    connection = sqlite3.connect(f"{db_dir}/collection.anki2")
    return connection, connection.cursor()

# one (connection, cursor) pair per extracted package
(con1, cur1), (con2, cur2) = (
    get_sqlite_con_cur(extracted_dir) for extracted_dir in (output_dir1, output_dir2)
)

In [71]:
# helper functions
def get_trad_word(simp_word):
    """Normalise a word by running it through ``toSimp`` and then ``toTrad``."""
    simplified = toSimp.convert(simp_word)
    return toTrad.convert(simplified)

def get_trad_word_from_row(row):
    """Return ``get_trad_word`` applied to the first field of an ``(id, flds)`` row."""
    _, raw_fields = row
    # only the first SEPARATOR-delimited field (the front) is needed
    front = raw_fields.split(SEPARATOR)[0]
    return get_trad_word(front)

def get_flds_from_row(row):
    """Return the raw ``flds`` string (second element) of an ``(id, flds)`` row."""
    (_note_id, raw_fields) = row
    return raw_fields

def get_items_rom_row(row, items):
    """Return the fields of an ``(id, flds)`` row at the positions in ``items``.

    ``flds`` is split on ``SEPARATOR``; the selected fields are returned as a
    tuple in the same order as ``items``.
    """
    _, raw_fields = row
    parts = raw_fields.split(SEPARATOR)
    return tuple(parts[index] for index in items)


def get_sqlite_wordset(cur):
    """Return the set of normalised words for every row of the ``notes`` table."""
    rows = cur.execute('SELECT id, flds FROM notes;').fetchall()
    return {get_trad_word_from_row(note_row) for note_row in rows}

def get_sqlite_word_and_field_set(cur):
    """Return the set of ``(normalised word, raw flds)`` pairs over all notes."""
    rows = cur.execute('SELECT id, flds FROM notes;').fetchall()
    return {
        (get_trad_word_from_row(note_row), get_flds_from_row(note_row))
        for note_row in rows
    }

def get_sqlite_front_and_field_set(cur):
    """Return the set of ``(first field, raw flds)`` pairs over all notes."""
    rows = cur.execute('SELECT id, flds FROM notes;').fetchall()
    pairs = set()
    for note_row in rows:
        front = get_items_rom_row(note_row, [0])[0]
        pairs.add((front, get_flds_from_row(note_row)))
    return pairs

def get_items(cur, indices):
    """Return the set of field tuples, at positions ``indices``, over all notes."""
    rows = cur.execute('SELECT id, flds FROM notes;').fetchall()
    return {get_items_rom_row(note_row, indices) for note_row in rows}


In [72]:
# compare wordsets
set1 = get_sqlite_wordset(cur1)
set2 = get_sqlite_wordset(cur2)

# size of each package's word set
for apkg_file, words in ((output_apkg_file1, set1), (output_apkg_file2, set2)):
    print(apkg_file, len(words))

# words present in only one of the two packages
print(set1 - set2)
print(set2 - set1)

outputs/old_final_output.apkg 10921
outputs/pleco_import_bkfill2_direct_sqlite.apkg 10921
set()
set()


In [78]:
# check how many "fronts" do not match the "traditional character"
# (field 0, field 2) pairs for every note in each package
front_trad1 = get_items(cur1, [0, 2])
front_trad2 = get_items(cur2, [0, 2])

# keep only the pairs whose two fields differ
not_matching1 = [pair for pair in front_trad1 if pair[0] != pair[1]]
not_matching2 = [pair for pair in front_trad2 if pair[0] != pair[1]]

for mismatches in (not_matching1, not_matching2):
    print(len(mismatches))

print(not_matching1)

192
194
[('張燈結彩', '張燈結綵'), ('針灸', '鍼灸'), ('拐杖', '柺杖'), ('別扭', '彆扭'), ('準許', '准許'), ('動蕩', '動盪'), ('包扎', '包紮'), ('万', '萬'), ('艷麗', '豔麗'), ('復合', '複合'), ('回復', '回覆'), ('雇主', '僱主'), ('復蘇', '復甦'), ('贊不絕口', '讚不絕口'), ('好家伙', '好傢伙'), ('幾率', '機率'), ('占據', '佔據'), ('項鏈', '項鍊'), ('開天辟地', '開天闢地'), ('呼吁', '呼籲'), ('獎杯', '獎盃'), ('蘇醒', '甦醒'), ('開采', '開採'), ('鑒於', '鑑於'), ('濕潤', '溼潤'), ('泄露', '洩露'), ('鍊', '煉'), ('週到', '周到'), ('病癥', '病症'), ('麻痹', '麻痺'), ('濕', '溼'), ('广', '廣'), ('癥狀', '症狀'), ('占用', '佔用'), ('借鑒', '借鑑'), ('發泄', '發洩'), ('悠閑', '悠閒'), ('布滿', '佈滿'), ('宣泄', '宣洩'), ('繫', '系'), ('幸存', '倖存'), ('乾', '幹'), ('曬', '晒'), ('霸占', '霸佔'), ('洶涌', '洶湧'), ('戒煙', '戒菸'), ('盡早', '儘早'), ('涌現', '湧現'), ('一目了然', '一目瞭然'), ('品嘗', '品嚐'), ('鮮艷', '鮮豔'), ('桿', '杆'), ('仿佛', '彷彿'), ('雇員', '僱員'), ('吸煙', '吸菸'), ('扣人心弦', '扣人心絃'), ('頒布', '頒佈'), ('奇跡', '奇蹟'), ('折疊', '摺疊'), ('別致', '別緻'), ('面嚮', '面向'), ('抽簽', '抽籤'), ('貪污', '貪汙'), ('迴首', '回首'), ('開辟', '開闢'), ('鍛煉', '鍛鍊'), ('泄密', '洩密'), ('紐扣', '鈕釦'), ('雇傭', '僱傭'), ('咨詢', '諮詢'), ('摩託'

In [79]:
# compare flds
set1 = get_sqlite_front_and_field_set(cur1)
set2 = get_sqlite_front_and_field_set(cur2)

for apkg_file, front_flds in ((output_apkg_file1, set1), (output_apkg_file2, set2)):
    print(apkg_file, len(front_flds))

# (front, flds) pairs present in only one package, then how many of each
only_in_first = set1 - set2
only_in_second = set2 - set1
print(only_in_first)
print(only_in_second)
print(len(only_in_first))
print(len(only_in_second))

outputs/old_final_output.apkg 10922
outputs/pleco_import_bkfill2_direct_sqlite.apkg 10922
{('凶殘', '凶殘\x1f<div align="left"><p><span style="font-size:32px";>&#20982;&#27544;&#12308;&#20982;&#27531;&#12309;</span><br>\n<span style="color:#B4B4B4;"><b><span style="font-size:0.80em;">PY </span></b></span><span style="color:#FF8080;"><span style="font-weight:600;">xi&#333;ng</span></span><span style="font-weight:600;"><span style="color:#80FF80;">c&#225;n</span></span></p>\n</div><div align="left"><p><b><span style="font-size:0.80em;"><span style="color:#B4B4B4;">ADJECTIVE</span></span></b><br>\nfierce (or savage) and cruel; brutal; ruthless<br>\n</p>\n<blockquote style="border-left: 2px solid #00aaff; margin-left: 3px; padding-left: 1em; margin-top: 0px; margin-bottom: 0px;"><p><b><span style="color:#00AAFF;">&#20982;&#27544;</span></b><span style="color:#00AAFF;">&#25104;&#24615;</span><br>\n<b>Xi&#333;ngc&#225;n</b><span style="font-weight:600;"> ch&#233;ng x&#236;ng</span><br>\nbe cruel

In [80]:
from collections import defaultdict

# (front, flds) pairs that occur in only one of the two packages.
diff1 = set2 - set1  # only in the second package
diff2 = set1 - set2  # only in the first package

# Group the differing flds strings by front, separately per package, so each
# entry is known to come from a specific side.
flds_only_in_1 = defaultdict(list)
for word, fld in diff2:
    flds_only_in_1[word].append(fld)

flds_only_in_2 = defaultdict(list)
for word, fld in diff1:
    flds_only_in_2[word].append(fld)

# comp: front -> [flds in package 1, flds in package 2], kept only when each
# package contributes exactly one differing note for that front.  Taking the
# pair from the differences (rather than looking the front up in the full
# sets) avoids picking an arbitrary note when a front occurs more than once
# and avoids a KeyError when a front exists in only one package.
# Sorted so the report order is reproducible between runs.
comp = defaultdict(list)
for word in sorted(flds_only_in_1.keys() & flds_only_in_2.keys()):
    olds, news = flds_only_in_1[word], flds_only_in_2[word]
    if len(olds) == 1 and len(news) == 1:
        comp[word] = [olds[0], news[0]]

# Per-package lookup of the differing flds for each paired front.
dict1 = {word: pair[0] for word, pair in comp.items()}
dict2 = {word: pair[1] for word, pair in comp.items()}

# new_comp: front -> [package-1 flds, package-2 flds], consumed by the report.
new_comp = defaultdict(list)
for w in comp:
    new_comp[w].append(dict1[w])
    new_comp[w].append(dict2[w])


In [81]:
import difflib

print(f"how to change from {output_apkg_file1} to {output_apkg_file2}\n\n")

# ndiff marker -> message template; entries with any other marker are skipped
edit_templates = {
    '-': 'Delete "{}" from position {}',
    '+': 'Add "{}" to position {}',
}

for word, (old_flds, new_flds) in new_comp.items():
    print(word)
    # character-level diff; `position` is the index within the ndiff output
    for position, entry in enumerate(difflib.ndiff(old_flds, new_flds)):
        template = edit_templates.get(entry[0])
        if template is not None:
            print(template.format(entry[-1], position))
    print()

# the differences here are in the "traditional" character field (separate from the "front" field)
# the "traditional" field takes the "front" character -> converts it to simplified -> then converts it back to traditional
# occasionally the round trip doesn't work perfectly, or some simplified characters are also valid traditional characters

# occasionally

how to change from outputs/old_final_output.apkg to outputs/pleco_import_bkfill2_direct_sqlite.apkg


凶殘
Delete "凶" from position 1213
Add "兇" to position 1214

凶狠
Delete "凶" from position 1381
Add "兇" to position 1382

回應
Delete "迴" from position 1176
Add "回" to position 1177

神祕
Delete "祕" from position 1850
Add "秘" to position 1851

祕書
Delete "祕" from position 1098
Add "秘" to position 1099

曬太陽
Delete "晒" from position 2057
Add "曬" to position 2058

奧祕
Delete "祕" from position 1388
Add "秘" to position 1389

紐扣
Delete "鈕" from position 1653
Delete "釦" from position 1654
Add "紐" to position 1655
Add "扣" to position 1656

凶手
Delete "凶" from position 615
Add "兇" to position 616

兇惡
Delete "凶" from position 564
Add "兇" to position 565

祕方
Delete "祕" from position 591
Add "秘" to position 592

幾率
Delete "機" from position 1581
Add "幾" to position 1582
Delete "机" from position 1585
Add "几" to position 1586

曬
Delete "晒" from position 5515
Add "曬" to position 5516

祕密
Delete "祕" from position