Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
branch: master
239 lines (213 sloc) 7.854 kb
# -*- coding: utf-8 -*-
# Copyright: Damien Elmes <anki@ichi2.net>
# License: GNU GPL, version 3 or later; http://www.gnu.org/copyleft/gpl.html
#
# Automatic reading generation with kakasi and mecab.
# See http://ichi2.net/anki/wiki/JapaneseSupport
#
import sys, os, platform, re, subprocess
from anki.utils import stripHTML, isWin, isMac
from anki.hooks import addHook
srcFields = ['Expression']
dstFields = ['Reading']
kakasiArgs = ["-isjis", "-osjis", "-u", "-JH", "-KH"]
mecabArgs = ['--node-format=%m[%f[7]] ', '--eos-format=\n',
'--unk-format=%m[] ']
def escapeText(text):
# strip characters that trip up kakasi/mecab
text = text.replace("\n", " ")
text = text.replace(u'\uff5e', "~")
text = re.sub("<br( /)?>", "---newline---", text)
text = stripHTML(text)
text = text.replace("---newline---", "<br>")
return text
if sys.platform == "win32":
si = subprocess.STARTUPINFO()
try:
si.dwFlags |= subprocess.STARTF_USESHOWWINDOW
except:
si.dwFlags |= subprocess._subprocess.STARTF_USESHOWWINDOW
else:
si = None
# Mecab
##########################################################################
def mungeForPlatform(popen):
if isWin:
popen = [os.path.normpath(x) for x in popen]
popen[0] += ".exe"
elif not isMac:
popen[0] += ".lin"
return popen
class MecabController(object):
def __init__(self):
self.mecab = None
def setup(self):
base = "../../addons/japanese/support/"
self.mecabCmd = mungeForPlatform(
[base + "mecab"] + mecabArgs + [
'-d', base, '-r', base + "mecabrc"])
os.environ['DYLD_LIBRARY_PATH'] = base
os.environ['LD_LIBRARY_PATH'] = base
if not isWin:
os.chmod(self.mecabCmd[0], 0755)
def ensureOpen(self):
if not self.mecab:
self.setup()
try:
self.mecab = subprocess.Popen(
self.mecabCmd, bufsize=-1, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
startupinfo=si)
except OSError:
raise Exception("Please ensure your Linux system has 32 bit binary support.")
def reading(self, expr):
self.ensureOpen()
expr = escapeText(expr)
self.mecab.stdin.write(expr.encode("euc-jp", "ignore")+'\n')
self.mecab.stdin.flush()
expr = unicode(self.mecab.stdout.readline().rstrip('\r\n'), "euc-jp")
out = []
for node in expr.split(" "):
if not node:
break
(kanji, reading) = re.match("(.+)\[(.*)\]", node).groups()
# hiragana, punctuation, not japanese, or lacking a reading
if kanji == reading or not reading:
out.append(kanji)
continue
# katakana
if kanji == kakasi.reading(reading):
out.append(kanji)
continue
# convert to hiragana
reading = kakasi.reading(reading)
# ended up the same
if reading == kanji:
out.append(kanji)
continue
# don't add readings of numbers
if kanji in u"一二三四五六七八九十0123456789":
out.append(kanji)
continue
# strip matching characters and beginning and end of reading and kanji
# reading should always be at least as long as the kanji
placeL = 0
placeR = 0
for i in range(1,len(kanji)):
if kanji[-i] != reading[-i]:
break
placeR = i
for i in range(0,len(kanji)-1):
if kanji[i] != reading[i]:
break
placeL = i+1
if placeL == 0:
if placeR == 0:
out.append(" %s[%s]" % (kanji, reading))
else:
out.append(" %s[%s]%s" % (
kanji[:-placeR], reading[:-placeR], reading[-placeR:]))
else:
if placeR == 0:
out.append("%s %s[%s]" % (
reading[:placeL], kanji[placeL:], reading[placeL:]))
else:
out.append("%s %s[%s]%s" % (
reading[:placeL], kanji[placeL:-placeR],
reading[placeL:-placeR], reading[-placeR:]))
fin = u""
for c, s in enumerate(out):
if c < len(out) - 1 and re.match("^[A-Za-z0-9]+$", out[c+1]):
s += " "
fin += s
return fin.strip().replace("< br>", "<br>")
# Kakasi
##########################################################################
class KakasiController(object):
def __init__(self):
self.kakasi = None
def setup(self):
base = "../../addons/japanese/support/"
self.kakasiCmd = mungeForPlatform(
[base + "kakasi"] + kakasiArgs)
os.environ['ITAIJIDICT'] = base + "itaijidict"
os.environ['KANWADICT'] = base + "kanwadict"
if not isWin:
os.chmod(self.kakasiCmd[0], 0755)
def ensureOpen(self):
if not self.kakasi:
self.setup()
try:
self.kakasi = subprocess.Popen(
self.kakasiCmd, bufsize=-1, stdin=subprocess.PIPE,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
startupinfo=si)
except OSError:
raise Exception("Please install kakasi")
def reading(self, expr):
self.ensureOpen()
expr = escapeText(expr)
self.kakasi.stdin.write(expr.encode("sjis", "ignore")+'\n')
self.kakasi.stdin.flush()
res = unicode(self.kakasi.stdout.readline().rstrip('\r\n'), "sjis")
return res
# Focus lost hook
##########################################################################
def onFocusLost(flag, n, fidx):
global mecab
from aqt import mw
if not mecab:
return flag
src = None
dst = None
# japanese model?
if "japanese" not in n.model()['name'].lower():
return flag
# have src and dst fields?
for c, name in enumerate(mw.col.models.fieldNames(n.model())):
for f in srcFields:
if name == f:
src = f
srcIdx = c
for f in dstFields:
if name == f:
dst = f
if not src or not dst:
return flag
# dst field already filled?
if n[dst]:
return flag
# event coming from src field?
if fidx != srcIdx:
return flag
# grab source text
srcTxt = mw.col.media.strip(n[src])
if not srcTxt:
return flag
# update field
try:
n[dst] = mecab.reading(srcTxt)
except Exception, e:
mecab = None
raise
return True
# Init
##########################################################################
kakasi = KakasiController()
mecab = MecabController()
addHook('editFocusLost', onFocusLost)
# Tests
##########################################################################
if __name__ == "__main__":
expr = u"カリン、自分でまいた種は自分で刈り取れ"
print mecab.reading(expr).encode("utf-8")
expr = u"昨日、林檎を2個買った。"
print mecab.reading(expr).encode("utf-8")
expr = u"真莉、大好きだよん^^"
print mecab.reading(expr).encode("utf-8")
expr = u"彼2000万も使った。"
print mecab.reading(expr).encode("utf-8")
expr = u"彼二千三百六十円も使った。"
print mecab.reading(expr).encode("utf-8")
expr = u"千葉"
print mecab.reading(expr).encode("utf-8")
Jump to Line
Something went wrong with that request. Please try again.