Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 203 lines (143 sloc) 5.314 kb
#!/usr/bin/python3
import os
import sqlite3
from argparse import ArgumentParser
import utils
from myconfig import MyConfig
from dirwalk import walkIndex
# Project configuration object; its getters supply the tunables and file
# names used by the functions in this script.
config = MyConfig()
# Default pinyin total frequency: the scale factor applied when frequencies
# are normalized in mergePinyin() and markMergedWord().
default = config.getDefaultPinyinTotalFrequency()
# Minimum pinyin frequency: mergePinyin() drops normalized entries below it.
minimum = config.getMinimumPinyinFrequency()
# Change the current working directory to the word recognizer directory.
# NOTE: this is a module-level side effect and runs as soon as the file loads.
words_dir = config.getWordRecognizerDir()
os.chdir(words_dir)
# chdir done
# Maps word -> list of (pinyin, freq) tuples; filled by load_atomic_words().
atomic_words_dict = {}
# Maps word -> list of (prefix, postfix, freq) tuples; filled by
# load_merged_words().
merged_words_dict = {}
def load_atomic_words(filename):
    """Load words with their pinyin frequencies into ``atomic_words_dict``.

    Every non-empty line of *filename* must hold three whitespace-separated
    fields: the word, its pinyin, and an integer frequency. Each line is
    appended to the module-level ``atomic_words_dict`` as a
    ``(pinyin, freq)`` tuple under its word, so a word may carry several
    pinyin entries.

    Raises:
        ValueError: if a line does not have three fields or the frequency
            is not an integer.
    """
    # The context manager closes the file even when a malformed line raises.
    with open(filename, "r") as wordsfile:
        # Iterate lazily instead of reading the whole file with readlines().
        for oneline in wordsfile:
            oneline = oneline.rstrip(os.linesep)
            if len(oneline) == 0:
                continue
            (word, pinyin, freq) = oneline.split(None, 2)
            atomic_words_dict.setdefault(word, []).append((pinyin, int(freq)))
def load_merged_words(filename):
    """Load merged-word records into ``merged_words_dict``.

    Every non-empty line of *filename* must hold four whitespace-separated
    fields: the word, its prefix, its postfix, and an integer frequency.
    Each line is appended to the module-level ``merged_words_dict`` as a
    ``(prefix, postfix, freq)`` tuple under its word, so a word may carry
    several prefix/postfix splits.

    Raises:
        ValueError: if a line does not have four fields or the frequency
            is not an integer.
    """
    # The context manager closes the file even when a malformed line raises.
    with open(filename, "r") as wordsfile:
        # Iterate lazily instead of reading the whole file with readlines().
        for oneline in wordsfile:
            oneline = oneline.rstrip(os.linesep)
            if len(oneline) == 0:
                continue
            (word, prefix, postfix, freq) = oneline.split(None, 3)
            merged_words_dict.setdefault(word, []).append(
                (prefix, postfix, int(freq)))
def mergePinyin(pinyin_list):
    """Combine duplicate pinyins and rescale their frequencies.

    *pinyin_list* is an iterable of ``(pinyin, freq)`` pairs. Frequencies of
    equal pinyins are summed, then each sum is scaled by
    ``default / total`` (``total`` being the sum over all pinyins) and
    truncated to an int. Entries whose scaled frequency is below ``minimum``
    are left out. The input and the result are both printed.

    Returns a list of ``(pinyin, freq)`` tuples in first-seen order.
    """
    print(pinyin_list)

    # Sum the frequency per distinct pinyin; dict order keeps first-seen order.
    totals = {}
    for pinyin, freq in pinyin_list:
        totals[pinyin] = totals.get(pinyin, 0) + freq

    total_freq = sum(totals.values())

    results = []
    for pinyin, summed in totals.items():
        scaled = int(default * summed / total_freq)
        if scaled >= minimum:
            results.append((pinyin, scaled))

    print(results)
    return results
def markAtomicWord(word):
    """Return the merged pinyin list of a word stored in ``atomic_words_dict``.

    The word must already be a key of ``atomic_words_dict`` (asserted); its
    ``(pinyin, freq)`` entries are passed through mergePinyin().
    """
    assert word in atomic_words_dict
    return mergePinyin(atomic_words_dict[word])
def markMergedWord(word):
    """Derive the pinyin list of a merged word from its prefix/postfix splits.

    The word must already be a key of ``merged_words_dict`` (asserted). For
    every ``(prefix, postfix, freq)`` split, the pinyins of both parts are
    obtained through markPinyin() and every prefix/postfix pinyin pair is
    joined with an apostrophe. The pair's frequency is the product of the
    split frequency and both part frequencies, scaled by ``default`` and
    divided by the sum of all split frequencies and by the prefix and postfix
    frequency sums. The candidates are then passed through mergePinyin().
    """
    assert word in merged_words_dict
    splits = merged_words_dict[word]
    print(splits)
    merged_sum = sum(split_freq for _, _, split_freq in splits)

    candidates = []
    for prefix, postfix, split_freq in splits:
        # Resolve the prefix before the postfix: markPinyin() prints as it goes.
        prefix_list = markPinyin(prefix)
        prefix_sum = sum(part_freq for _, part_freq in prefix_list)
        postfix_list = markPinyin(postfix)
        postfix_sum = sum(part_freq for _, part_freq in postfix_list)

        candidates.extend(
            (
                head_pinyin + "'" + tail_pinyin,
                default * split_freq * head_freq * tail_freq
                / merged_sum / prefix_sum / postfix_sum,
            )
            for head_pinyin, head_freq in prefix_list
            for tail_pinyin, tail_freq in postfix_list
        )

    return mergePinyin(candidates)
def markPinyin(word):
    """Print *word* and return its pinyin list.

    ``atomic_words_dict`` is consulted first, then ``merged_words_dict``; the
    matching helper (markAtomicWord() or markMergedWord()) produces the
    result. A word found in neither dict trips an assertion.
    """
    print(word)
    if word in atomic_words_dict:
        return markAtomicWord(word)
    if word in merged_words_dict:
        return markMergedWord(word)
    assert False, "missed word.\n"
def markPinyins(workdir):
    """Mark every new word found in *workdir* with its pinyins.

    The partial word file of *workdir* is loaded into ``merged_words_dict``,
    then each non-empty line of the new word file is treated as one word and
    resolved through markPinyin(). One tab-separated ``word, pinyin, freq``
    line per resulting pinyin is written to the recognized word file.
    All three file names come from ``config`` and live directly under
    *workdir*.
    """
    # Reset the module-level dict in place. The former plain assignment
    # (``merged_words_dict = {}``) only bound a local name, so entries from
    # previously processed directories stayed in the global dict that
    # load_merged_words() appends to.
    merged_words_dict.clear()

    filename = config.getPartialWordFileName()
    load_merged_words(workdir + os.sep + filename)

    newwordpath = workdir + os.sep + config.getNewWordFileName()
    recordpath = workdir + os.sep + config.getRecognizedWordFileName()

    # Context managers close both files even if markPinyin() raises.
    with open(newwordpath, "r") as newwordfile, \
            open(recordpath, "w") as recordfile:
        for oneline in newwordfile:
            word = oneline.rstrip(os.linesep)
            if len(word) == 0:
                continue

            for pinyin, freq in markPinyin(word):
                # Text mode translates "\n" to the platform line ending;
                # writing os.linesep here would produce "\r\r\n" on Windows.
                recordfile.write('\t'.join((word, pinyin, str(freq))) + "\n")
def handleOneIndex(indexpath, subdir, indexname):
    """Run the pinyin marking step for one index and record it in its status.

    The status is loaded from *indexpath* plus the configured status postfix.
    utils.EpochError is raised when the 'NewWord' epoch check fails, and the
    function returns early when the 'MarkPinyin' epoch check succeeds.
    Otherwise markPinyins() runs on
    ``<word recognizer dir>/<subdir>/<indexname>`` and the 'MarkPinyin' epoch
    is signed and stored.
    """
    print(indexpath, subdir, indexname)

    status_path = indexpath + config.getStatusPostfix()
    status = utils.load_status(status_path)

    new_word_done = utils.check_epoch(status, 'NewWord')
    if not new_word_done:
        raise utils.EpochError('Please new word first.\n')

    if utils.check_epoch(status, 'MarkPinyin'):
        return

    workdir = os.sep.join((config.getWordRecognizerDir(), subdir, indexname))
    print(workdir)

    markPinyins(workdir)

    # sign epoch
    utils.sign_epoch(status, 'MarkPinyin')
    utils.store_status(status_path, status)
# Load the old words into atomic_words_dict. This call sits at module level,
# so it runs whenever the file is loaded, after the os.chdir() call above.
load_atomic_words(config.getWordsWithPinyinFileName())
#print(atomic_words_dict)
if __name__ == '__main__':
    # Script entry point: parse --indexdir (defaulting to the configured text
    # index directory) and hand handleOneIndex to walkIndex for that directory.
    arg_parser = ArgumentParser(description='Mark pinyins.')
    arg_parser.add_argument(
        '--indexdir',
        action='store',
        default=config.getTextIndexDir(),
        help='index directory',
    )
    options = arg_parser.parse_args()
    print(options)

    walkIndex(handleOneIndex, options.indexdir)
    print('done')
Jump to Line
Something went wrong with that request. Please try again.