forked from NaturalNode/natural
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
482 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,354 @@ | ||
/** | ||
* Normalize Japanese inputs and expose function to perform several conversions. | ||
* | ||
* Note: `alphabet` means all lower/upper case letter + space. | ||
* | ||
* \@todo Lazy initializations of conversionTables and replacers. | ||
* \@todo Would fixHalfwidthKana be useful? | ||
* | ||
* Descriptions of functions exposed: | ||
* replacers.fullwidthToHalfwidth.alphabet 「全角」英字を「半角」に変換 「全角」スペースを「半角」 | ||
* replacers.halfwidthToFullwidth.alphabet 「半角」英字を「全角」に変換 「半角」スペースを「全角」 | ||
* replacers.fullwidthToHalfwidth.numbers 「全角」数字を「半角」に変換 | ||
* replacers.halfwidthToFullwidth.numbers 「半角」数字を「全角」に変換 | ||
* replacers.fullwidthToHalfwidth.punctuation 「全角」記号を「半角」に変換 | ||
* replacers.halfwidthToFullwidth.punctuation 「半角」記号を「全角」に変換 | ||
* replacers.fullwidthToHalfwidth.katakana 「全角カタカナ」を「半角カタカナ」に変換 | ||
* replacers.halfwidthToFullwidth.katakana 「半角カタカナ」を「全角カタカナ」に変換 | ||
*/ | ||
|
||
var flip = require('../util/utils.js').flip; | ||
var merge = require('../util/utils.js').merge; | ||
var Replacer = require('../util/utils').Replacer; | ||
|
||
// From http://fernweh.jp/b/mb_convert_kana_js/
/**
 * Character conversion tables between fullwidth and halfwidth forms.
 *
 * `fullwidthToHalfwidth` maps each fullwidth character (key) to its halfwidth
 * counterpart (value), grouped by category. `halfwidthToFullwidth` starts out
 * empty.
 *
 * Every character that has a width variant is written as a \uXXXX escape on
 * purpose: a fullwidth letter and its ASCII twin (or a halfwidth kana and its
 * fullwidth twin) are folded together by Unicode compatibility normalization
 * and by many editors and copy/paste paths, which silently turns the table
 * into identity mappings. Escapes survive that.
 *
 * In the katakana table the voiced / semi-voiced entries are listed before
 * the plain ones, so their two-character halfwidth forms come first in key
 * order once the table is flipped.
 */
var conversionTables = {
  fullwidthToHalfwidth: {
    // FULLWIDTH LATIN SMALL LETTER A..Z (U+FF41..U+FF5A),
    // FULLWIDTH LATIN CAPITAL LETTER A..Z (U+FF21..U+FF3A),
    // and IDEOGRAPHIC SPACE (U+3000).
    alphabet: {
      '\uFF41': 'a',
      '\uFF42': 'b',
      '\uFF43': 'c',
      '\uFF44': 'd',
      '\uFF45': 'e',
      '\uFF46': 'f',
      '\uFF47': 'g',
      '\uFF48': 'h',
      '\uFF49': 'i',
      '\uFF4A': 'j',
      '\uFF4B': 'k',
      '\uFF4C': 'l',
      '\uFF4D': 'm',
      '\uFF4E': 'n',
      '\uFF4F': 'o',
      '\uFF50': 'p',
      '\uFF51': 'q',
      '\uFF52': 'r',
      '\uFF53': 's',
      '\uFF54': 't',
      '\uFF55': 'u',
      '\uFF56': 'v',
      '\uFF57': 'w',
      '\uFF58': 'x',
      '\uFF59': 'y',
      '\uFF5A': 'z',
      '\uFF21': 'A',
      '\uFF22': 'B',
      '\uFF23': 'C',
      '\uFF24': 'D',
      '\uFF25': 'E',
      '\uFF26': 'F',
      '\uFF27': 'G',
      '\uFF28': 'H',
      '\uFF29': 'I',
      '\uFF2A': 'J',
      '\uFF2B': 'K',
      '\uFF2C': 'L',
      '\uFF2D': 'M',
      '\uFF2E': 'N',
      '\uFF2F': 'O',
      '\uFF30': 'P',
      '\uFF31': 'Q',
      '\uFF32': 'R',
      '\uFF33': 'S',
      '\uFF34': 'T',
      '\uFF35': 'U',
      '\uFF36': 'V',
      '\uFF37': 'W',
      '\uFF38': 'X',
      '\uFF39': 'Y',
      '\uFF3A': 'Z',
      '\u3000': ' ' // Fullwidth (ideographic) space
    },

    // FULLWIDTH DIGIT ZERO..NINE (U+FF10..U+FF19).
    numbers: {
      '\uFF10': '0',
      '\uFF11': '1',
      '\uFF12': '2',
      '\uFF13': '3',
      '\uFF14': '4',
      '\uFF15': '5',
      '\uFF16': '6',
      '\uFF17': '7',
      '\uFF18': '8',
      '\uFF19': '9'
    },

    punctuation: {
      '\uFF3F': '_',
      '\uFF0D': '-',
      '\u30FB': '\uFF65', // katakana middle dot -> halfwidth middle dot
      '\uFF0C': ',',
      '\u3001': '\uFF64', // ideographic comma -> halfwidth ideographic comma
      '\uFF1B': ';',
      '\uFF1A': ':',
      '\uFF01': '!',
      '\uFF1F': '?',
      '\uFF0E': '.',
      '\u3002': '\uFF61', // ideographic full stop -> halfwidth full stop
      '\uFF08': '(',
      '\uFF09': ')',
      '\uFF3B': '[',
      '\uFF3D': ']',
      '\uFF5B': '{',
      '\uFF5D': '}',
      '\u300C': '\uFF62', // left corner bracket -> halfwidth form
      '\u300D': '\uFF63', // right corner bracket -> halfwidth form
      '\uFF20': '@',
      '\uFF0A': '*',
      '\uFF3C': '\\',
      '\uFF0F': '/',
      '\uFF06': '&',
      '\uFF03': '#',
      '\uFF05': '%',
      '\uFF40': '`',
      '\uFF3E': '^',
      '\uFF0B': '+',
      '\uFF1C': '<',
      '\uFF1D': '=',
      '\uFF1E': '>',
      '\uFF5C': '|',
      '\uFF5E': '~',
      '\u226A': '\u00AB', // much less-than -> left guillemet
      '\u226B': '\u00BB', // much greater-than -> right guillemet
      '\u2500': '-', // box drawings light horizontal
      '\uFF04': '$',
      '\uFF02': '"'
    },

    // Fullwidth katakana -> halfwidth katakana (U+FF66..U+FF9F).
    // U+FF9E / U+FF9F are the halfwidth voiced / semi-voiced sound marks.
    katakana: {
      '\u309B': '\uFF9E', // spacing voiced sound mark
      '\u309C': '\uFF9F', // spacing semi-voiced sound mark
      'ー': '\uFF70',

      'ヴ': '\uFF73\uFF9E',
      'ガ': '\uFF76\uFF9E',
      'ギ': '\uFF77\uFF9E',
      'グ': '\uFF78\uFF9E',
      'ゲ': '\uFF79\uFF9E',
      'ゴ': '\uFF7A\uFF9E',
      'ザ': '\uFF7B\uFF9E',
      'ジ': '\uFF7C\uFF9E',
      'ズ': '\uFF7D\uFF9E',
      'ゼ': '\uFF7E\uFF9E',
      'ゾ': '\uFF7F\uFF9E',
      'ダ': '\uFF80\uFF9E',
      'ヂ': '\uFF81\uFF9E',
      'ヅ': '\uFF82\uFF9E',
      'デ': '\uFF83\uFF9E',
      'ド': '\uFF84\uFF9E',
      'バ': '\uFF8A\uFF9E',
      'パ': '\uFF8A\uFF9F',
      'ビ': '\uFF8B\uFF9E',
      'ピ': '\uFF8B\uFF9F',
      'ブ': '\uFF8C\uFF9E',
      'プ': '\uFF8C\uFF9F',
      'ベ': '\uFF8D\uFF9E',
      'ペ': '\uFF8D\uFF9F',
      'ボ': '\uFF8E\uFF9E',
      'ポ': '\uFF8E\uFF9F',

      'ァ': '\uFF67',
      'ア': '\uFF71',
      'ィ': '\uFF68',
      'イ': '\uFF72',
      'ゥ': '\uFF69',
      'ウ': '\uFF73',
      'ェ': '\uFF6A',
      'エ': '\uFF74',
      'ォ': '\uFF6B',
      'オ': '\uFF75',
      'カ': '\uFF76',
      'キ': '\uFF77',
      'ク': '\uFF78',
      'ケ': '\uFF79',
      'コ': '\uFF7A',
      'サ': '\uFF7B',
      'シ': '\uFF7C',
      'ス': '\uFF7D',
      'セ': '\uFF7E',
      'ソ': '\uFF7F',
      'タ': '\uFF80',
      'チ': '\uFF81',
      'ッ': '\uFF6F',
      'ツ': '\uFF82',
      'テ': '\uFF83',
      'ト': '\uFF84',
      'ナ': '\uFF85',
      'ニ': '\uFF86',
      'ヌ': '\uFF87',
      'ネ': '\uFF88',
      'ノ': '\uFF89',
      'ハ': '\uFF8A',
      'ヒ': '\uFF8B',
      'フ': '\uFF8C',
      'ヘ': '\uFF8D',
      'ホ': '\uFF8E',
      'マ': '\uFF8F',
      'ミ': '\uFF90',
      'ム': '\uFF91',
      'メ': '\uFF92',
      'モ': '\uFF93',
      'ャ': '\uFF6C',
      'ヤ': '\uFF94',
      'ュ': '\uFF6D',
      'ユ': '\uFF95',
      'ョ': '\uFF6E',
      'ヨ': '\uFF96',
      'ラ': '\uFF97',
      'リ': '\uFF98',
      'ル': '\uFF99',
      'レ': '\uFF9A',
      'ロ': '\uFF9B',
      'ワ': '\uFF9C',
      'ヲ': '\uFF66',
      'ン': '\uFF9D'
    }
  },

  halfwidthToFullwidth: {}
};
|
||
/**
 * Replacement table that tidies up fullwidth kana.
 *
 * Most entries merge a kana followed by a separate spacing sound mark into
 * the single precomposed character. The marks are written as escapes because
 * they are easy to confuse with the combining marks U+3099 / U+309A:
 *   \u309B  spacing voiced sound mark
 *   \u309C  spacing semi-voiced sound mark
 *
 * The remaining entries rewrite a small tsu followed by a na-row kana
 * (na, ni, nu, ne, no) so that the small tsu becomes the kana n.
 */
var fixFullwidthKana = {
  // Iteration marks.
  'ゝ\u309B': 'ゞ',
  'ヽ\u309B': 'ヾ',

  // Hiragana.
  'う\u309B': 'ゔ',
  'か\u309B': 'が',
  'き\u309B': 'ぎ',
  'く\u309B': 'ぐ',
  'け\u309B': 'げ',
  'こ\u309B': 'ご',
  'さ\u309B': 'ざ',
  'し\u309B': 'じ',
  'す\u309B': 'ず',
  'せ\u309B': 'ぜ',
  'そ\u309B': 'ぞ',
  'た\u309B': 'だ',
  'ち\u309B': 'ぢ',
  'つ\u309B': 'づ',
  'て\u309B': 'で',
  'と\u309B': 'ど',
  'は\u309B': 'ば',
  'は\u309C': 'ぱ',
  'ひ\u309B': 'び',
  'ひ\u309C': 'ぴ',
  'ふ\u309B': 'ぶ',
  'ふ\u309C': 'ぷ',
  'へ\u309B': 'べ',
  'へ\u309C': 'ぺ',
  'ほ\u309B': 'ぼ',
  'ほ\u309C': 'ぽ',
  'っな': 'んな',
  'っに': 'んに',
  'っぬ': 'んぬ',
  'っね': 'んね',
  'っの': 'んの',

  // Katakana.
  'ウ\u309B': 'ヴ',
  'カ\u309B': 'ガ',
  'キ\u309B': 'ギ',
  'ク\u309B': 'グ',
  'ケ\u309B': 'ゲ',
  'コ\u309B': 'ゴ',
  'サ\u309B': 'ザ',
  'シ\u309B': 'ジ',
  'ス\u309B': 'ズ',
  'セ\u309B': 'ゼ',
  'ソ\u309B': 'ゾ',
  'タ\u309B': 'ダ',
  'チ\u309B': 'ヂ',
  'ツ\u309B': 'ヅ',
  'テ\u309B': 'デ',
  'ト\u309B': 'ド',
  'ハ\u309B': 'バ',
  'ハ\u309C': 'パ',
  'ヒ\u309B': 'ビ',
  'ヒ\u309C': 'ピ',
  'フ\u309B': 'ブ',
  'フ\u309C': 'プ',
  'ヘ\u309B': 'ベ',
  'ヘ\u309C': 'ペ',
  'ホ\u309B': 'ボ',
  'ホ\u309C': 'ポ',
  'ッナ': 'ンナ',
  'ッニ': 'ンニ',
  'ッヌ': 'ンヌ',
  'ッネ': 'ンネ',
  'ッノ': 'ンノ'
};
|
||
// Fill in the conversion tables with the flipped tables, so that each
// halfwidthToFullwidth category is derived from its fullwidthToHalfwidth
// counterpart instead of being maintained by hand.
// NOTE(review): in the punctuation table two different keys map to '-', so
// the flipped entry for '-' depends on how flip() resolves duplicate values --
// confirm it yields the intended fullwidth hyphen.
conversionTables.halfwidthToFullwidth.alphabet = flip(conversionTables.fullwidthToHalfwidth.alphabet);
conversionTables.halfwidthToFullwidth.numbers = flip(conversionTables.fullwidthToHalfwidth.numbers);
conversionTables.halfwidthToFullwidth.punctuation = flip(conversionTables.fullwidthToHalfwidth.punctuation);
conversionTables.halfwidthToFullwidth.katakana = flip(conversionTables.fullwidthToHalfwidth.katakana);
|
||
// Build the normalization table by merging four category tables:
// alphabet and numbers are taken in the fullwidth -> halfwidth direction,
// punctuation and katakana in the halfwidth -> fullwidth direction.
conversionTables.normalize = merge(
  conversionTables.fullwidthToHalfwidth.alphabet,
  conversionTables.fullwidthToHalfwidth.numbers,
  conversionTables.halfwidthToFullwidth.punctuation,
  conversionTables.halfwidthToFullwidth.katakana
);
|
||
/**
 * Replacers built from the conversion tables, one per direction and category,
 * plus the kana fix-up and the combined normalization replacer.
 *
 * Each value is the result of `new Replacer(table)`; this module invokes the
 * `normalize` and `fixFullwidthKana` entries directly as functions taking a
 * string and returning the converted string.
 */
var replacers = {
  fullwidthToHalfwidth: {
    alphabet: new Replacer(conversionTables.fullwidthToHalfwidth.alphabet),
    numbers: new Replacer(conversionTables.fullwidthToHalfwidth.numbers),
    punctuation: new Replacer(conversionTables.fullwidthToHalfwidth.punctuation),
    katakana: new Replacer(conversionTables.fullwidthToHalfwidth.katakana)
  },

  halfwidthToFullwidth: {
    alphabet: new Replacer(conversionTables.halfwidthToFullwidth.alphabet),
    numbers: new Replacer(conversionTables.halfwidthToFullwidth.numbers),
    punctuation: new Replacer(conversionTables.halfwidthToFullwidth.punctuation),
    katakana: new Replacer(conversionTables.halfwidthToFullwidth.katakana)
  },

  fixFullwidthKana: new Replacer(fixFullwidthKana),
  normalize: new Replacer(conversionTables.normalize)
};
|
||
|
||
/**
 * Normalize a Japanese string in two passes.
 *
 * 1. Width normalization (`replacers.normalize`):
 *    * Alphabet to halfwidth
 *    * Numbers to halfwidth
 *    * Punctuation to fullwidth
 *    * Katakana to fullwidth
 * 2. Kana fix-up (`replacers.fixFullwidthKana`), applied to the output of the
 *    first pass so that it also sees the kana produced by the width
 *    conversion.
 *
 * @param {string} str The text to normalize.
 * @return {string} The normalized text.
 */
var normalizeJapanese = function(str) {
  str = replacers.normalize(str);
  str = replacers.fixFullwidthKana(str);

  return str;
};
|
||
// Public API: the one-shot normalizer and the individual replacers.
exports.normalizeJapanese = normalizeJapanese;
exports.replacers = replacers;
Oops, something went wrong.