From 283bb5b8ca4e38b3c8b60befb8e602b5bb9eb704 Mon Sep 17 00:00:00 2001 From: eggplants Date: Mon, 4 Apr 2022 12:55:58 +0900 Subject: [PATCH] `--lang` option (#55) --- .gitignore | 2 + README.md | 19 ++++++++-- budoux/main.py | 65 +++++++++++++++++++++++++++++---- javascript/README.md | 15 ++++---- tests/test_feature_extractor.py | 4 +- tests/test_main.py | 40 +++++++++++++++++++- tests/test_parser.py | 11 ++++++ 7 files changed, 135 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index 69e1534..f1cac98 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ __pycache__ *.pyc *.log *.egg-info +*.coverage +cov.xml # Python related files build/ diff --git a/README.md b/README.md index 9708b2c..c5d7676 100644 --- a/README.md +++ b/README.md @@ -91,9 +91,17 @@ For more details of the JavaScript model, please refer to [JavaScript module REA You can also format inputs on your terminal with `budoux` command. ```shellsession -$ budoux 本日は晴天です。 +$ budoux 本日は晴天です。 # default: japanese 本日は 晴天です。 + +$ budoux -l ja 本日は晴天です。 +本日は +晴天です。 + +$ budoux -l zh-hans 今天天气晴朗。 +今天天气 +晴朗。 ``` ```shellsession @@ -114,7 +122,7 @@ If you want to see help, run `budoux -h`. ```shellsession $ budoux -h -usage: budoux [-h] [-H] [-m JSON] [-d STR] [-t THRES] [-V] [TXT] +usage: budoux [-h] [-H] [-m JSON | -l LANG] [-d STR] [-t THRES] [-V] [TXT] BudouX is the successor to Budou, the machine learning powered line break organizer tool. @@ -122,13 +130,18 @@ the machine learning powered line break organizer tool. positional arguments: TXT text (default: None) -optional arguments: +options: -h, --help show this help message and exit -H, --html HTML mode (default: False) -m JSON, --model JSON custom model file path (default: /path/to/models/ja-knbc.json) + -l LANG, --lang LANG language of custom model (default: None) -d STR, --delim STR output delimiter in TEXT mode (default: ---) -t THRES, --thres THRES threshold value to separate chunks (default: 1000) -V, --version show program's version number and exit + +supported languages of `-l`, `--lang`: +- zh-hans +- ja ``` ## Caveat diff --git a/budoux/main.py b/budoux/main.py index 0321364..3718c70 100644 --- a/budoux/main.py +++ b/budoux/main.py @@ -14,6 +14,7 @@ # limitations under the License. """BudouX Script to provide CLI for user.""" import argparse +import glob import json import os import shutil @@ -34,21 +35,60 @@ class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, def check_file(path: str) -> str: - """Check if filepath is exist or not. + """Check if a given filepath exists or not. Args: path (str): Model path Raises: - FileNotFoundError: Raise if given path is not exist. + FileNotFoundError: Raise if given path does not exist. Returns: - str: Model path confirmed its existance. + str: A model path. """ if os.path.isfile(path): return path else: - raise FileNotFoundError("'{}' is not found.".format(path)) + raise argparse.ArgumentTypeError(f"'{path}' is not found.") + + +def get_model_langs() -> typing.Dict[str, str]: + """Get a dictionary of model languages and its paths. + + Returns: + typing.Dict[str, str]: A dictionary of model languages and its paths. + """ + models = glob.glob( + pkg_resources.resource_filename(__name__, "models") + "/*-*.json") + langs = {} + for model in models: + model_name = model.split(os.sep)[-1][:-5] + if model_name.startswith('zh-'): + langs[model_name] = model + else: + langs[model_name[:2]] = model + return langs + + +def check_lang(lang: str) -> str: + """Check if given language exists or not. + + Args: + lang (str): language code (e.g.: 'ja') + + Raises: + argparse.ArgumentTypeError: Raise if no model for given language exists. + + Returns: + str: A model path. + """ + langs = get_model_langs() + if lang in langs: + return langs[lang] + else: + raise argparse.ArgumentTypeError( + f"'{lang}' does not exist in builtin models. (supported languages: {list(langs.keys())})" + ) def parse_args(test: ArgList = None) -> argparse.Namespace: @@ -72,7 +112,9 @@ def parse_args(test: ArgList = None) -> argparse.Namespace: description=textwrap.dedent("""\ BudouX is the successor to Budou, the machine learning powered line break organizer tool."""), - ) + epilog="\n- ".join( + ["supported languages of `-l`, `--lang`:", + *get_model_langs().keys()])) parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text") parser.add_argument( @@ -81,7 +123,8 @@ def parse_args(test: ArgList = None) -> argparse.Namespace: action="store_true", help="HTML mode", ) - parser.add_argument( + model_select_group = parser.add_mutually_exclusive_group() + model_select_group.add_argument( "-m", "--model", metavar="JSON", @@ -89,6 +132,13 @@ def parse_args(test: ArgList = None) -> argparse.Namespace: default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"), help="custom model file path", ) + model_select_group.add_argument( + "-l", + "--lang", + metavar="LANG", + type=check_lang, + help="language of custom model", + ) parser.add_argument( "-d", "--delim", @@ -118,7 +168,8 @@ def parse_args(test: ArgList = None) -> argparse.Namespace: def _main(test: ArgList = None) -> str: args = parse_args(test=test) - with open(args.model, "r") as f: + model_path = args.lang or args.model + with open(model_path, "r") as f: model = json.load(f) parser = budoux.Parser(model) diff --git a/javascript/README.md b/javascript/README.md index 8e3d8d8..ac4c7a7 100644 --- a/javascript/README.md +++ b/javascript/README.md @@ -133,19 +133,20 @@ If you want to see help, run `budoux -h`. ```shellsession $ budoux -h -Usage: budoux [-h] [-H] [-m JSON] [-d STR] [-V] [TXT] +Usage: budoux [-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT] BudouX is the successor to Budou, the machine learning powered line break organizer tool. Arguments: - txt text + txt text Options: - -H, --html HTML mode - -d, --delim output delimiter in TEXT mode (default: "---") - -m, --model custom model file path - -V, --version output the version number - -h, --help display help for command + -H, --html HTML mode (default: false) + -d, --delim output delimiter in TEXT mode (default: "---") + -t, --thres threshold value to separate chunks (default: "1000") + -m, --model custom model file path + -V, --version output the version number + -h, --help display help for command ``` ### Attributes diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py index 991c8f2..0d50296 100644 --- a/tests/test_feature_extractor.py +++ b/tests/test_feature_extractor.py @@ -26,10 +26,10 @@ from budoux import feature_extractor, utils # noqa (module hack) -if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7): +if isinstance(sys.stdin, io.TextIOWrapper): sys.stdin.reconfigure(encoding='utf-8') -if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7): +if isinstance(sys.stdout, io.TextIOWrapper): sys.stdout.reconfigure(encoding='utf-8') SOURCE_FILE_PATH = os.path.abspath( diff --git a/tests/test_main.py b/tests/test_main.py index cd2881d..7b461e0 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -24,10 +24,10 @@ from budoux import main # noqa (module hack) -if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7): +if isinstance(sys.stdin, io.TextIOWrapper): sys.stdin.reconfigure(encoding='utf-8') -if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7): +if isinstance(sys.stdout, io.TextIOWrapper): sys.stdout.reconfigure(encoding='utf-8') @@ -55,6 +55,42 @@ def test_cmdargs_version(self) -> None: self.assertEqual(cm.exception.code, 0) +class TestModelOption(unittest.TestCase): + + def test_cmdargs_invalid_json(self) -> None: + cmdargs = ['-m', '404.json'] + with self.assertRaises(SystemExit) as cm: + main.parse_args(cmdargs) + + self.assertEqual(cm.exception.code, 2) + + def test_cmdargs_invalid_lang_1(self) -> None: + cmdargs = ['-l', 'aa'] + with self.assertRaises(SystemExit) as cm: + main.parse_args(cmdargs) + + self.assertEqual(cm.exception.code, 2) + + def test_cmdargs_invalid_lang_2(self) -> None: + cmdargs = ['-l', 'ja-knbc'] + with self.assertRaises(SystemExit) as cm: + main.parse_args(cmdargs) + + self.assertEqual(cm.exception.code, 2) + + def test_cmdargs_lang_ja(self) -> None: + cmdargs = ['-l', 'ja', '今日はいい天気ですね。'] + output = main._main(cmdargs) + + self.assertEqual(output, '今日は\nいい\n天気ですね。') + + def test_cmdargs_lang_zh_hans(self) -> None: + cmdargs = ['-l', 'zh-hans', '今天天气晴朗。'] + output = main._main(cmdargs) + + self.assertEqual(output, '今天天气\n晴朗。') + + class TestTextArguments(unittest.TestCase): def test_cmdargs_single_text(self) -> None: diff --git a/tests/test_parser.py b/tests/test_parser.py index 3d2fcd1..e7932d4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -143,5 +143,16 @@ def test_translate_html_string(self) -> None: 'Should work with emojis.') +class TestDefaultParser(unittest.TestCase): + + def test_load_default_japanese_parser(self) -> None: + p_ja = parser.load_default_japanese_parser() + self.assertTrue("UW4:私" in p_ja.model) + + def test_load_default_simplified_chinese_parser(self) -> None: + p_ch = parser.load_default_simplified_chinese_parser() + self.assertTrue("UW4:力" in p_ch.model) + + if __name__ == '__main__': unittest.main()