From 283bb5b8ca4e38b3c8b60befb8e602b5bb9eb704 Mon Sep 17 00:00:00 2001
From: eggplants <w10776e8w@yahoo.co.jp>
Date: Mon, 4 Apr 2022 12:55:58 +0900
Subject: [PATCH] `--lang` option (#55)

---
 .gitignore                      |  2 +
 README.md                       | 19 ++++++++--
 budoux/main.py                  | 65 +++++++++++++++++++++++++++++----
 javascript/README.md            | 15 ++++----
 tests/test_feature_extractor.py |  4 +-
 tests/test_main.py              | 40 +++++++++++++++++++-
 tests/test_parser.py            | 11 ++++++
 7 files changed, 135 insertions(+), 21 deletions(-)

diff --git a/.gitignore b/.gitignore
index 69e1534..f1cac98 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,8 @@ __pycache__
 *.pyc
 *.log
 *.egg-info
+*.coverage
+cov.xml
 
 # Python related files
 build/
diff --git a/README.md b/README.md
index 9708b2c..c5d7676 100644
--- a/README.md
+++ b/README.md
@@ -91,9 +91,17 @@ For more details of the JavaScript model, please refer to [JavaScript module REA
 You can also format inputs on your terminal with `budoux` command.
 
 ```shellsession
-$ budoux 本日は晴天です。
+$ budoux 本日は晴天です。 # default: Japanese
 本日は
 晴天です。
+
+$ budoux -l ja 本日は晴天です。
+本日は
+晴天です。
+
+$ budoux -l zh-hans 今天天气晴朗。
+今天天气
+晴朗。
 ```
 
 ```shellsession
@@ -114,7 +122,7 @@ If you want to see help, run `budoux -h`.
 
 ```shellsession
 $ budoux -h
-usage: budoux [-h] [-H] [-m JSON] [-d STR] [-t THRES] [-V] [TXT]
+usage: budoux [-h] [-H] [-m JSON | -l LANG] [-d STR] [-t THRES] [-V] [TXT]
 
 BudouX is the successor to Budou,
 the machine learning powered line break organizer tool.
@@ -122,13 +130,18 @@ the machine learning powered line break organizer tool.
 positional arguments:
   TXT                      text (default: None)
 
-optional arguments:
+options:
   -h, --help               show this help message and exit
   -H, --html               HTML mode (default: False)
   -m JSON, --model JSON    custom model file path (default: /path/to/models/ja-knbc.json)
+  -l LANG, --lang LANG     language of custom model (default: None)
   -d STR, --delim STR      output delimiter in TEXT mode (default: ---)
   -t THRES, --thres THRES  threshold value to separate chunks (default: 1000)
   -V, --version            show program's version number and exit
+
+supported languages of `-l`, `--lang`:
+- zh-hans
+- ja
 ```
 
 ## Caveat
diff --git a/budoux/main.py b/budoux/main.py
index 0321364..3718c70 100644
--- a/budoux/main.py
+++ b/budoux/main.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """BudouX Script to provide CLI for user."""
 import argparse
+import glob
 import json
 import os
 import shutil
@@ -34,21 +35,60 @@ class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
 
 
 def check_file(path: str) -> str:
-  """Check if filepath is exist or not.
+  """Check if a given filepath exists or not.
 
   Args:
       path (str): Model path
 
   Raises:
-      FileNotFoundError: Raise if given path is not exist.
+      argparse.ArgumentTypeError: Raise if given path does not exist.
 
   Returns:
-      str: Model path confirmed its existance.
+      str: A model path.
   """
   if os.path.isfile(path):
     return path
   else:
-    raise FileNotFoundError("'{}' is not found.".format(path))
+    raise argparse.ArgumentTypeError(f"'{path}' is not found.")
+
+
+def get_model_langs() -> typing.Dict[str, str]:
+  """Get a dictionary of model languages and their paths.
+
+  Returns:
+      typing.Dict[str, str]: A dictionary of model languages and their paths.
+  """
+  models = glob.glob(
+      pkg_resources.resource_filename(__name__, "models") + "/*-*.json")
+  langs = {}
+  for model in models:
+    model_name = model.split(os.sep)[-1][:-5]
+    if model_name.startswith('zh-'):
+      langs[model_name] = model
+    else:
+      langs[model_name[:2]] = model
+  return langs
+
+
+def check_lang(lang: str) -> str:
+  """Check if a model for the given language exists or not.
+
+  Args:
+      lang (str): language code (e.g.: 'ja')
+
+  Raises:
+      argparse.ArgumentTypeError: Raise if no model for given language exists.
+
+  Returns:
+      str: A model path.
+  """
+  langs = get_model_langs()
+  if lang in langs:
+    return langs[lang]
+  else:
+    raise argparse.ArgumentTypeError(
+        f"'{lang}' does not exist in builtin models. (supported languages: {list(langs.keys())})"
+    )
 
 
 def parse_args(test: ArgList = None) -> argparse.Namespace:
@@ -72,7 +112,9 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
       description=textwrap.dedent("""\
         BudouX is the successor to Budou,
         the machine learning powered line break organizer tool."""),
-  )
+      epilog="\n- ".join(
+          ["supported languages of `-l`, `--lang`:",
+           *get_model_langs().keys()]))
 
   parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
   parser.add_argument(
@@ -81,7 +123,8 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
       action="store_true",
       help="HTML mode",
   )
-  parser.add_argument(
+  model_select_group = parser.add_mutually_exclusive_group()
+  model_select_group.add_argument(
       "-m",
       "--model",
       metavar="JSON",
@@ -89,6 +132,13 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
       default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"),
       help="custom model file path",
   )
+  model_select_group.add_argument(
+      "-l",
+      "--lang",
+      metavar="LANG",
+      type=check_lang,
+      help="language of custom model",
+  )
   parser.add_argument(
       "-d",
       "--delim",
@@ -118,7 +168,8 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
 
 def _main(test: ArgList = None) -> str:
   args = parse_args(test=test)
-  with open(args.model, "r") as f:
+  model_path = args.lang or args.model
+  with open(model_path, "r") as f:
     model = json.load(f)
 
   parser = budoux.Parser(model)
diff --git a/javascript/README.md b/javascript/README.md
index 8e3d8d8..ac4c7a7 100644
--- a/javascript/README.md
+++ b/javascript/README.md
@@ -133,19 +133,20 @@ If you want to see help, run `budoux -h`.
 
 ```shellsession
 $ budoux -h
-Usage: budoux [-h] [-H] [-m JSON] [-d STR] [-V] [TXT]
+Usage: budoux [-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT]
 
 BudouX is the successor to Budou, the machine learning powered line break organizer tool.
 
 Arguments:
-  txt                 text
+  txt                   text
 
 Options:
-  -H, --html          HTML mode
-  -d, --delim <str>   output delimiter in TEXT mode (default: "---")
-  -m, --model <json>  custom model file path
-  -V, --version       output the version number
-  -h, --help          display help for command
+  -H, --html            HTML mode (default: false)
+  -d, --delim <str>     output delimiter in TEXT mode (default: "---")
+  -t, --thres <number>  threshold value to separate chunks (default: "1000")
+  -m, --model <json>    custom model file path
+  -V, --version         output the version number
+  -h, --help            display help for command
 ```
 
 ### Attributes
diff --git a/tests/test_feature_extractor.py b/tests/test_feature_extractor.py
index 991c8f2..0d50296 100644
--- a/tests/test_feature_extractor.py
+++ b/tests/test_feature_extractor.py
@@ -26,10 +26,10 @@
 
 from budoux import feature_extractor, utils  # noqa (module hack)
 
-if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7):
+if isinstance(sys.stdin, io.TextIOWrapper):
   sys.stdin.reconfigure(encoding='utf-8')
 
-if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7):
+if isinstance(sys.stdout, io.TextIOWrapper):
   sys.stdout.reconfigure(encoding='utf-8')
 
 SOURCE_FILE_PATH = os.path.abspath(
diff --git a/tests/test_main.py b/tests/test_main.py
index cd2881d..7b461e0 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -24,10 +24,10 @@
 
 from budoux import main  # noqa (module hack)
 
-if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7):
+if isinstance(sys.stdin, io.TextIOWrapper):
   sys.stdin.reconfigure(encoding='utf-8')
 
-if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7):
+if isinstance(sys.stdout, io.TextIOWrapper):
   sys.stdout.reconfigure(encoding='utf-8')
 
 
@@ -55,6 +55,42 @@ def test_cmdargs_version(self) -> None:
     self.assertEqual(cm.exception.code, 0)
 
 
+class TestModelOption(unittest.TestCase):
+
+  def test_cmdargs_invalid_json(self) -> None:
+    cmdargs = ['-m', '404.json']
+    with self.assertRaises(SystemExit) as cm:
+      main.parse_args(cmdargs)
+
+    self.assertEqual(cm.exception.code, 2)
+
+  def test_cmdargs_invalid_lang_1(self) -> None:
+    cmdargs = ['-l', 'aa']
+    with self.assertRaises(SystemExit) as cm:
+      main.parse_args(cmdargs)
+
+    self.assertEqual(cm.exception.code, 2)
+
+  def test_cmdargs_invalid_lang_2(self) -> None:
+    cmdargs = ['-l', 'ja-knbc']
+    with self.assertRaises(SystemExit) as cm:
+      main.parse_args(cmdargs)
+
+    self.assertEqual(cm.exception.code, 2)
+
+  def test_cmdargs_lang_ja(self) -> None:
+    cmdargs = ['-l', 'ja', '今日はいい天気ですね。']
+    output = main._main(cmdargs)
+
+    self.assertEqual(output, '今日は\nいい\n天気ですね。')
+
+  def test_cmdargs_lang_zh_hans(self) -> None:
+    cmdargs = ['-l', 'zh-hans', '今天天气晴朗。']
+    output = main._main(cmdargs)
+
+    self.assertEqual(output, '今天天气\n晴朗。')
+
+
 class TestTextArguments(unittest.TestCase):
 
   def test_cmdargs_single_text(self) -> None:
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 3d2fcd1..e7932d4 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -143,5 +143,16 @@ def test_translate_html_string(self) -> None:
         'Should work with emojis.')
 
 
+class TestDefaultParser(unittest.TestCase):
+
+  def test_load_default_japanese_parser(self) -> None:
+    p_ja = parser.load_default_japanese_parser()
+    self.assertTrue("UW4:私" in p_ja.model)
+
+  def test_load_default_simplified_chinese_parser(self) -> None:
+    p_ch = parser.load_default_simplified_chinese_parser()
+    self.assertTrue("UW4:力" in p_ch.model)
+
+
 if __name__ == '__main__':
   unittest.main()