Skip to content

Commit

Permalink
--lang option (#55)
Browse files Browse the repository at this point in the history
  • Loading branch information
eggplants committed Apr 4, 2022
1 parent b25680c commit 283bb5b
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 21 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -6,6 +6,8 @@ __pycache__
*.pyc
*.log
*.egg-info
*.coverage
cov.xml

# Python related files
build/
Expand Down
19 changes: 16 additions & 3 deletions README.md
Expand Up @@ -91,9 +91,17 @@ For more details of the JavaScript model, please refer to [JavaScript module REA
You can also format inputs on your terminal with the `budoux` command.

```shellsession
$ budoux 本日は晴天です。
$ budoux 本日は晴天です。 # default: japanese
本日は
晴天です。

$ budoux -l ja 本日は晴天です。
本日は
晴天です。

$ budoux -l zh-hans 今天天气晴朗。
今天天气
晴朗。
```

```shellsession
Expand All @@ -114,21 +122,26 @@ If you want to see help, run `budoux -h`.

```shellsession
$ budoux -h
usage: budoux [-h] [-H] [-m JSON] [-d STR] [-t THRES] [-V] [TXT]
usage: budoux [-h] [-H] [-m JSON | -l LANG] [-d STR] [-t THRES] [-V] [TXT]

BudouX is the successor to Budou,
the machine learning powered line break organizer tool.

positional arguments:
TXT text (default: None)

optional arguments:
options:
-h, --help show this help message and exit
-H, --html HTML mode (default: False)
-m JSON, --model JSON custom model file path (default: /path/to/models/ja-knbc.json)
-l LANG, --lang LANG language of custom model (default: None)
-d STR, --delim STR output delimiter in TEXT mode (default: ---)
-t THRES, --thres THRES threshold value to separate chunks (default: 1000)
-V, --version show program's version number and exit

supported languages of `-l`, `--lang`:
- zh-hans
- ja
```

## Caveat
Expand Down
65 changes: 58 additions & 7 deletions budoux/main.py
Expand Up @@ -14,6 +14,7 @@
# limitations under the License.
"""BudouX Script to provide CLI for user."""
import argparse
import glob
import json
import os
import shutil
Expand All @@ -34,21 +35,60 @@ class BudouxHelpFormatter(argparse.ArgumentDefaultsHelpFormatter,


def check_file(path: str) -> str:
  """Check if a given filepath exists or not.

  Used as an argparse ``type=`` callable for the ``-m``/``--model`` option,
  so a missing file is reported as an argument error rather than an
  uncaught exception.

  Args:
    path (str): Model path

  Raises:
    argparse.ArgumentTypeError: Raise if given path does not exist or is not
      a regular file.

  Returns:
    str: A model path.
  """
  if os.path.isfile(path):
    return path
  raise argparse.ArgumentTypeError(f"'{path}' is not found.")


def get_model_langs() -> typing.Dict[str, str]:
  """Get a dictionary of model languages and their paths.

  Scans the package's ``models`` directory for files matching ``*-*.json``.
  The dictionary key is derived from the file name without its extension:
  names starting with ``zh-`` are kept whole (e.g. ``zh-hans``), any other
  name is reduced to its first two characters (e.g. ``ja-knbc`` -> ``ja``).

  Returns:
    typing.Dict[str, str]: A dictionary of model languages and their paths.
  """
  pattern = os.path.join(
      pkg_resources.resource_filename(__name__, "models"), "*-*.json")
  langs: typing.Dict[str, str] = {}
  # glob returns matches in arbitrary order; sort so that the key order and
  # the model chosen when two files map to the same key do not depend on the
  # directory listing order.
  for model in sorted(glob.glob(pattern)):
    model_name = os.path.splitext(os.path.basename(model))[0]
    lang = model_name if model_name.startswith('zh-') else model_name[:2]
    langs[lang] = model
  return langs


def check_lang(lang: str) -> str:
  """Resolve a language code to the path of its builtin model.

  Args:
    lang (str): language code (e.g.: 'ja')

  Raises:
    argparse.ArgumentTypeError: Raise if the language is not among the keys
      returned by `get_model_langs`.

  Returns:
    str: A model path.
  """
  available = get_model_langs()
  # Reject unknown codes first; the message lists every supported key.
  if lang not in available:
    raise argparse.ArgumentTypeError(
        f"'{lang}' does not exist in builtin models. (supported languages: {list(available.keys())})"
    )
  return available[lang]


def parse_args(test: ArgList = None) -> argparse.Namespace:
Expand All @@ -72,7 +112,9 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
description=textwrap.dedent("""\
BudouX is the successor to Budou,
the machine learning powered line break organizer tool."""),
)
epilog="\n- ".join(
["supported languages of `-l`, `--lang`:",
*get_model_langs().keys()]))

parser.add_argument("text", metavar="TXT", nargs="?", type=str, help="text")
parser.add_argument(
Expand All @@ -81,14 +123,22 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:
action="store_true",
help="HTML mode",
)
parser.add_argument(
model_select_group = parser.add_mutually_exclusive_group()
model_select_group.add_argument(
"-m",
"--model",
metavar="JSON",
type=check_file,
default=pkg_resources.resource_filename(__name__, "models/ja-knbc.json"),
help="custom model file path",
)
model_select_group.add_argument(
"-l",
"--lang",
metavar="LANG",
type=check_lang,
help="language of custom model",
)
parser.add_argument(
"-d",
"--delim",
Expand Down Expand Up @@ -118,7 +168,8 @@ def parse_args(test: ArgList = None) -> argparse.Namespace:

def _main(test: ArgList = None) -> str:
args = parse_args(test=test)
with open(args.model, "r") as f:
model_path = args.lang or args.model
with open(model_path, "r") as f:
model = json.load(f)

parser = budoux.Parser(model)
Expand Down
15 changes: 8 additions & 7 deletions javascript/README.md
Expand Up @@ -133,19 +133,20 @@ If you want to see help, run `budoux -h`.

```shellsession
$ budoux -h
Usage: budoux [-h] [-H] [-m JSON] [-d STR] [-V] [TXT]
Usage: budoux [-h] [-H] [-d STR] [-t THRES] [-m JSON] [-V] [TXT]

BudouX is the successor to Budou, the machine learning powered line break organizer tool.

Arguments:
txt text
txt text

Options:
-H, --html HTML mode
-d, --delim <str> output delimiter in TEXT mode (default: "---")
-m, --model <json> custom model file path
-V, --version output the version number
-h, --help display help for command
-H, --html HTML mode (default: false)
-d, --delim <str> output delimiter in TEXT mode (default: "---")
-t, --thres <number> threshold value to separate chunks (default: "1000")
-m, --model <json> custom model file path
-V, --version output the version number
-h, --help display help for command
```

### Attributes
Expand Down
4 changes: 2 additions & 2 deletions tests/test_feature_extractor.py
Expand Up @@ -26,10 +26,10 @@

from budoux import feature_extractor, utils # noqa (module hack)

if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7):
if isinstance(sys.stdin, io.TextIOWrapper):
sys.stdin.reconfigure(encoding='utf-8')

if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7):
if isinstance(sys.stdout, io.TextIOWrapper):
sys.stdout.reconfigure(encoding='utf-8')

SOURCE_FILE_PATH = os.path.abspath(
Expand Down
40 changes: 38 additions & 2 deletions tests/test_main.py
Expand Up @@ -24,10 +24,10 @@

from budoux import main # noqa (module hack)

if isinstance(sys.stdin, io.TextIOWrapper) and sys.version_info >= (3, 7):
if isinstance(sys.stdin, io.TextIOWrapper):
sys.stdin.reconfigure(encoding='utf-8')

if isinstance(sys.stdout, io.TextIOWrapper) and sys.version_info >= (3, 7):
if isinstance(sys.stdout, io.TextIOWrapper):
sys.stdout.reconfigure(encoding='utf-8')


Expand Down Expand Up @@ -55,6 +55,42 @@ def test_cmdargs_version(self) -> None:
self.assertEqual(cm.exception.code, 0)


class TestModelOption(unittest.TestCase):
  """Tests for the `-m` (model file) and `-l` (language) CLI options."""

  def _assert_rejected(self, cmdargs) -> None:
    """Assert that parsing `cmdargs` raises SystemExit with code 2."""
    with self.assertRaises(SystemExit) as cm:
      main.parse_args(cmdargs)

    self.assertEqual(cm.exception.code, 2)

  def test_cmdargs_invalid_json(self) -> None:
    # A model path that does not exist must be refused.
    self._assert_rejected(['-m', '404.json'])

  def test_cmdargs_invalid_lang_1(self) -> None:
    # A language code with no model must be refused.
    self._assert_rejected(['-l', 'aa'])

  def test_cmdargs_invalid_lang_2(self) -> None:
    # A model file stem is not accepted as a language code.
    self._assert_rejected(['-l', 'ja-knbc'])

  def test_cmdargs_lang_ja(self) -> None:
    result = main._main(['-l', 'ja', '今日はいい天気ですね。'])

    self.assertEqual(result, '今日は\nいい\n天気ですね。')

  def test_cmdargs_lang_zh_hans(self) -> None:
    result = main._main(['-l', 'zh-hans', '今天天气晴朗。'])

    self.assertEqual(result, '今天天气\n晴朗。')


class TestTextArguments(unittest.TestCase):

def test_cmdargs_single_text(self) -> None:
Expand Down
11 changes: 11 additions & 0 deletions tests/test_parser.py
Expand Up @@ -143,5 +143,16 @@ def test_translate_html_string(self) -> None:
'Should work with emojis.')


class TestDefaultParser(unittest.TestCase):
  """Tests that the default parser loaders return a parser with a model."""

  def test_load_default_japanese_parser(self) -> None:
    p_ja = parser.load_default_japanese_parser()
    # assertIn reports the missing key on failure, unlike assertTrue(x in y).
    self.assertIn("UW4:私", p_ja.model)

  def test_load_default_simplified_chinese_parser(self) -> None:
    p_ch = parser.load_default_simplified_chinese_parser()
    self.assertIn("UW4:力", p_ch.model)


if __name__ == '__main__':
unittest.main()

0 comments on commit 283bb5b

Please sign in to comment.