Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* add defaultdict setting function & be able to put in dict paths as parameters to the config file
* add package coloredlogs to install_requires list
* add an example for adding dict paths to the config file
Showing 3 changed files with 131 additions and 28 deletions.
jieba_tokenizer.py (@@ -1,60 +1,147 @@) — the file as it reads after the change:

```python
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.tokenizers import Tokenizer, Token
from typing import Any, Dict, List, Optional, Text

from rasa_nlu.components import Component
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData

import os
import glob


class JiebaTokenizer(Tokenizer, Component):

    name = "tokenizer_jieba"

    provides = ["tokens"]

    language_list = ["zh"]

    def __init__(self,
                 component_config=None,  # type: Dict[Text, Any]
                 tokenizer=None
                 ):
        # type: (...) -> None

        super(JiebaTokenizer, self).__init__(component_config)

        self.tokenizer = tokenizer

    @classmethod
    def create(cls, cfg):
        # type: (RasaNLUModelConfig) -> JiebaTokenizer

        import jieba as tokenizer

        component_conf = cfg.for_component(cls.name, cls.defaults)
        tokenizer = cls.init_jieba(tokenizer, component_conf)

        return JiebaTokenizer(component_conf, tokenizer)

    @classmethod
    def load(cls,
             model_dir=None,  # type: Optional[Text]
             model_metadata=None,  # type: Optional[Metadata]
             cached_component=None,  # type: Optional[Component]
             **kwargs  # type: **Any
             ):
        # type: (...) -> JiebaTokenizer

        import jieba as tokenizer

        component_meta = model_metadata.for_component(cls.name)
        tokenizer = cls.init_jieba(tokenizer, component_meta)

        return JiebaTokenizer(component_meta, tokenizer)

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["jieba"]

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None

        message.set("tokens", self.tokenize(message.text))

    def tokenize(self, text):
        # type: (Text) -> List[Token]

        tokenized = self.tokenizer.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]

        return tokens

    @classmethod
    def init_jieba(cls, tokenizer, dict_config):
        # Switch jieba's default dictionary and/or load user dictionaries,
        # depending on what the component config provides.

        if dict_config.get("default_dict"):
            if os.path.isfile(dict_config.get("default_dict")):
                path_default_dict = glob.glob(
                        "{}".format(dict_config.get("default_dict")))
                tokenizer = cls.set_default_dict(tokenizer,
                                                 path_default_dict[0])
            else:
                print("The path of the Jieba default dictionary has to be "
                      "a file, not a directory, so the default dictionary "
                      "hasn't been switched.")
        else:
            print("No Jieba default dictionary found")

        if dict_config.get("user_dicts"):
            # "user_dicts" may point at a single file or at a directory
            # containing any number of dictionary files.
            if os.path.isdir(dict_config.get("user_dicts")):
                parse_pattern = "{}/*"
            else:
                parse_pattern = "{}"

            path_user_dicts = glob.glob(
                    parse_pattern.format(dict_config.get("user_dicts")))
            tokenizer = cls.set_user_dicts(tokenizer, path_user_dicts)
        else:
            print("No Jieba user dictionary found")

        return tokenizer

    @staticmethod
    def set_default_dict(tokenizer, path_default_dict):
        print("Setting Jieba default dictionary at " + str(path_default_dict))
        tokenizer.set_dictionary(path_default_dict)

        return tokenizer

    @staticmethod
    def set_user_dicts(tokenizer, path_user_dicts):
        if len(path_user_dicts) > 0:
            for path_user_dict in path_user_dicts:
                print("Loading Jieba user dictionary at "
                      + str(path_user_dict))
                tokenizer.load_userdict(path_user_dict)
        else:
            print("No Jieba user dictionary found")

        return tokenizer

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]

        # The dictionary paths are all that needs to be persisted; load()
        # re-initialises jieba from them.
        return {
            "user_dicts": self.component_config.get("user_dicts"),
            "default_dict": self.component_config.get("default_dict")
        }
```
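For orientation, a minimal standalone sketch of the jieba calls the component builds on; the dictionary paths are placeholders, and the user-dictionary format is documented at https://github.com/fxsjy/jieba#load-dictionary.

```python
# Sketch of the jieba APIs used by JiebaTokenizer; paths are placeholders.
import jieba

# Swap the built-in dictionary for another one (must be a single file,
# which is what init_jieba checks with os.path.isfile).
jieba.set_dictionary("./default_dict.big")

# Layer a custom dictionary on top; one entry per line: "word [freq] [tag]".
jieba.load_userdict("./jieba_userdict/jieba_userdict.txt")

# jieba.tokenize yields (word, start, end) tuples; JiebaTokenizer.tokenize
# keeps the word and start offset to build rasa_nlu Token objects.
for word, start, end in jieba.tokenize(u"我来到北京清华大学"):
    print(word, start, end)
```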
sample_configs/config_jieba_mitie_sklearn_plus_dict_path.yml (15 additions, 0 deletions):

```yaml
language: "zh"

pipeline:
- name: "nlp_mitie"
  model: "data/total_word_feature_extractor_zh.dat"
- name: "tokenizer_jieba"
  default_dict: "./default_dict.big"
  user_dicts: "./jieba_userdict"
  # "user_dicts" accepts either a file path or a directory path, e.g.
  # user_dicts: "./jieba_userdict/jieba_userdict.txt"
- name: "ner_mitie"
- name: "ner_synonyms"
- name: "intent_entity_featurizer_regex"
- name: "intent_featurizer_mitie"
- name: "intent_classifier_sklearn"
```
setup.py (1 addition, 0 deletions) — coloredlogs joins the install_requires list:

```diff
@@ -39,6 +39,7 @@
     "numpy>=1.13",
     "simplejson",
     "pyyaml",
+    "coloredlogs"
 ]

 extras_requires = {
```
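coloredlogs itself is not used in the hunks shown here, but for reference it is typically enabled with a single call; a minimal sketch, assuming default settings:

```python
# Sketch: colorized console logging via the newly added dependency.
import logging

import coloredlogs

coloredlogs.install(level="DEBUG")
logging.getLogger(__name__).info("colored log output is now active")
```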