
update (#47)
* add a default-dictionary setting function and allow dictionary paths to be passed as parameters in the config file (see the pipeline snippet below)

* add the coloredlogs package to the install_requires list

* add an example of adding dictionary paths to the config file
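The new dictionary options go on the tokenizer's pipeline entry; the two keys introduced here, taken from the full sample config further down in this diff:

    - name: "tokenizer_jieba"
      default_dict: "./default_dict.big"
      user_dicts: "./jieba_userdict"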
DoubleAix authored and crownpku committed May 1, 2018
1 parent ccb8fec commit be68dc8
Showing 3 changed files with 131 additions and 28 deletions.
143 changes: 115 additions & 28 deletions rasa_nlu/tokenizers/jieba_tokenizer.py
@@ -1,60 +1,147 @@
from __future__ import unicode_literals
from __future__ import print_function
from __future__ import division
from __future__ import absolute_import

import glob
import logging
import os

from typing import Any
from typing import Dict
from typing import List
from typing import Text

from rasa_nlu.components import Component
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.tokenizers import Tokenizer, Token
from rasa_nlu.training_data import Message
from rasa_nlu.training_data import TrainingData

logger = logging.getLogger(__name__)

class JiebaTokenizer(Tokenizer, Component):

    name = "tokenizer_jieba"

    provides = ["tokens"]

    language_list = ["zh"]

    def __init__(self,
                 component_config=None,  # type: Dict[Text, Any]
                 tokenizer=None
                 ):
        # type: (...) -> None

        super(JiebaTokenizer, self).__init__(component_config)

        self.tokenizer = tokenizer

    @classmethod
    def create(cls, cfg):
        # type: (RasaNLUModelConfig) -> JiebaTokenizer

        # the jieba module itself serves as the tokenizer object
        import jieba as tokenizer

        component_conf = cfg.for_component(cls.name, cls.defaults)
        tokenizer = cls.init_jieba(tokenizer, component_conf)

        return JiebaTokenizer(component_conf, tokenizer)

    @classmethod
    def load(cls,
             model_dir=None,  # type: Optional[Text]
             model_metadata=None,  # type: Optional[Metadata]
             cached_component=None,  # type: Optional[Component]
             **kwargs  # type: **Any
             ):
        # type: (...) -> JiebaTokenizer

        import jieba as tokenizer

        component_meta = model_metadata.for_component(cls.name)
        tokenizer = cls.init_jieba(tokenizer, component_meta)

        return JiebaTokenizer(component_meta, tokenizer)

    @classmethod
    def required_packages(cls):
        # type: () -> List[Text]
        return ["jieba"]

    def train(self, training_data, config, **kwargs):
        # type: (TrainingData, RasaNLUModelConfig, **Any) -> None

        for example in training_data.training_examples:
            example.set("tokens", self.tokenize(example.text))

    def process(self, message, **kwargs):
        # type: (Message, **Any) -> None

        message.set("tokens", self.tokenize(message.text))

    def tokenize(self, text):
        # type: (Text) -> List[Token]

        tokenized = self.tokenizer.tokenize(text)
        tokens = [Token(word, start) for (word, start, end) in tokenized]
        return tokens

    @classmethod
    def init_jieba(cls, tokenizer, dict_config):
        # type: (Any, Dict[Text, Any]) -> Any
        """Apply the default and user dictionaries from the component
        configuration to the jieba tokenizer."""

        if dict_config.get("default_dict"):
            if os.path.isfile(dict_config.get("default_dict")):
                path_default_dict = glob.glob(
                    "{}".format(dict_config.get("default_dict")))
                tokenizer = cls.set_default_dict(tokenizer,
                                                 path_default_dict[0])
            else:
                logger.warning("The Jieba default dictionary path must be "
                               "a file, not a directory; the default "
                               "dictionary has not been switched.")
        else:
            logger.info("No Jieba default dictionary found")

        if dict_config.get("user_dicts"):
            # a directory is expanded to all files inside it;
            # a single file is used as-is
            if os.path.isdir(dict_config.get("user_dicts")):
                parse_pattern = "{}/*"
            else:
                parse_pattern = "{}"

            path_user_dicts = glob.glob(
                parse_pattern.format(dict_config.get("user_dicts")))
            tokenizer = cls.set_user_dicts(tokenizer, path_user_dicts)
        else:
            logger.info("No Jieba user dictionary found")

        return tokenizer

    @staticmethod
    def set_default_dict(tokenizer, path_default_dict):
        logger.info("Setting Jieba default dictionary at "
                    + str(path_default_dict))
        tokenizer.set_dictionary(path_default_dict)

        return tokenizer

    @staticmethod
    def set_user_dicts(tokenizer, path_user_dicts):
        if len(path_user_dicts) > 0:
            for path_user_dict in path_user_dicts:
                logger.info("Loading Jieba user dictionary at "
                            + str(path_user_dict))
                tokenizer.load_userdict(path_user_dict)
        else:
            logger.info("No Jieba user dictionary found")

        return tokenizer

    def persist(self, model_dir):
        # type: (Text) -> Dict[Text, Any]

        return {
            "user_dicts": self.component_config.get("user_dicts"),
            "default_dict": self.component_config.get("default_dict")
        }
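For reference, a minimal sketch (not part of this commit) of what these settings do at the jieba level, using the dictionary paths from the sample config below; set_dictionary, load_userdict, and tokenize are jieba's documented API:

    import jieba

    jieba.set_dictionary("./default_dict.big")  # switch jieba's built-in default dictionary
    jieba.load_userdict("./jieba_userdict/jieba_userdict.txt")  # merge one user dictionary

    # jieba.tokenize yields (word, start, end) tuples, as consumed in tokenize() above
    tokens = [(word, start) for (word, start, end) in jieba.tokenize(u"我爱北京天安门")]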
15 changes: 15 additions & 0 deletions sample_configs/config_jieba_mitie_sklearn_plus_dict_path.yml
@@ -0,0 +1,15 @@
language: "zh"

pipeline:
- name: "nlp_mitie"
  model: "data/total_word_feature_extractor_zh.dat"
- name: "tokenizer_jieba"
  default_dict: "./default_dict.big"
  user_dicts: "./jieba_userdict"
  # the "user_dicts" value may be either a directory path (as above)
  # or a single file path, e.g.:
  # user_dicts: "./jieba_userdict/jieba_userdict.txt"
- name: "ner_mitie"
- name: "ner_synonyms"
- name: "intent_entity_featurizer_regex"
- name: "intent_featurizer_mitie"
- name: "intent_classifier_sklearn"
1 change: 1 addition & 0 deletions setup.py
@@ -39,6 +39,7 @@
"numpy>=1.13",
"simplejson",
"pyyaml",
"coloredlogs"
]

extras_requires = {
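The new coloredlogs dependency is typically enabled with a single call at program startup; a minimal sketch (not part of this diff) of its documented entry point:

    import coloredlogs

    # install a colored stream handler on the root logger
    coloredlogs.install(level="INFO")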
