Skip to content

Commit

Permalink
feat(tokenizer): add new tokenizer configurations for SimpleTokenizer…
Browse files Browse the repository at this point in the history
…, MecabTokenizer, NLTKTokenizer, add new tagger configurations for mecab and nltk
  • Loading branch information
entelecheia committed Jul 23, 2023
1 parent df773b9 commit 26496c1
Show file tree
Hide file tree
Showing 6 changed files with 86 additions and 0 deletions.
39 changes: 39 additions & 0 deletions src/corprep/conf/tokenizer/__init__.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Base tokenizer configuration. The sibling tokenizer configs (mecab, nltk,
# simple) list `__init__` in their own `defaults`, so values set here act as
# the shared baseline they override.
defaults:
  - /normalizer: __init__
  - /stopwords: __init__

_target_: corprep.tokenizer.SimpleTokenizer
lowercase: false
flatten: true
strip_pos: false
postag_delim: /
postag_length: null
include_whitespace_token: true
tokenize_each_word: false
# Double-quoted so YAML decodes the escape into a real newline character.
# Inside single quotes the value is the two characters backslash + "n".
sentence_separator: "\n"
userdic_path: null
# Must stay quoted: an unquoted ## would be read as the start of a comment.
wordpieces_prefix: "##"
# NOTE(review): `postags` is kept as an explicit null with the *_postags
# lists below as top-level siblings (mecab.yaml and nltk.yaml override
# `noun_postags` at top level) -- confirm this matches the intended layout.
postags: null
noun_postags:
  - NNG
  - NNP
  - XSN
  - SL
  - XR
  - NNB
  - NR
punct_postags:
  - SF
  - SP
  - SSO
  - SSC
  - SY
stop_postags:
  - SP
  - SF
  - SE
  - SSO
  - SSC
  - SC
  - SY
  - SH
22 changes: 22 additions & 0 deletions src/corprep/conf/tokenizer/mecab.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# MeCab tokenizer: starts from the base `__init__` tokenizer config and
# selects the `mecab` entry of the `tagger` group.
defaults:
  - __init__
  - tagger: mecab

_target_: corprep.tokenizer.MecabTokenizer

# POS tags treated as nouns.
# NOTE(review): presumably tags from the tagger's own tagset -- confirm.
noun_postags:
  - NNG
  - NNP
  - XSN
  - SL
  - XR
  - NNB
  - NR

# POS tags listed as stop tags.
stop_postags:
  - SP
  - SF
  - SE
  - SSO
  - SSC
  - SC
  - SY
  - SH
12 changes: 12 additions & 0 deletions src/corprep/conf/tokenizer/nltk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# NLTK tokenizer: starts from the base `__init__` tokenizer config and
# selects the `nltk` entry of the `tagger` group.
defaults:
  - __init__
  - tagger: nltk

_target_: corprep.tokenizer.NLTKTokenizer

# POS tags treated as nouns.
# NOTE(review): these look like Penn Treebank noun tags -- confirm.
noun_postags:
  - NN
  - NNP
  - NNS
  - NNPS

# POS tags listed as stop tags; the single entry is the literal tag ".".
stop_postags:
  - "."
4 changes: 4 additions & 0 deletions src/corprep/conf/tokenizer/simple.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Simple tokenizer: takes every setting from the base `__init__` tokenizer
# config and names the target class explicitly.
defaults:
  - __init__

_target_: corprep.tokenizer.SimpleTokenizer
3 changes: 3 additions & 0 deletions src/corprep/conf/tokenizer/tagger/mecab.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Tagger settings for the `mecab` option of the `tagger` group.
# Relative interpolation: the leading `..` resolves `userdic_path` one level
# above this tagger section (presumably the tokenizer config that includes
# it -- confirm).
userdic_path: '${..userdic_path}'
backend: ekonlpy
# Reads `verbose` from that same outer level; `oc.select` supplies `false`
# as the default when the key is not present there.
verbose: '${oc.select:..verbose, false}'
6 changes: 6 additions & 0 deletions src/corprep/conf/tokenizer/tagger/nltk.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Tagger settings for the `nltk` option of the `tagger` group.
# NOTE(review): `lemmatize` / `stem` presumably choose which of the two
# objects below is applied -- confirm against the tagger implementation.
lemmatize: false
stem: true
# Each `_target_` is nested under its own key. Written flat, the two
# `_target_` entries would be duplicate top-level keys (last one wins) and
# `lemmatizer` / `stemmer` would both be null.
lemmatizer:
  _target_: nltk.stem.WordNetLemmatizer
stemmer:
  _target_: nltk.stem.PorterStemmer

0 comments on commit 26496c1

Please sign in to comment.