Skip to content

Commit

Permalink
feat(normalizer): add new configurations for text normalization
Browse files Browse the repository at this point in the history
  • Loading branch information
entelecheia committed Jul 23, 2023
1 parent 53908dd commit ac53a45
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/corprep/conf/normalizer/__init__.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Base normalizer configuration.
# The defaults list selects the `__init__` option of each sub-group
# (ftfy, spaces, special_characters).
defaults:
  - ftfy: __init__
  - spaces: __init__
  - special_characters: __init__

# Dotted path of the normalizer class this config describes
# (presumably resolved by Hydra's instantiate mechanism -- confirm).
_target_: corprep.tokenizer.normalizer.Normalizer
# Off in the base config; the formal_ko preset switches it on.
hanja2hangle: false
# NOTE(review): exact meaning is defined by Normalizer -- confirm there.
num_repeats: 2
5 changes: 5 additions & 0 deletions src/corprep/conf/normalizer/formal_en.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Normalizer preset for formal English text.
# Starts from the base normalizer config named in the defaults list.
defaults:
  - __init__

# Must be nested under `ftfy`: at column 0 `ftfy` would parse as null and
# `unescape_html` would become an unrelated top-level key.
ftfy:
  unescape_html: "auto"
5 changes: 5 additions & 0 deletions src/corprep/conf/normalizer/formal_en_parantheses.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Normalizer preset for formal English text that keeps only regular
# parentheses. Starts from the base normalizer config in the defaults list.
defaults:
  - __init__

# Must be nested under `special_characters`: at column 0 the group would
# parse as null and the flag would become an unrelated top-level key.
special_characters:
  regular_parentheses_only: true
4 changes: 4 additions & 0 deletions src/corprep/conf/normalizer/formal_ko.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Normalizer preset for formal Korean text.
# Starts from the base normalizer config named in the defaults list.
defaults:
  - __init__

# Overrides the base value (false).
# NOTE(review): presumably enables Hanja-to-Hangul conversion -- confirm
# against the Normalizer implementation.
hanja2hangle: true
5 changes: 5 additions & 0 deletions src/corprep/conf/normalizer/informal_ko.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Normalizer preset for informal Korean text.
# Builds on the formal_ko preset named in the defaults list.
defaults:
  - formal_ko

# Must be nested under `special_characters`: at column 0 the group would
# parse as null and the flag would become an unrelated top-level key.
special_characters:
  fix_emoticons: true

0 comments on commit ac53a45

Please sign in to comment.