Skip to content

Commit

Permalink
tlg_plaintext_cleanup regex adjusted, speedup (#953)
Browse files Browse the repository at this point in the history
* tlg_plaintext_cleanup regex adjusted, speedup

* Regex in tlg_plaintext_cleanup simplified

Co-authored-by: Kyle P. Johnson <kyle@kyle-p-johnson.com>
  • Loading branch information
pharos-alexandria and kylepjohnson committed Jun 22, 2020
1 parent 5bb1cf5 commit cd19d66
Showing 1 changed file with 7 additions and 17 deletions.
24 changes: 7 additions & 17 deletions cltk/corpus/utils/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,26 +85,17 @@ def remove_non_latin(input_string, also_keep=None):
def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):
"""Remove and substitute post-processing for Greek TLG text.
TODO: Surely more junk to pull out. Please submit bugs!
TODO: {.+?}|\(.+?\) working?
TODO: This is rather slow now; help in speeding it up is welcome.
"""
remove_comp = regex.compile(r'-\n|«|»|<|>|\.\.\.|‘|’|_|{.+?}|\(.+?\)|[a-zA-Z0-9]', flags=regex.VERSION1)
remove_comp = regex.compile(r'-\n|[«»<>\(\)‘’_—:!\?\'\"\*]|{[[:print:][:space:]]+?}|\[[[:print:][:space:]]+?\]|[a-zA-Z0-9]', flags=regex.VERSION1)
text = remove_comp.sub('', text)

new_text = None
if rm_punctuation:
new_text = ''
punctuation = [',', '·', ':', '"', "'", '?', '-', '!', '*', '[', ']', '{', '}']
if rm_periods:
punctuation += ['.', ';']
for char in text:
# second try at rming some punctuation; merge with above regex
if char in punctuation:
pass
else:
new_text += char
if new_text:
text = new_text
punct_comp = regex.compile(r',|·')
text = punct_comp.sub('', text)

if rm_periods:
period_comp = regex.compile(r'\.|;')
text = period_comp.sub('', text)

# replace line breaks w/ space
replace_comp = regex.compile(r'\n')
Expand All @@ -115,7 +106,6 @@ def tlg_plaintext_cleanup(text, rm_punctuation=False, rm_periods=False):

return text


def cltk_normalize(text, compatibility=True):
if compatibility:
return normalize('NFKC', text)
Expand Down

0 comments on commit cd19d66

Please sign in to comment.