
Refactoring word tokenizers (#907)

* Add regex tokenizer

* Remove word tokenizer methods; streamline language-specific tokenizer setup

* Update tests

* Update test dependent on sentence tokenizer

* Remove main function

* Reorganize tests

* Move Latin word tokenizer to own submodule; create base classes for word tokenization

* Cleanup Latin word tokenization

* Add base regex word tokenizer

* Make language-specific submodules for word tokenization

* Make language-specific submodules for word tokenization

* Add word tokenization for Sanskrit

* Update tests

* Improve tokenizers

* Refactor enclitic separation as regex

* Update params

* Update tests

* Fix enclitic separation

* Move akkadian to submodule

* Clean up files

* Clean up files

* Update test

* Update tokenizer references in other modules

* Update references to nltk_tokenize_words

* Update references to tokenizers

* Update test dependent on tokenizer

* Fix missing comma?

* Update enclitic splitting

* Update enclitic splitting to ignore case for excluded words

* Fix case for enclitic splitting

* Update coverage

* Adjust test set up for Latin word tokenizer

* Remove ner doctest

* Fix missing model import

* Update comments

* Fix case with enclitic splitting

* Remove main function
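
Taken together, the commits above replace the per-language helper functions (tokenize_old_norse_words, nltk_tokenize_words) with a single WordTokenizer class behind language-specific submodules, and rework Latin enclitic separation as a regex pass with a case-insensitive exclusion list. As a rough sketch of that technique only, not CLTK's actual implementation, a regex-based enclitic splitter could look like this (the enclitic tuple and exclusion set below are illustrative placeholders):

    import re

    # Hypothetical values for illustration; CLTK's real list lives in its Latin submodule.
    ENCLITICS = ('que', 'ne', 've')
    EXCLUDED = {'atque', 'neque', 'quoque'}  # words that merely end in an enclitic string

    def split_enclitics(token):
        """Split a trailing enclitic off a token, e.g. 'virumque' -> ['virum', '-que']."""
        lower = token.lower()
        for enclitic in ENCLITICS:
            if lower.endswith(enclitic) and lower not in EXCLUDED:
                stem = token[:-len(enclitic)]
                if stem:  # don't split a bare enclitic such as 'que'
                    return [stem, '-' + enclitic]
        return [token]

    def tokenize(text):
        # Split into word and punctuation tokens, then separate enclitics.
        words = re.findall(r"\w+|[^\w\s]", text)
        return [piece for word in words for piece in split_enclitics(word)]

    print(tokenize('Arma virumque cano.'))
    # ['Arma', 'virum', '-que', 'cano', '.']

Lower-casing the token before the exclusion check mirrors the "ignore case for excluded words" fix in the commit list above.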
diyclassics authored and kylepjohnson committed May 14, 2019
1 parent cec0f43 commit fb787df70003a3f1933696d8e20d055c5fa0677f
@@ -7,7 +7,7 @@
 from cltk.phonology.old_norse.transcription import Consonant, Vowel, old_norse_rules, IPA_class, \
     DIPHTHONGS_IPA_class, DIPHTHONGS_IPA, measure_old_norse_syllable
 from cltk.phonology.syllabify import Syllabifier
-from cltk.tokenize.word import tokenize_old_norse_words
+from cltk.tokenize.word import WordTokenizer
 import cltk.corpus.old_norse.syllabifier as old_norse_syllabifier
 from cltk.stop.old_norse.stops import STOPS_LIST
 from cltk.utils.cltk_logger import logger
@@ -103,7 +103,8 @@ def load_poem_from_paragraphs(paragraphs):
 class ShortLine:
     def __init__(self, text):
         self.text = text
-        self.tokenized_text = tokenize_old_norse_words(text)
+        self.tokenizer = WordTokenizer('old_norse')
+        self.tokenized_text = self.tokenizer.tokenize(text)
         self.first_sounds = []
         self.syllabified = []
         self.transcribed = []
@@ -178,7 +179,8 @@ def find_alliterations(self, other_short_line):
 class LongLine:
     def __init__(self, text):
         self.text = text
-        self.tokenized_text = tokenize_old_norse_words(text)
+        self.tokenizer = WordTokenizer('old_norse')
+        self.tokenized_text = self.tokenizer.tokenize(text)
         self.short_lines = None
         self.first_sounds = []
         self.syllabified = []
@@ -194,7 +196,7 @@ def syllabify(self, syllabifier):
         :param syllabifier:
         :return:
         """
-        for viisuordh in tokenize_old_norse_words(self.text):
+        for viisuordh in self.tokenized_text:
             word = normalize(viisuordh)
             if word:
                 self.syllabified.append(syllabifier.syllabify(word))
@@ -205,7 +207,7 @@ def to_phonetics(self, transcriber):
         :param transcriber:
         :return:
         """
-        for viisuordh in tokenize_old_norse_words(self.text):
+        for viisuordh in self.tokenized_text:
             word = normalize(viisuordh)
             if word:
                 transcribed_word = transcriber.text_to_phonetic_representation(word)
@@ -258,7 +260,7 @@ def __init__(self):
         """
         self.text = ""
         self.short_lines = []  # list of minimal lines
-        self.long_lines = []   # list of long lines
+        self.long_lines = []  # list of long lines
         self.syllabified_text = []  # each word is replaced by a list of its syllables
         self.transcribed_text = []  # each line is replaced by its phonetic transcription
         self.phonological_features_text = []
@@ -56,7 +56,7 @@ def lemmatize(self, input_text, return_raw=False, return_string=False):
         key-value list of lemmata-headword. If a string, tokenize with
         ``PunktLanguageVars()``. If a final period appears on a token, remove
         it, then re-add once replacement done.
-        TODO: rm check for final period, change PunktLanguageVars() to nltk_tokenize_words()
+        TODO: rm check for final period, change PunktLanguageVars()
         """
         assert type(input_text) in [list, str], \
             logger.error('Input must be a list or string.')
@@ -323,7 +323,7 @@ def test_json_corpus_reader(self):
         reader._fileids = ['cicero__on-behalf-of-aulus-caecina__latin.json']
         self.assertTrue(len(list(reader.paras())) >= 1)
         self.assertTrue(len(list(reader.sents())) > 400)
-        self.assertTrue(len(list(reader.words())) > 12200)
+        self.assertTrue(len(list(reader.words())) > 12000)
         reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
         # this example has subsections
         reader._fileids = ['ausonius-decimus-magnus__eclogarum-liber__latin.json']
@@ -12,7 +12,7 @@
 from cltk.phonology.syllabify import Syllabifier
 from cltk.tag.pos import POSTag
 from cltk.corpus.utils.importer import CorpusImporter
-from cltk.tokenize.word import tokenize_old_norse_words
+from cltk.tokenize.word import WordTokenizer
 from cltk.corpus.old_norse.syllabifier import invalid_onsets
 from cltk.inflection.old_norse import pronouns, nouns
 import cltk.inflection.utils as decl_utils
@@ -89,7 +89,8 @@ def test_syllabification_old_norse(self):
         s = Syllabifier(language="old_norse", break_geminants=True)
         text = "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok " \
                "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
-        words = tokenize_old_norse_words(text)
+        tokenizer = WordTokenizer('old_norse')
+        words = tokenizer.tokenize(text)
         s.set_invalid_onsets(invalid_onsets)
         syllabified_words = [s.syllabify_ssp(word.lower())
                              for word in words if word not in ",."]
@@ -15,7 +15,7 @@
 from cltk.phonology.old_swedish import transcription as old_swedish
 from cltk.phonology import utils as ut
 from cltk.phonology.syllabify import Syllabifier, Syllable
-from cltk.tokenize.word import tokenize_old_norse_words
+from cltk.tokenize.word import WordTokenizer
 from cltk.corpus.old_norse.syllabifier import invalid_onsets
 import unittest

@@ -738,7 +738,8 @@ def test_syllabification_old_norse(self):
         old_norse_syllabifier = Syllabifier(language="old_norse", break_geminants=True)
         text = "Gefjun dró frá Gylfa glöð djúpröðul óðla, svá at af rennirauknum rauk, Danmarkar auka. Báru öxn ok " \
                "átta ennitungl, þars gengu fyrir vineyjar víðri valrauf, fjögur höfuð."
-        words = tokenize_old_norse_words(text)
+        tokenizer = WordTokenizer('old_norse')
+        words = tokenizer.tokenize(text)
         old_norse_syllabifier.set_invalid_onsets(invalid_onsets)

         syllabified_words = [old_norse_syllabifier.syllabify_ssp(word.lower())
@@ -284,8 +284,7 @@ def test_classical_hindi_stops(self):
         Sentence extracted from (https://github.com/cltk/hindi_text_ltrc/blob/master/miscellaneous/gandhi/main.txt)
         """
         sentence = " वह काबुली फिर वहां आकर खडा हो गया है "
-        tokenizer = TokenizeSentence('hindi')
-        tokens = tokenizer.tokenize(sentence)
+        tokens = sentence.split()
         no_stops = [word for word in tokens if word not in HINDI_STOPS]
         target_list = ['काबुली', 'फिर', 'वहां', 'आकर', 'खडा', 'गया']
         self.assertEqual(no_stops, target_list)
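
The pattern repeated across these hunks is the same everywhere: construct a WordTokenizer once, then call its tokenize() method wherever the old module-level helper was used. A minimal usage example against the new API (both the import and the call style appear verbatim in the hunks above; running it assumes a CLTK install at this commit):

    from cltk.tokenize.word import WordTokenizer

    # One tokenizer instance can be reused across many texts.
    tokenizer = WordTokenizer('old_norse')
    tokens = tokenizer.tokenize("Gefjun dró frá Gylfa glöð djúpröðul óðla.")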