Docs and formatting fixes for WordTokenizer (#932)

* light pylint formatting for word.py

* make multilingual tokenization docs comprehensible

* make example OCS
kylepjohnson committed Aug 21, 2019
1 parent 697bfca commit c2b604107894298ab35b59c833feb9b0d80c011a
Showing with 90 additions and 48 deletions.
  1. +29 −17 cltk/tokenize/word.py
  2. +61 −31 docs/multilingual.rst
@@ -1,31 +1,33 @@
"""Language-specific word tokenizers. Primary purpose is to handle enclitics."""
"""Language-specific word tokenizers. Primary purpose is
to handle enclitics.
"""

__author__ = ['Patrick J. Burns <patrick@diyclassics.org>',
'Kyle P. Johnson <kyle@kyle-p-johnson.com>',
'Natasha Voake <natashavoake@gmail.com>',
'Clément Besnier <clemsciences@aol.com>',
'Andrew Deloucas <adeloucas@g.harvard.edu>',
'Todd Cook <todd.g.cook@gmail.com>']
# Author info for Arabic?

__license__ = 'MIT License. See LICENSE.'

import logging
import re
from abc import abstractmethod
from typing import List, Dict, Tuple, Set, Any, Generator
from typing import List

from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.tokenize.punkt import PunktParameters
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.tokenize.treebank import TreebankWordTokenizer

import cltk.corpus.arabic.utils.pyarabic.araby as araby
from cltk.tokenize.greek.sentence import GreekRegexSentenceTokenizer
from cltk.tokenize.akkadian.word import tokenize_akkadian_words, tokenize_akkadian_signs
from cltk.corpus.arabic.utils.pyarabic import araby
from cltk.tokenize.greek.sentence import GreekRegexSentenceTokenizer
from cltk.tokenize.latin.word import WordTokenizer as LatinWordTokenizer
from cltk.tokenize.middle_english.params import MiddleEnglishTokenizerPatterns
from cltk.tokenize.middle_high_german.params import MiddleHighGermanTokenizerPatterns
from cltk.tokenize.old_norse.params import OldNorseTokenizerPatterns
from cltk.tokenize.old_french.params import OldFrenchTokenizerPatterns
from cltk.tokenize.latin.word import WordTokenizer as LatinWordTokenizer

LOG = logging.getLogger(__name__)
LOG.addHandler(logging.NullHandler())
@@ -40,7 +42,7 @@ def __init__(self, language):
self.language = language
self.available_languages = ['akkadian',
'arabic',
'french', # deprecate
'french', # defaults to old_french
'greek',
'latin',
'middle_english',
@@ -56,35 +58,45 @@ def __init__(self, language):
self.language,
self.available_languages)

# raise language-specific warnings
if self.language == 'french':
self.language = 'old_french'
LOG.warning("'french' defaults to 'old_french'. 'middle_french' also available.") # pylint: disable=line-too-long

if self.language == 'arabic':
self.toker = BaseArabyWordTokenizer('arabic')
elif self.language == 'french':
self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
self.toker = BaseRegexWordTokenizer('old_french',
OldFrenchTokenizerPatterns)
elif self.language == 'greek':
self.toker = BasePunktWordTokenizer('greek', GreekRegexSentenceTokenizer)
self.toker = BasePunktWordTokenizer('greek',
GreekRegexSentenceTokenizer)
elif self.language == 'latin':
self.toker = LatinWordTokenizer()
elif self.language == 'old_norse':
self.toker = BaseRegexWordTokenizer('old_norse', OldNorseTokenizerPatterns)
self.toker = BaseRegexWordTokenizer('old_norse',
OldNorseTokenizerPatterns)
elif self.language == 'middle_english':
self.toker = BaseRegexWordTokenizer('middle_english', MiddleEnglishTokenizerPatterns)
self.toker = BaseRegexWordTokenizer('middle_english',
MiddleEnglishTokenizerPatterns)
elif self.language == 'middle_french':
self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
self.toker = BaseRegexWordTokenizer('old_french',
OldFrenchTokenizerPatterns)
elif self.language == 'middle_high_german':
self.toker = BaseRegexWordTokenizer('middle_high_german',
MiddleHighGermanTokenizerPatterns)
elif self.language == 'old_french':
self.toker = BaseRegexWordTokenizer('old_french', OldFrenchTokenizerPatterns)
self.toker = BaseRegexWordTokenizer('old_french',
OldFrenchTokenizerPatterns)
else:
LOG.warning('Falling back to default tokenizer')
LOG.warning("Falling back to default tokenizer, the NLTK's `TreebankWordTokenizer()`.")
self.toker = TreebankWordTokenizer()

def tokenize(self, text):
"""Tokenize incoming string."""
if self.language == 'akkadian':
return tokenize_akkadian_words(text)
else:
return self.toker.tokenize(text)
return self.toker.tokenize(text)

def tokenize_sign(self, word):
"""This is for tokenizing cuneiform signs."""
@@ -748,38 +748,68 @@ If you have access to the TLG or PHI5 disc, and have already imported it and con
Word tokenization
=================

The NLTK offers several methods for word tokenization. The CLTK Tokenize module offers "TreebankWordTokenizer" as a default multilingual word tokenizer.
The CLTK wraps one of the NLTK's tokenizers (``TreebankWordTokenizer``), which, with the ``multilingual`` parameter, works for most languages that use Latin-style whitespace and punctuation to mark word division. There are also language-specific tokenizers, which do extra work to subdivide words that have been combined into one string (e.g., "armaque" in Latin); see ``WordTokenizer.available_languages`` for the languages that support such sub-string tokenization, and the Latin sketch after the multilingual example below.

.. code-block:: python
In [1]: from cltk.tokenze.word('multilingual')
In [2]: s = """Anna soror, quae me suspensam insomnia terrent! Quis novus hic nostris successit sedibus hospes."""
In [3]: t = WordTokenizer('multilingual')
In [4]: t.tokenize(s)
Out[4]:
['Anna', 'soror', ',', 'quae', 'me', 'suspensam', 'insomnia', 'terrent', '!', 'Quis', 'novus', 'hic', 'nostris', 'successit', 'sedibus', 'hospes', '.']
If ``PunktLanguageVars`` doesn't suit your tokenization needs, consider another tokenizer from the NLTK, which breaks on any other regular expression pattern you choose. Here, for instance, on whitespace word breaks:

.. code-block:: python
In [7]: from nltk.tokenize import RegexpTokenizer
In [8]: word_breaks = RegexpTokenizer(r'\w+')
In [8]: tokens = word_breaks.tokenize(cleaned)
In [9]: tokens[:10]
Out[9]: ['Arma',
'uirumque',
'cano',
'Troiae',
'qui',
'primus',
'ab',
'oris',
'Italiam',
'fato']
In [1]: from cltk.tokenize.word import WordTokenizer
In [2]: tok = WordTokenizer(language='multilingual')
In [3]: tok.available_languages
Out[3]:
['akkadian',
'arabic',
'french',
'greek',
'latin',
'middle_english',
'middle_french',
'middle_high_german',
'old_french',
'old_norse',
'sanskrit',
'multilingual']
In [4]: luke_ocs = "рєчє жє притъчѫ к н҄имъ глагол҄ѧ чловѣкѹ єтєрѹ богатѹ ѹгобьѕи сѧ н҄ива"
In [5]: tok.tokenize(luke_ocs)
Out[5]:
['рєчє',
'жє',
'притъчѫ',
'к',
'н҄имъ',
'глагол҄ѧ',
'чловѣкѹ',
'єтєрѹ',
'богатѹ',
'ѹгобьѕи',
'сѧ',
'н҄ива']
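For the language-specific enclitic splitting mentioned at the top of this section (e.g., Latin "armaque"), pass the language name instead of ``multilingual``. A minimal sketch, assuming a CLTK version matching this commit; the exact representation of the split enclitic (e.g., ``-que`` vs. ``que``) may differ between releases:

.. code-block:: python

   from cltk.tokenize.word import WordTokenizer

   latin_tok = WordTokenizer('latin')
   latin_tok.tokenize('arma virumque cano')
   # expected: something like ['arma', 'virum', '-que', 'cano'],
   # i.e. the enclitic is split off, unlike with the multilingual tokenizer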
If this default does not work for your texts, consider the NLTK's ``RegexpTokenizer``, which splits on a regular expression pattern of your choosing. Here, for instance, it breaks on whitespace and punctuation:

.. code-block:: python
In [6]: from nltk.tokenize import RegexpTokenizer
In [7]: word_toker = RegexpTokenizer(r'\w+')
In [8]: word_toker.tokenize(luke_ocs)
Out[8]:
['рєчє',
'жє',
'притъчѫ',
'к',
'н',
'имъ',
'глагол',
'ѧ',
'чловѣкѹ',
'єтєрѹ',
'богатѹ',
'ѹгобьѕи',
'сѧ',
'н',
'ива']
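Note that ``\w`` in Python regular expressions does not match the combining marks in this text, which is why ``н҄имъ`` and ``глагол҄ѧ`` are broken apart in the output above. To split only on whitespace and keep such characters intact, a pattern like ``r'\S+'`` can be used; an editorial sketch, not part of the CLTK docs:

.. code-block:: python

   from nltk.tokenize import RegexpTokenizer

   whitespace_toker = RegexpTokenizer(r'\S+')
   whitespace_toker.tokenize(luke_ocs)
   # keeps 'н҄имъ', 'глагол҄ѧ', and 'н҄ива' as single tokens,
   # since the split happens only on whitespace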
