Fixes #628 Greek word tokenizer added #629

Merged
merged 25 commits on Jan 28, 2018
25 commits
6539736
Check that tokens exist before handling them in Latin word tokenizer
diyclassics Aug 5, 2016
9fb9d88
Update files
diyclassics Oct 27, 2017
9c50477
Update Latin DefaultLemmatizer sequence
diyclassics Oct 27, 2017
ee2f0ba
Reset master
diyclassics Oct 29, 2017
8c891ee
Reset master
diyclassics Oct 29, 2017
8ba5f4a
Merge branch 'master' of https://github.com/cltk/cltk
diyclassics Oct 29, 2017
8686c04
Merge branch 'master' of https://github.com/cltk/cltk
diyclassics Nov 2, 2017
0b3dd02
Merge branch 'master' into master
kylepjohnson Nov 9, 2017
4e03030
Merge branch 'master' into master
kylepjohnson Nov 9, 2017
f4e531a
Merge branch 'master' into master
kylepjohnson Nov 9, 2017
fbfe22d
Merge branch 'master' into master
kylepjohnson Nov 13, 2017
ee6dae0
Merge branch 'master' of https://github.com/diyclassics/cltk
diyclassics Nov 13, 2017
1e9a2ec
Merge branch 'master' of https://github.com/diyclassics/cltk
diyclassics Nov 25, 2017
47506ea
Merge branch 'master' into master
kylepjohnson Nov 28, 2017
f91a0e6
Merge branch 'master' into master
kylepjohnson Jan 17, 2018
a004891
rm whitespace
kylepjohnson Jan 17, 2018
da99c56
Merge branch 'master' of https://github.com/diyclassics/cltk
diyclassics Jan 18, 2018
e8651af
Merge branch 'master' of https://github.com/cltk/cltk
diyclassics Jan 19, 2018
4f72fe6
Merge branch 'master' of https://github.com/diyclassics/cltk
diyclassics Jan 19, 2018
9339150
Merge branch 'master' of https://github.com/cltk/cltk
diyclassics Jan 28, 2018
6292398
Add default Greek tokenizer
diyclassics Jan 28, 2018
d33d900
Comment on word tokenizer
diyclassics Jan 28, 2018
bc2267a
Cleanup order of languages/functions, alphabetical
diyclassics Jan 28, 2018
946b219
Update docs for Greek word tokenizer
diyclassics Jan 28, 2018
8fcd3e4
Add unittest for Greek word tokenizer
diyclassics Jan 28, 2018
7 changes: 4 additions & 3 deletions cltk/lemmatize/latin/backoff.py
@@ -518,13 +518,14 @@ def _randomize_data(train, seed):
self.pos_train_sents, self.train_sents, self.test_sents = _randomize_data(self.train, self.seed)

def _define_lemmatizer(self):
# Suggested backoff chain--should be tested for optimal order
backoff0 = None
backoff1 = IdentityLemmatizer()
backoff2 = TrainLemmatizer(model=self.LATIN_OLD_MODEL, backoff=backoff1)
backoff3 = PPLemmatizer(regexps=self.latin_verb_patterns, pps=self.latin_pps, backoff=backoff2)
backoff4 = UnigramLemmatizer(self.train_sents, backoff=backoff3)
backoff5 = RegexpLemmatizer(self.latin_sub_patterns, backoff=backoff4)
backoff6 = TrainLemmatizer(model=self.LATIN_MODEL, backoff=backoff5)
backoff4 = RegexpLemmatizer(self.latin_sub_patterns, backoff=backoff3)
backoff5 = UnigramLemmatizer(self.train_sents, backoff=backoff4)
backoff6 = TrainLemmatizer(model=self.LATIN_MODEL, backoff=backoff5)
#backoff7 = BigramPOSLemmatizer(self.pos_train_sents, include=['cum'], backoff=backoff6)
#lemmatizer = backoff7
lemmatizer = backoff6
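For context on the reordering above: the chain is consulted from the stage assigned to `lemmatizer` (backoff6) downward, with each stage passing unresolved tokens to its backoff, which is why swapping RegexpLemmatizer and UnigramLemmatizer can change results. The snippet below is a minimal sketch of that fall-through pattern with a hypothetical DictLemmatizer stand-in, not the actual CLTK classes.

    # Minimal sketch of sequential backoff (hypothetical stand-in class; the real
    # chain uses CLTK's TrainLemmatizer, RegexpLemmatizer, UnigramLemmatizer, etc.).
    class DictLemmatizer:
        def __init__(self, table, backoff=None):
            self.table = table        # token -> lemma lookups handled by this stage
            self.backoff = backoff    # next stage to try when this one misses

        def lemmatize(self, token):
            lemma = self.table.get(token)
            if lemma is None and self.backoff is not None:
                return self.backoff.lemmatize(token)   # fall through the chain
            return lemma

    stage1 = DictLemmatizer({'arma': 'arma'})                     # innermost fallback
    stage2 = DictLemmatizer({'virumque': 'vir'}, backoff=stage1)  # consulted first
    print(stage2.lemmatize('virumque'))   # 'vir'  -- resolved by stage2
    print(stage2.lemmatize('arma'))       # 'arma' -- falls back to stage1
    print(stage2.lemmatize('cano'))       # None   -- no stage could resolve it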
19 changes: 18 additions & 1 deletion cltk/tests/test_tokenize.py
@@ -61,6 +61,23 @@ def test_sentence_tokenizer_greek(self):
self.assertEqual(len(tokenized_sentences), len(good_tokenized_sentences))
'''


def test_greek_word_tokenizer(self):
"""Test Latin-specific word tokenizer."""
word_tokenizer = WordTokenizer('greek')

# Test sources:
# - Thuc. 1.1.1

test = "Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων, ὡς ἐπολέμησαν πρὸς ἀλλήλους, ἀρξάμενος εὐθὺς καθισταμένου καὶ ἐλπίσας μέγαν τε ἔσεσθαι καὶ ἀξιολογώτατον τῶν προγεγενημένων, τεκμαιρόμενος ὅτι ἀκμάζοντές τε ᾖσαν ἐς αὐτὸν ἀμφότεροι παρασκευῇ τῇ πάσῃ καὶ τὸ ἄλλο Ἑλληνικὸν ὁρῶν ξυνιστάμενον πρὸς ἑκατέρους, τὸ μὲν εὐθύς, τὸ δὲ καὶ διανοούμενον."

target = ['Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν', 'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',', 'ὡς', 'ἐπολέμησαν', 'πρὸς', 'ἀλλήλους', ',', 'ἀρξάμενος', 'εὐθὺς', 'καθισταμένου', 'καὶ', 'ἐλπίσας', 'μέγαν', 'τε', 'ἔσεσθαι', 'καὶ', 'ἀξιολογώτατον', 'τῶν', 'προγεγενημένων', ',', 'τεκμαιρόμενος', 'ὅτι', 'ἀκμάζοντές', 'τε', 'ᾖσαν', 'ἐς', 'αὐτὸν', 'ἀμφότεροι', 'παρασκευῇ', 'τῇ', 'πάσῃ', 'καὶ', 'τὸ', 'ἄλλο', 'Ἑλληνικὸν', 'ὁρῶν', 'ξυνιστάμενον', 'πρὸς', 'ἑκατέρους', ',', 'τὸ', 'μὲν', 'εὐθύς', ',', 'τὸ', 'δὲ', 'καὶ', 'διανοούμενον', '.']

result = word_tokenizer.tokenize(test)

self.assertEqual(result, target)


def test_latin_word_tokenizer(self):
"""Test Latin-specific word tokenizer."""
word_tokenizer = WordTokenizer('latin')
@@ -213,7 +230,7 @@ def test_old_norse_word_tokenizer(self):
'vilja', 'þeira', '.']
word_tokenizer = WordTokenizer('old_norse')
result = word_tokenizer.tokenize(text)
print(result)
#print(result)
self.assertTrue(result == target)

if __name__ == '__main__':
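To exercise the new Greek test locally, standard unittest discovery is one option; the sketch below assumes it is run from the repository root and that the test module path matches the diff above.

    import unittest

    # Discover and run the tokenizer tests (path assumed from the diff header).
    suite = unittest.defaultTestLoader.discover('cltk/tests', pattern='test_tokenize.py')
    unittest.TextTestRunner(verbosity=2).run(suite)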
105 changes: 66 additions & 39 deletions cltk/tokenize/word.py
@@ -8,6 +8,7 @@

import re

# Cleanup these imports—most are not used!
from nltk.data import load
from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize)
from nltk.tokenize.mwe import MWETokenizer
@@ -41,20 +42,28 @@ def __init__(self, language):
"""Take language as argument to the class. Check availability and
setup class variables."""
self.language = language
self.available_languages = ['arabic', 'latin', 'french', 'old_norse']
self.available_languages = ['arabic',
'french',
'greek',
'latin',
'old_norse']
assert self.language in self.available_languages, \
"Specific tokenizer not available for '{0}'. Only available for: '{1}'.".format(self.language, # pylint: disable=line-too-long
self.available_languages) # pylint: disable=line-too-long
self.available_languages) # pylint: disable=line-too-long
# ^^^ Necessary? since we have an 'else' in `tokenize`


def tokenize(self, string):
"""Tokenize incoming string."""

if self.language == 'latin':
tokens = tokenize_latin_words(string)
if self.language == 'arabic':
tokens = tokenize_arabic_words(string)
elif self.language == 'french':
tokens = tokenize_french_words(string)
elif self.language == 'arabic':
tokens = tokenize_arabic_words(string)
elif self.language == 'greek':
tokens = tokenize_greek_words(string)
elif self.language == 'latin':
tokens = tokenize_latin_words(string)
elif self.language == 'old_norse':
tokens = tokenize_old_norse_words(string)
else:
@@ -101,6 +110,56 @@ def nltk_tokenize_words(string, attached_period=False, language=None):
return new_tokens


def tokenize_arabic_words(text):

"""
Tokenize text into words
@param text: the input text.
@type text: unicode.
@return: list of words.
@rtype: list.
"""
specific_tokens = []
if not text:
return specific_tokens
else:
specific_tokens = araby.tokenize(text)
return specific_tokens
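# Illustrative usage of the pyarabic backend wrapped above (not part of this
# diff; assumes pyarabic is installed, e.g. `pip install pyarabic`).
from pyarabic import araby

sample = 'بسم الله الرحمن الرحيم'
print(araby.tokenize(sample))   # returns the same token list the wrapper above yields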


def tokenize_french_words(string):
assert isinstance(string, str), "Incoming string must be type str."

# normalize apostrophes

text = re.sub(r"’", r"'", string)

# Dealing with punctuation
text = re.sub(r"\'", r"' ", text)
text = re.sub("(?<=.)(?=[.!?)(\";:,«»\-])", " ", text)

results = str.split(text)
return results
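# Worked trace of the three substitutions above on a made-up sample sentence
# (illustrative only; the character class is the same set of punctuation,
# written with the hyphen unescaped at the end).
import re

s = "l’épée, dit-il."
s = re.sub(r"’", r"'", s)                         # "l'épée, dit-il."     -- normalize apostrophe
s = re.sub(r"\'", r"' ", s)                       # "l' épée, dit-il."    -- space after apostrophe
s = re.sub("(?<=.)(?=[.!?)(\";:,«»-])", " ", s)   # "l' épée , dit -il ." -- space before punctuation
print(s.split())                                  # ["l'", 'épée', ',', 'dit', '-il', '.']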


def tokenize_greek_words(text):
"""
Tokenizer divides the string into a list of substrings. This is a placeholder
function that applies the default NLTK word tokenizer until
Greek-specific options are added.

Example:
>>> text = 'Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων,'
>>> tokenize_greek_words(text)
['Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν', 'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',']

:param text: the string to be tokenized
:returns: A list of substrings extracted from the string
"""

return nltk_tokenize_words(text) # Simplest implementation to start
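# A hypothetical sketch of one possible "Greek-specific option" alluded to in
# the docstring above (not part of this diff): a post-pass that splits tokens
# containing an elision mark so the apostrophe stays attached to the elided word.
def split_greek_elision(tokens):
    out = []
    for tok in tokens:
        if '’' in tok and not tok.endswith('’'):
            head, _, tail = tok.partition('’')
            out.extend([head + '’', tail])    # e.g. 'ἀλλ’ἐγώ' -> 'ἀλλ’', 'ἐγώ'
        else:
            out.append(tok)
    return out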


def tokenize_latin_words(string):
"""
Tokenizer divides the string into a list of substrings
@@ -211,38 +270,6 @@ def replace(m):
return specific_tokens


def tokenize_french_words(string):
assert isinstance(string, str), "Incoming string must be type str."

# normalize apostrophes

text = re.sub(r"’", r"'", string)

# Dealing with punctuation
text = re.sub(r"\'", r"' ", text)
text = re.sub("(?<=.)(?=[.!?)(\";:,«»\-])", " ", text)

results = str.split(text)
return (results)


def tokenize_arabic_words(text):

"""
Tokenize text into words
@param text: the input text.
@type text: unicode.
@return: list of words.
@rtype: list.
"""
specific_tokens = []
if not text:
return specific_tokens
else:
specific_tokens = araby.tokenize(text)
return specific_tokens


def tokenize_old_norse_words(text):
"""

14 changes: 14 additions & 0 deletions docs/greek.rst
@@ -856,6 +856,20 @@ the Greek language. Currently, the only available dialect is Attic as reconstruc
Out[3]: '[di.ó.tʰen kɑj dis.kɛ́ːp.trọː ti.mɛ̂ːs o.kʰy.ron zdêw.gos ɑ.trẹː.dɑ̂n stó.lon ɑr.gẹ́ː.ɔːn]'


Word Tokenization
=================

.. code-block:: python

In [1]: from cltk.tokenize.word import WordTokenizer

In [2]: word_tokenizer = WordTokenizer('greek')

In [3]: text = 'Θουκυδίδης Ἀθηναῖος ξυνέγραψε τὸν πόλεμον τῶν Πελοποννησίων καὶ Ἀθηναίων,'

In [4]: word_tokenizer.tokenize(text)
Out[4]: ['Θουκυδίδης', 'Ἀθηναῖος', 'ξυνέγραψε', 'τὸν', 'πόλεμον', 'τῶν', 'Πελοποννησίων', 'καὶ', 'Ἀθηναίων', ',']


Word2Vec
========