Merge branch 'diyclassics-tess-reader'

Johnson, Kyle P committed Apr 2, 2019
2 parents fe15e24 + f598fcc commit 325d9a1

Showing 4 changed files with 203 additions and 13 deletions.
7 changes: 6 additions & 1 deletion cltk/corpus/greek/corpora.py
@@ -66,5 +66,10 @@
'origin': 'https://github.com/cltk/First1KGreek',
'location': 'remote',
'type': 'text'},
{'name': 'greek_text_tesserae',
'encoding': 'utf-8',
     'markup': 'plaintext',  # modified plaintext with Tesserae-style citations
'origin': 'https://github.com/cltk/greek_text_tesserae.git',
'location': 'remote',
'type': 'text'},
]
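
For context (not part of this diff): a sketch of how a registry entry like this is consumed downstream, assuming the usual CLTK CorpusImporter workflow (the same one used in the tests below):

from cltk.corpus.utils.importer import CorpusImporter

# Clones the repository listed under 'origin' into the local corpus store
# (conventionally ~/cltk_data/greek/text/).
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_text_tesserae')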

159 changes: 158 additions & 1 deletion cltk/corpus/readers.py
@@ -1,12 +1,19 @@
"""`readers.py` - Corpus reader utility objects."""
import json
import os
import re
import codecs
import time

import logging
from typing import List, Dict, Tuple, Set, Any, Generator

from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader import PlaintextCorpusReader
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize # Replace with CLTK
from nltk import pos_tag # Replace with CLTK

from cltk.prosody.latin.string_utils import flatten
from cltk.tokenize.sentence import TokenizeSentence
from cltk.tokenize.word import WordTokenizer
@@ -17,7 +24,8 @@
# TODO add your corpus here:
SUPPORTED_CORPORA = {
'latin': ['latin_text_latin_library', 'latin_text_perseus'],
'greek': ['greek_text_perseus']
'greek': ['greek_text_perseus',
'greek_text_tesserae']
} # type: Dict[str, List[str]]


@@ -62,6 +70,15 @@ def get_corpus_reader(corpus_name: str = None, language: str = None) -> CorpusReader:
word_tokenizer=the_word_tokenizer,
target_language='grc') #: this abbreviation is required

    if corpus_name == 'greek_text_tesserae':
        # TODO: replace the NLTK tokenizers/tagger below with CLTK versions,
        # most obviously for POS tagging
        return TesseraeCorpusReader(root=root, fileids=r'.*\.tess',
                                    sent_tokenizer=sent_tokenize,
                                    word_tokenizer=word_tokenize,
                                    pos_tagger=pos_tag,
                                    target_language='grc')  #: this abbreviation is required

# TODO add other languages and write tests for each corpus


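For context (not part of this diff): a minimal usage sketch of the new branch, assuming greek_text_tesserae has already been imported into ~/cltk_data:

from cltk.corpus.readers import get_corpus_reader

reader = get_corpus_reader(language='greek', corpus_name='greek_text_tesserae')
print(reader.fileids()[:3])  # first few .tess files in the corpus
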
@@ -361,3 +378,143 @@ def __iter__(self) -> Generator[str, str, None]:
"""convenience iterator for Word2Vec training."""
for sent in self.sents():
yield sent


class TesseraeCorpusReader(PlaintextCorpusReader):
    """Corpus reader for texts in the Tesserae format: plaintext files
    (extension .tess) in which each line of text is prefixed with a
    bracketed citation, e.g. '<Ach. Tat. 1.1.0>'.
    """

    def __init__(self, root, fileids=None, encoding='utf8', skip_keywords=None,
                 **kwargs):
        """
        :param root: the root directory of the corpus
        :param fileids: a list of file ids to consider, or a wildcard expression
        :param encoding: the encoding of the corpus files (default: utf8)
        :param skip_keywords: a list of words marking paragraphs that the
            paras() and words() methods should skip
        :param kwargs: any values to be passed to the NLTK superclass, such as
            sent_tokenizer, word_tokenizer, and pos_tagger
        """
        # Initialize the NLTK corpus reader objects
        PlaintextCorpusReader.__init__(self, root, fileids, encoding)
        self.skip_keywords = skip_keywords
        if 'sent_tokenizer' in kwargs:
            self._sent_tokenizer = kwargs['sent_tokenizer']
        if 'word_tokenizer' in kwargs:
            self._word_tokenizer = kwargs['word_tokenizer']
        if 'pos_tagger' in kwargs:
            self.pos_tagger = kwargs['pos_tagger']


    def docs(self: object, fileids: str):
        """
        Returns the complete text of a .tess file, closing the document after
        reading and yielding its contents in a memory-safe fashion.
        """
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def texts(self: object, fileids: str, plaintext: bool = True):
        """
        Returns the text content of a .tess file, stripping the bracketed
        citation info (e.g. "<Ach. Tat. 1.1.0>") when ``plaintext`` is True.
        """
        for doc in self.docs(fileids):
            if plaintext:
                doc = re.sub(r'<.+?>\s', '', doc)  # Remove citation info
            doc = doc.rstrip()  # Clean up final line breaks
            yield doc
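
For context (not part of this diff): the citation-stripping regex in action; the sample line is illustrative of the Tesserae format:

import re

line = '<Ach. Tat. 1.1.0> Σιδὼν ἐπὶ θαλάττῃ πόλις· Ἀσσυρίων ἡ θάλασσα·'
print(re.sub(r'<.+?>\s', '', line))
# Σιδὼν ἐπὶ θαλάττῃ πόλις· Ἀσσυρίων ἡ θάλασσα·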


    def paras(self: object, fileids: str):
        """
        Returns paragraphs in a .tess file, as delimited by a blank line
        (i.e. two consecutive newline characters).
        NB: Most .tess files do not use paragraph breaks; so far only the
        Homeric poems appear to have them. Perhaps a feature worth revisiting.
        """
        for text in self.texts(fileids):
            for para in text.split('\n\n'):
                yield para

    def lines(self: object, fileids: str, plaintext: bool = True):
        """
        Tokenizes documents in the corpus by line.
        """
        for text in self.texts(fileids, plaintext):
            # NB: flags must be passed by keyword; the fourth positional
            # argument of re.sub is `count`, not `flags`
            text = re.sub(r'\n\s*\n', '\n', text, flags=re.MULTILINE)  # Remove blank lines
            for line in text.split('\n'):
                yield line
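
For context (not part of this diff): the original call passed re.MULTILINE positionally, but the fourth positional parameter of re.sub is count, not flags — so it silently meant count=8. A minimal demonstration:

import re

text = '\n\n'.join('abcdefghijklm')  # 12 blank-line separators between letters
# Positionally, re.MULTILINE (== 8) is interpreted as count=8:
print(re.sub(r'\n\s*\n', '\n', text, re.MULTILINE).count('\n\n'))   # 4 — four runs survive
# Passed by keyword, every blank-line run is collapsed:
print(re.sub(r'\n\s*\n', '\n', text, flags=re.MULTILINE).count('\n\n'))  # 0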

    def sents(self: object, fileids: str):
        """
        Tokenizes documents in the corpus by sentence.
        """
        for para in self.paras(fileids):
            # Use the tokenizer supplied at construction time; assumes it is a
            # callable, as get_corpus_reader passes (NLTK's sent_tokenize)
            for sent in self._sent_tokenizer(para):
                yield sent

    def words(self: object, fileids: str):
        """
        Tokenizes documents in the corpus by word.
        """
        for sent in self.sents(fileids):
            # Assumes a callable word_tokenizer was supplied at construction
            for token in self._word_tokenizer(sent):
                yield token

    def pos_tokenize(self: object, fileids: str):
        """
        Segments, tokenizes, and POS-tags documents in the corpus.
        """
        for para in self.paras(fileids):
            yield [
                self.pos_tagger(self._word_tokenizer(sent))
                for sent in self._sent_tokenizer(para)
            ]
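
For context (not part of this diff): a sketch of the yielded shape with NLTK's pos_tag as the tagger; its default model is trained on English, so tags on Greek text are placeholders until a CLTK tagger is substituted (per the TODO above):

from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Requires the punkt and averaged_perceptron_tagger models (downloaded in the tests).
print(pos_tag(word_tokenize('Σιδὼν ἐπὶ θαλάττῃ πόλις·')))
# a list of (token, tag) pairs; pos_tokenize() yields one such list per sentence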

    def describe(self: object, fileids: str = None):
        """
        Performs a single pass over the corpus and returns a dictionary with
        a variety of metrics concerning the state of the corpus.
        Based on Bengfort et al. (2018: 46).
        """
        started = time.time()

        # Structures to perform counting
        counts = FreqDist()
        tokens = FreqDist()

        # Perform a single pass over paragraphs, tokenize, and count
        for para in self.paras(fileids):
            counts['paras'] += 1

            # paras() yields plain strings, so each paragraph must be
            # segmented and tokenized here before counting
            for sent in self._sent_tokenizer(para):
                counts['sents'] += 1

                # Include POS at some point
                for word in self._word_tokenizer(sent):
                    counts['words'] += 1
                    tokens[word] += 1

        # Compute the number of files in the corpus
        n_fileids = len(self.fileids())

        # Return data structure with information
        return {
            'files': n_fileids,
            'paras': counts['paras'],
            'sents': counts['sents'],
            'words': counts['words'],
            'vocab': len(tokens),
            'lexdiv': round((counts['words'] / len(tokens)), 3),
            'ppdoc': round((counts['paras'] / n_fileids), 3),
            'sppar': round((counts['sents'] / counts['paras']), 3),
            'secs': round((time.time() - started), 3),
        }
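
For context (not part of this diff): a sketch of the structure describe() returns; values depend on the corpus:

from cltk.corpus.readers import get_corpus_reader

reader = get_corpus_reader(language='greek', corpus_name='greek_text_tesserae')
stats = reader.describe()
# {'files': ..., 'paras': ..., 'sents': ..., 'words': ..., 'vocab': ...,
#  'lexdiv': ..., 'ppdoc': ..., 'sppar': ..., 'secs': ...}
print(stats['vocab'], stats['lexdiv'])  # vocabulary size; words per unique token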
20 changes: 19 additions & 1 deletion cltk/tests/test_corpus/test_corpus.py
@@ -4,6 +4,8 @@
import unittest
from unittest.mock import patch

import nltk

from cltk.corpus.greek.alphabet import expand_iota_subscript
from cltk.corpus.greek.alphabet import filter_non_greek
from cltk.corpus.greek.beta_to_unicode import Replacer
@@ -74,6 +76,9 @@ def setUpClass(cls):
corpus_importer.import_corpus('latin_text_perseus')
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_text_perseus')
corpus_importer.import_corpus('greek_text_tesserae')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
        except Exception:
            raise Exception('Failure to download test corpus')

@@ -396,7 +401,6 @@ def test_filtered_corpus_reader_sizes(self):
reader._fileids = ['catullus.txt']
self.assertTrue(len(list(reader.sizes())) > 0)

# Causes tokenizer test to fail
def test_json_corpus_reader(self):
"""Test filtered corpus sents method."""
reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
@@ -419,6 +423,20 @@ def test_json_corpus_reader(self):
# self.assertTrue(len(list(reader.sents())) > 260)
# self.assertTrue(len(list(reader.words())) > 9800)

def test_tesserae_corpus_reader(self):
"""Test Tesserae corpus methods."""
        # Update when corpus is added to CLTK
reader = get_corpus_reader(language='greek', corpus_name='greek_text_tesserae')
sample = reader.fileids()[0]
self.assertTrue(len(list(reader.docs(sample))) >= 1)
self.assertTrue(len(list(reader.texts(sample))) >= 1)
self.assertTrue(len(list(reader.paras(sample))) >= 1)
self.assertTrue(len(list(reader.sents(sample))) >= 1)
self.assertTrue(len(list(reader.words(sample))) >= 1)
self.assertTrue(len(list(reader.lines(sample))) >= 1)
self.assertTrue(reader.describe())
self.assertTrue(len(list(reader.pos_tokenize(sample))) >= 1)

def test_json_corpus_reader_sizes(self):
"""Test filtered corpus sizes method."""
reader = get_corpus_reader(language='latin', corpus_name='latin_text_perseus')
30 changes: 20 additions & 10 deletions cltk/tests/test_nlp/test_stem.py
@@ -25,16 +25,26 @@
class TestSequenceFunctions(unittest.TestCase): # pylint: disable=R0904
"""Class for unittest"""

def setUp(self):
"""Import sanskrit models first, some CSV files necessary for the
Indian lang tokenizers.
"""
corpus_importer = CorpusImporter('sanskrit')
corpus_importer.import_corpus('sanskrit_models_cltk')
file_rel = os.path.join('~/cltk_data/sanskrit/model/sanskrit_models_cltk/README.md')
file = os.path.expanduser(file_rel)
file_exists = os.path.isfile(file)
self.assertTrue(file_exists)
# def setUp(self):
# """Import sanskrit models first, some CSV files necessary for the
# Indian lang tokenizers.
# """
# corpus_importer = CorpusImporter('sanskrit')
# corpus_importer.import_corpus('sanskrit_models_cltk')
# file_rel = os.path.join('~/cltk_data/sanskrit/model/sanskrit_models_cltk/README.md')
# file = os.path.expanduser(file_rel)
# file_exists = os.path.isfile(file)
# self.assertTrue(file_exists)

@classmethod
def setUpClass(cls):
try:
corpus_importer = CorpusImporter('sanskrit')
corpus_importer.import_corpus('sanskrit_models_cltk')
corpus_importer = CorpusImporter('greek')
corpus_importer.import_corpus('greek_models_cltk')
        except Exception:
            raise Exception('Failure to download test corpus')

def test_latin_i_u_transform(self):
"""Test converting ``j`` to ``i`` and ``v`` to ``u``."""
