In [None]:
# Imports

from nltk.corpus import PlaintextCorpusReader

In [None]:
# Make new reader

class VicifonsCorpusReader(PlaintextCorpusReader):
    """
    Plaintext corpus reader for documents sourced from Vicifons.

    This first version adds no behaviour of its own: it only forwards its
    two constructor arguments to PlaintextCorpusReader.
    """
    def __init__(self, root, fileids):
        """
        Initialise the underlying PlaintextCorpusReader.

        :param root: forwarded unchanged as the base reader's first argument
        :param fileids: forwarded unchanged as the base reader's second argument
        """
        super().__init__(root, fileids)

In [None]:
# Instantiate reader

root = 'texts/vicifons'
# Raw string so the regex escape `\.` reaches the reader as written instead of
# being parsed as an (invalid) Python string escape.
file_pattern = r'.+\.txt'
CR = VicifonsCorpusReader(root, file_pattern)

In [None]:
# Show the file ids the reader holds after being built from root and file_pattern

CR.fileids()

In [None]:
# Show default sents: the class defines only __init__ at this point, so this is
# the sents() inherited from PlaintextCorpusReader

CR.sents()

In [None]:
# Show default words: the class defines only __init__ at this point, so this is
# the words() inherited from PlaintextCorpusReader

CR.words()

In [None]:
# Show default raw: print only the first 500 characters of the inherited raw() text

print(CR.raw()[:500])

In [None]:
# Show default raw, pretty printed: same 500-character preview, but passed
# through pprint, which shows the string's repr (newlines appear escaped)

from pprint import pprint
pprint(CR.raw()[:500])

In [None]:
# No lines method yet: the reader class so far defines only __init__, so the
# call below fails. The AttributeError is caught and printed so the failure is
# still demonstrated without halting a full "Restart & Run All".

try:
    CR.lines()
except AttributeError as err:
    print(f'{type(err).__name__}: {err}')

In [None]:
# Make new reader, with new lines method

class VicifonsCorpusReader(PlaintextCorpusReader):
    """
    A corpus reader for working with Vicifons-sourced plaintext docs.

    Extends PlaintextCorpusReader with a ``lines`` generator.
    """
    def __init__(self, root, fileids):
        """
        Initialise the underlying PlaintextCorpusReader.

        :param root: forwarded unchanged to PlaintextCorpusReader.__init__
        :param fileids: forwarded unchanged to PlaintextCorpusReader.__init__
        """
        PlaintextCorpusReader.__init__(self, root, fileids)

    def lines(self, fileids=None):
        """
        Yield the raw text one line at a time.

        The text returned by ``self.raw(fileids)`` is split on the newline
        character; each piece is yielded without its trailing newline.

        :param fileids: selection handed to ``self.raw``; the default None is
            passed through as-is
        """
        # Honour the fileids argument so callers can restrict the output to a
        # subset of files instead of always getting the whole corpus.
        raw = self.raw(fileids)
        yield from raw.split('\n')

In [None]:
# Instantiate reader, get lines, print first line

root = 'texts/vicifons'
# Raw string so the regex escape `\.` reaches the reader as written instead of
# being parsed as an (invalid) Python string escape.
file_pattern = r'.+\.txt'
CR = VicifonsCorpusReader(root, file_pattern)

# lines() is a generator: nothing is read until the first next() call
lines = CR.lines()

print(next(lines))

In [None]:
# Pull the next four lines from the same generator and print each one

for step in range(4):
    print(next(lines))

In [None]:
# Set up the Latin sentence tokenizer used by the overriding sents method

from cltk.sentence.lat import LatinPunktSentenceTokenizer
sent_tokenizer = LatinPunktSentenceTokenizer()

In [None]:
# Make new reader, with new sents method

class VicifonsCorpusReader(PlaintextCorpusReader):
    """
    Plaintext corpus reader for Vicifons documents.

    Adds a ``lines`` generator and overrides ``sents`` so that sentence
    splitting is done by the module-level ``sent_tokenizer``.
    """
    def __init__(self, root, fileids):
        """Hand ``root`` and ``fileids`` straight to PlaintextCorpusReader."""
        super().__init__(root, fileids)

    def lines(self, fileids=None):
        """
        Generate the raw text of ``fileids`` one line at a time.

        The text from ``self.raw(fileids)`` is cut at every newline character
        and the pieces are yielded in order.
        """
        yield from self.raw(fileids).split('\n')

    def sents(self, fileids=None):
        """
        Generate the sentences of ``fileids`` one at a time.

        Every newline in the raw text is first replaced by a single space;
        the resulting string is handed to ``sent_tokenizer.tokenize`` and its
        items are yielded in order.
        """
        flattened = self.raw(fileids).replace('\n', ' ')
        yield from sent_tokenizer.tokenize(flattened)

In [None]:
# Instantiate reader, get sents, print the first five sentences

root = 'texts/vicifons'
# Raw string so the regex escape `\.` reaches the reader as written instead of
# being parsed as an (invalid) Python string escape.
file_pattern = r'.+\.txt'
CR = VicifonsCorpusReader(root, file_pattern)

# sents() is a generator: nothing is tokenized until the first next() call
sents = CR.sents()

for sent_no in range(5):
    print(next(sents))

In [None]:
# Why use custom reader/custom methods? Latin sentence segmentation example

# Cic. Cat. 1.7 -- a single sentence (it ends at the question mark) that
# contains the abbreviations "Kal." and "C.", whose periods are not sentence ends
test = "Meministine me ante diem XII Kalendas Novembris dicere in senatu fore in armis certo die, qui dies futurus esset ante diem VI Kal. Novembris, C. Manlium, audaciae satellitem atque administrum tuae?"

In [None]:
# Segment the test passage with NLTK's sent_tokenize

from nltk.tokenize import sent_tokenize
nltk_sents = sent_tokenize(test)

# Number the pieces from 1 so each split point is easy to spot
for i, sent in enumerate(nltk_sents, start=1):
    print(f'{i}: {sent}')

In [None]:
# Sentence segmentation with CLTK

from cltk.sentence.lat import LatinPunktSentenceTokenizer
sent_tokenizer = LatinPunktSentenceTokenizer()
# Store the CLTK result under its own name so it is not mislabelled as NLTK
# output and the NLTK result stays available for comparison.
cltk_sents = sent_tokenizer.tokenize(test)

# Number the pieces from 1 so each split point is easy to spot
for i, sent in enumerate(cltk_sents, 1):
    print(f'{i}: {sent}')