In [None]:
## Imports

from cltkreaders.lat import LatinTesseraeCorpusReader

from os.path import expanduser
from natsort import natsorted
from tqdm import tqdm
from pprint import pprint

In [None]:
## Set up reader
# NB: If you do not have the CLTK-Tesserae corpus already installed in CLTK_DATA, you will be prompted to download the corpus.

# `T` is the single reader instance that every later cell in this notebook calls.
T = LatinTesseraeCorpusReader()

## Fileids and metadata

In [None]:
## First 10 filenames

# No filter arguments here; the slice keeps the printed list to 10 entries.
files = T.fileids()[:10]
pprint(files)

In [None]:
# Get files by metadata; e.g. author
# The keyword names the metadata field to filter on; the full (unsliced) result is printed.
files = T.fileids(author='horace')
pprint(files)

In [None]:
# Get files by metadata; e.g. mode
# 'mode' is the same field name later passed to T.metadata('mode', ...); output is capped at 10 fileids.
files = T.fileids(mode='verse')[:10]
pprint(files)

In [None]:
# Get files by metadata; e.g. date
# NOTE(review): 54 is presumably a year from the corpus metadata; the BCE/CE sign convention is not visible here — confirm.
files = T.fileids(date=54)
pprint(files)

In [None]:
# Get files by metadata; e.g. date & mode
# Several filters can be passed in one call (presumably a file must satisfy all of them — confirm).
files = T.fileids(date=54, mode='verse')
pprint(files)

In [None]:
# Get files by metadata; e.g. max_date
# NOTE(review): `max_date` is presumably an upper bound on the same date field used above — confirm whether it is inclusive.

files = T.fileids(max_date=150)[:10]
pprint(files)

In [None]:
# Get files by metadata; e.g. filename match
# NOTE(review): whether `match` is treated as a plain substring or as a regex is not shown here — confirm.

files = T.fileids(match='lucretius')[:10]
pprint(files)

In [None]:
# Get metadata for file

# Use the first fileid in the reader's listing as the example file.
file = T.fileids()[0]

print(file)
# Look up one named metadata field ('mode') for this file.
print(T.metadata('mode', file))
# Dump everything stored for this file by indexing the reader's private `_metadata` attribute.
pprint(T._metadata[file]) # TODO: should make this more direct (avoid reaching into a private attribute)

## Doc structures

In [None]:
# Fileid reused by the docs / texts / doc_rows cells below and by the final describe() cell.
catullus = 'catullus.carmina.tess'

In [None]:
## Docs

# `docs` is consumed with next(), i.e. results are pulled one at a time; only the first is used here.
catullus_docs = T.docs(catullus)
catullus_doc = next(catullus_docs)
# Preview only: the slice length (446) is hard-coded (presumably a character count — confirm).
print(catullus_doc[:446])


In [None]:
## Texts

# Same pattern as the docs cell: pull the first item from `texts` and show a fixed-length preview.
catullus_texts = T.texts(catullus)
catullus_text = next(catullus_texts)
# Preview only: the slice length (335) is hard-coded.
pprint(catullus_text[:335])

In [None]:
## Doc Rows

# First pass: show a truncated string form of the first dict produced by `doc_rows`.
catullus_docrows = T.doc_rows(catullus)

print('This is a string representation of what the output dictionary looks like...')
first_rows = next(catullus_docrows)
rows_preview = str(first_rows)[:94]
print(f'{rows_preview} etc. }}\n')


# Second pass: start again from a fresh iterator and list the first 10 (key, value) pairs.
catullus_docrows = T.doc_rows(catullus)
print('Here are the first 10 items of the dict output...')
first_rows = next(catullus_docrows)
first_items = list(first_rows.items())[:10]
pprint(first_items)


## Doc units

In [None]:
# Fileid reused by the sents / tokens / tokenized_sents / pos_sents cells below.
catilinam = 'cicero.in_catilinam.tess'

In [None]:
## Paras (not implemented)

# catilinam_paras = T.paras(catilinam)

In [None]:
# Sents

# Unless configured otherwise, segmentation, tokenization, and other tagging use the spaCy model 'la_core_web_lg'

catilinam_sents = T.sents(catilinam)

# Show the first five sentences, numbered from 1.
for sent_no in range(1, 6):
    sentence = next(catilinam_sents)
    print(f'Sent {sent_no}: {sentence}')

In [None]:
# Words

catilinam_words = T.tokens(catilinam)

# Show the first nine tokens, numbered from 1; the iterator is left partly consumed for the cells below.
for word_no in range(1, 10):
    word = next(catilinam_words)
    print(f'Word {word_no}: {word}')


In [None]:
# List the public (non-underscore) attribute names of the token type.
# Note: this pulls one more token from the shared `catilinam_words` iterator.
token_type = type(next(catilinam_words))
public_attrs = [name for name in dir(token_type) if not name.startswith('_')]
print(public_attrs)

In [None]:
# Pull the next token from the shared `catilinam_words` iterator; `t` is just a short alias for `test_token`.
test_token = t = next(catilinam_words)
# Print the token's text together with its lemma, POS, tag and dependency attributes.
print(t.text, t.lemma_, t.pos_, t.tag_, t.dep_)

In [None]:
# You can pass a preprocessor to `tokens` (or `sents`, etc.)

def custom_preprocess(text):
    """Return `text` lowercased, with every comma and question mark deleted."""
    # A deletion-only translation table: the third argument lists characters to drop.
    drop_marks = str.maketrans('', '', ',?')
    return text.lower().translate(drop_marks)

# Rebind `catilinam_words` to a fresh token iterator that runs `custom_preprocess` first.
catilinam_words = T.tokens(catilinam, preprocess=custom_preprocess)

# Show the first seven preprocessed tokens, numbered from 1.
for word_no in range(1, 8):
    word = next(catilinam_words)
    print(f'Word {word_no}: {word}')


In [None]:
# Tokenized sents

# i.e. each sentence comes back as a list of `(token, lemma, tag)` tuples

catilinam_tokenized_sents = T.tokenized_sents(catilinam)

# Show the first three sentences, each followed by a blank line.
for sent_no in range(1, 4):
    tagged_sent = next(catilinam_tokenized_sents)
    print(f'Tok Sent {sent_no}: {tagged_sent}', end='\n\n')


In [None]:
# Tokenized sents, simplified

# i.e. each sentence comes back as a plain list of tokens

catilinam_tokenized_sents = T.tokenized_sents(catilinam, simple=True)

# Show the first three sentences, each followed by a blank line.
for sent_no in range(1, 4):
    token_list = next(catilinam_tokenized_sents)
    print(f'Tok Sent {sent_no}: {token_list}', end='\n\n')


In [None]:
# POS-tagged sents

# i.e. each sentence comes back as a list of `token/POS` strings

catilinam_pos_sents = T.pos_sents(catilinam)

# Only the first sentence is shown; widen the range to print more.
for sent_no in range(1, 2):
    pos_sent = next(catilinam_pos_sents)
    print(f'POS Sent {sent_no}: {pos_sent}')


In [None]:
# Note spacy objects are output by default
# `catilinam_words` here is whichever token iterator was assigned last (the preprocessed one, if the cells ran in order);
# this call pulls one more token from it.
print(type(next(catilinam_words)))

In [None]:
# Tokens, with plaintext output

# `plaintext=True` switches the output type; the second print below shows the resulting type.
plaintext_tokens = T.tokens(catilinam, plaintext=True)

plaintext_token = next(plaintext_tokens)  # first token only
print(plaintext_token)
print(type(plaintext_token))

In [None]:
# Lines (designed for verse)

# Take the first fileid whose name matches 'aeneid'; `aeneid` is reused by the cells below.
aeneid_matches = T.fileids(match='aeneid')
aeneid = aeneid_matches[0]

aeneid_lines = T.lines(aeneid)

# Show the first eight lines, numbered from 1.
for line_no in range(1, 9):
    verse = next(aeneid_lines)
    print(f'{line_no}: {verse}')

In [None]:
# Lines, maintaining citation information

# Start from a fresh line iterator for the same file.
aeneid_lines = T.lines(aeneid)

# Print the first eight lines, each prefixed with its own citation instead of a counter.
for _ in range(8):
    verse = next(aeneid_lines)
    print(f'{verse._.citation}: {verse}')

In [None]:
# Sentences, maintaining citation information, inc. over line breaks; also maintain metadata

# `aeneid` is the fileid chosen in the Lines cell above.
aeneid_sents = T.sents(aeneid)
aeneid_sent = next(aeneid_sents)  # first sentence only

print(aeneid_sent.text)
# The `._` namespace carries the extra attributes read here: the sentence's citation and its metadata.
print(aeneid_sent._.sentence_citation)
print(aeneid_sent._.metadata)

In [None]:
# Tokens, maintaining citation information, inc. over line breaks; also maintain metadata

# `aeneid` is the fileid chosen in the Lines cell above.
aeneid_words = T.tokens(aeneid)
aeneid_word = next(aeneid_words)  # first token only

print(aeneid_word.text)
# The `._` namespace carries the extra attributes read here: the token's citation and its metadata.
print(aeneid_word._.citation)
print(aeneid_word._.metadata)

## Doc description

In [None]:
# Select files by author plus a filename match; the result is what the (currently disabled) concordance cell below would use.
metamorphoses = T.fileids(author='ovid', match='metamorphoses') # TODO: Add titles to metadata
pprint(metamorphoses)


In [None]:
def custom_preprocess(text):
    """Normalize Latin text to lowercase words separated by single spaces.

    NB: this redefines the `custom_preprocess` from the earlier tokens cell;
    once this cell has run, the name refers to this version.

    Steps, in order:
      1. lowercase the text;
      2. normalize u/v and i/j with CLTK's `JVReplacer`;
      3. replace punctuation, assorted symbols and ASCII digits with spaces;
      4. collapse every run of whitespace to one space and trim both ends.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string ('' when only removed characters were present).
    """
    # Imported inside the function, as in the original cell.
    from cltk.alphabet.lat import JVReplacer
    replacer = JVReplacer()

    text = text.lower() # Lowercase
    text = replacer.replace(text)  # Normalize u/v & i/j

    # Characters to blank out. The backslash is spelled `\\` on purpose: the
    # previous `\]` was an invalid escape sequence (SyntaxWarning on recent
    # Pythons) that only happened to yield a backslash followed by `]`.
    punctuation = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
    misc = '¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）'
    digits = '0123456789'

    # One table for everything: each listed character becomes a single space.
    translator = str.maketrans({key: " " for key in misc + punctuation + digits})
    text = text.translate(translator)

    # split()/join already removes leading and trailing whitespace.
    return " ".join(text.split())

In [None]:
## Concordance, using Tesserae citations
# Not working

# metamorphoses_concordances = T.concordance(metamorphoses, preprocess=custom_preprocess)

In [None]:
# ## Basic descriptive data; note takes several minutes to run

# tess_describe = T.describe()
# pprint(tess_describe)

Sample output of `T.describe()`:  

{'files': 748,  
 'lexdiv': 24.255701516259066,  
 'secs': 143.71532320976257,  
 'sents': 314436,  
 'vocab': 329693,  
 'words': 7996935}  

In [None]:
## This data can also be returned for individual files or lists of files

# Build the label from `catullus` so it cannot drift from the file that is actually described.
print(f"Stats on just the file '{catullus}'")
pprint(T.describe(catullus))