# Introduction to CLTK Readers

CLTK Readers is a corpus reader extension written for use with the Classical Language Toolkit, LatinCy, etc. It gives single-line API access to philologically sensible units (e.g. words, sentences, paragraphs, documents) for basic processing and analysis of Latin text collections. Some readers have extended, higher-order functionality, such as the concordancer for the CLTK Tesserae texts shown below. The repository for CLTK Readers can be found [here](https://github.com/diyclassics/cltk_readers).

In [None]:
## Imports

from cltkreaders.lat import LatinTesseraeCorpusReader
from os.path import expanduser
from natsort import natsorted
from pprint import pprint

In [None]:
## Set up reader
# NB: If you do not have the CLTK-Tesserae corpus already installed in CLTK_DATA, you will be prompted to download the corpus.

# The reader is created with its default arguments and bound to `T`, which the cells below call into.
T = LatinTesseraeCorpusReader()

## Fileids

In [None]:
## First 10 filenames

# The result of `fileids()` is sliced, so only its first ten entries are printed.
pprint(T.fileids()[:10])

In [None]:
## First 10 works of Cicero

# Keep every fileid whose name contains the substring 'cicero', then show the first ten matches.
cicero = list(filter(lambda fileid: 'cicero' in fileid, T.fileids()))
pprint(cicero[:10])

In [None]:
## Books of the Aeneid, sorted

# Select the fileids containing 'aeneid' and order them with natsort's natural sort.
aeneid = natsorted(list(filter(lambda fileid: 'aeneid' in fileid, T.fileids())))
pprint(aeneid)

## Doc structures

In [None]:
# Fileid passed to the `docs`, `texts`, `doc_rows` and `describe` examples below
catullus = 'catullus.carmina.tess'

In [None]:
## Docs

# `docs()` returns an iterator; only its first item is pulled with `next()` here
catullus_doc = T.docs(catullus)
# Show only the first 446 characters (assuming the item is a string) to keep the output short
print(next(catullus_doc)[:446])

In [None]:
## Texts

# `texts()` returns an iterator; only its first item is pulled with `next()` here
catullus_text = T.texts(catullus)
# Show only the first 335 characters (assuming the item is a string) to keep the output short
print(next(catullus_text)[:335])

In [None]:
## Doc Rows

# Each item pulled from `doc_rows` is used below as a dict (it is passed to `str()` and `.items()`)
catullus_docrows = T.doc_rows(catullus)

print('This is a string representation of what the output dictionary looks like...')
# Only the first 94 characters of the repr are shown; the doubled `}}` prints a literal `}` after " etc. "
print(f'{str(next(catullus_docrows))[:94]} etc. }}\n')


# A new iterator is requested because the previous one was already advanced by `next()` above
# (presumably this yields the same first item again -- confirm against the reader)
catullus_docrows = T.doc_rows(catullus)
print('Here are the first 10 items of the dict output...')
pprint(list(next(catullus_docrows).items())[:10])


## Doc units

In [None]:
# Fileid passed to the `sents`, `words` and `tokenized_sents` examples below
catilinam = 'cicero.in_catilinam.tess'

In [None]:
## Paras

# No reader method is called in this cell; it only prints an explanatory note.
print("Note that for the Tesserae texts, `paras` are *not* implemented. As they are not consistently marked in the original files.")

In [None]:
# Sents

# NB: Sents are segmented by default with the LatinCy dependency parser

catilinam_sents = T.sents(catilinam)

# range(1, 6) runs i = 1..5, so exactly five sentences are pulled from the iterator with `next()`
for i in range(1,6):
    print(f'Sent {i}: {next(catilinam_sents)}')

In [None]:
# Words

# NB: Words are tokenized by default with the LatinCy tokenizer

catilinam_words = T.words(catilinam)

# range(1, 10) runs i = 1..9, so nine words (not ten) are pulled from the iterator with `next()`
for i in range(1,10):
    print(f'Word {i}: {next(catilinam_words)}')


In [None]:
# You can pass a preprocessor to `words`

def custom_preprocess(text):
    """Return `text` converted to lowercase."""
    return text.lower()

# The function is handed to the reader through the `preprocess` keyword argument
catilinam_words = T.words(catilinam, preprocess=custom_preprocess)

# range(1, 10) runs i = 1..9, so nine words are pulled from the iterator with `next()`
for i in range(1,10):
    print(f'Word {i}: {next(catilinam_words)}')


In [None]:
# Tokenized sents

# A combination of the two structures above; convenient for many applications that require lists of tokenized sentences

catilinam_tokenized_sents = T.tokenized_sents(catilinam)

# range(1, 10) runs i = 1..9, so nine tokenized sentences are pulled from the iterator with `next()`
for i in range(1,10):
    print(f'Tok Sent {i}: {next(catilinam_tokenized_sents)}')


## Doc description

In [None]:
# Naturally sorted list of every fileid containing 'ovid.metamorphoses'
metamorphoses = natsorted(list(filter(lambda fileid: 'ovid.metamorphoses' in fileid, T.fileids())))

In [None]:
def custom_preprocess(text):
    """Normalize a text before it is handed to the concordancer.

    The text is lowercased and passed through CLTK's ``JVReplacer``. Every
    punctuation mark, miscellaneous symbol and ASCII digit listed below is
    then replaced by a space, and runs of whitespace are collapsed.

    Args:
        text: The raw text to normalize.

    Returns:
        The normalized text, with tokens separated by single spaces and no
        leading or trailing whitespace.
    """
    from cltk.alphabet.lat import JVReplacer

    text = text.lower()  # Lowercase
    text = JVReplacer().replace(text)  # Normalize u/v & i/j

    # Characters to blank out. The backslash is written as `\\` so that the
    # literal holds a real backslash without an invalid `\]` escape sequence.
    punctuation = "\"#$%&'()*+,/:;<=>@[\\]^_`{|}~.?!«»—“-”"
    misc = '¡£¤¥¦§¨©¯°±²³´µ¶·¸¹º¼½¾¿÷·–‘’†•ↄ∞⏑〈〉（）'
    digits = '0123456789'

    # One translation table covers punctuation, symbols and numbers, so the
    # text is scanned only once.
    translator = str.maketrans({char: " " for char in misc + punctuation + digits})
    text = text.translate(translator)

    # split() with no argument already discards leading/trailing whitespace.
    return " ".join(text.split())

In [None]:
## Concordance, using Tesserae citations

# `concordance` receives the list of Metamorphoses fileids and the `custom_preprocess` function
metamorphoses_concordances = T.concordance(metamorphoses, preprocess=custom_preprocess)

# Pull the first concordance from the iterator and preview three of its (key, value) pairs
met_conc_sample = next(metamorphoses_concordances)
pprint(list(met_conc_sample.items())[:3])

In [None]:
# Concordances are by default built on a file-by-file basis, but can easily be combined with the `compiled` parameter

metamorphoses_concordances = T.concordance(metamorphoses, compiled=True, preprocess=custom_preprocess)

# Pull the first item from the iterator and preview its entries at positions 96 through 101
full_met_conc_sample = next(metamorphoses_concordances)
pprint(list(full_met_conc_sample.items())[96:102])

In [None]:
# Since the concordances are output as dictionaries, you can retrieve location information using the token as a dict key...

metamorphoses_concordances = T.concordance(metamorphoses, compiled=True, preprocess=custom_preprocess)
full_met_conc_sample = next(metamorphoses_concordances)

# Plain subscripting is used, so a token missing from the concordance raises KeyError
print(f'\'corpus\' appears {len(full_met_conc_sample["corpus"])} times in the Metamorphoses.')
print('Here are the first five instances...')
print(full_met_conc_sample['corpus'][:5])

# Blank line between the two reports
print()

print(f'\'corpora\' appears {len(full_met_conc_sample["corpora"])} times in the Metamorphoses.')
print('Here are the first five instances...')
print(full_met_conc_sample['corpora'][:5])

In [None]:
## Basic descriptive data; this data can also be returned for individual files or lists of files
# Here just Catullus...

print('Stats on just the file \'catullus.carmina.tess\'')
# `catullus` is a single fileid string, so `describe` is called for one file only
pprint(T.describe(catullus))

In [None]:
print('Stats on just the group of files assigned above to the variable `metamorphoses`')
# `metamorphoses` is a list of fileids, so `describe` is called for a group of files
pprint(T.describe(metamorphoses))

In [None]:
# ## Basic descriptive data; note that this takes several minutes to run

# tess_describe = T.describe()
# pprint(tess_describe)