In [1]:
## Imports

from latincyreaders import TesseraeReader, AnnotationLevel

from pprint import pprint

In [2]:
## Set up reader

T = TesseraeReader()

## Fileids and metadata

In [3]:
## First 10 filenames

files = T.fileids()[:10]
pprint(files)

['ammianus.rerum_gestarum.part.14.tess',
 'ammianus.rerum_gestarum.part.15.tess',
 'ammianus.rerum_gestarum.part.16.tess',
 'ammianus.rerum_gestarum.part.17.tess',
 'ammianus.rerum_gestarum.part.18.tess',
 'ammianus.rerum_gestarum.part.19.tess',
 'ammianus.rerum_gestarum.part.20.tess',
 'ammianus.rerum_gestarum.part.21.tess',
 'ammianus.rerum_gestarum.part.22.tess',
 'ammianus.rerum_gestarum.part.23.tess']


In [4]:
# Get files by pattern match (regex)
files = T.fileids(match='horace')
pprint(files)

['horace.ars_poetica.tess',
 'horace.carmen_saeculare.tess',
 'horace.epistles.tess',
 'horace.epodes.tess',
 'horace.odes.part.1.tess',
 'horace.odes.part.2.tess',
 'horace.odes.part.3.tess',
 'horace.odes.part.4.tess',
 'horace.satires.part.1.tess',
 'horace.satires.part.2.tess']


In [5]:
# Get files by pattern - supports regex
files = T.fileids(match=r'vergil.*aeneid')
pprint(files)

['vergil.aeneid.part.1.tess',
 'vergil.aeneid.part.2.tess',
 'vergil.aeneid.part.3.tess',
 'vergil.aeneid.part.4.tess',
 'vergil.aeneid.part.5.tess',
 'vergil.aeneid.part.6.tess',
 'vergil.aeneid.part.7.tess',
 'vergil.aeneid.part.8.tess',
 'vergil.aeneid.part.9.tess',
 'vergil.aeneid.part.10.tess',
 'vergil.aeneid.part.11.tess',
 'vergil.aeneid.part.12.tess']


In [6]:
# Get files by partial match
files = T.fileids(match='cicero')[:10]
pprint(files)

['cicero.academica.tess',
 'cicero.brutus.tess',
 'cicero.cum_populo_gratias_egit.tess',
 'cicero.de_amicitia.tess',
 'cicero.de_divinatione.tess',
 'cicero.de_domo_sua.tess',
 'cicero.de_fato.tess',
 'cicero.de_finibus_bonorum_et_malorum.part.1.tess',
 'cicero.de_finibus_bonorum_et_malorum.part.2.tess',
 'cicero.de_finibus_bonorum_et_malorum.part.3.tess']


In [7]:
# Case-insensitive regex matching
files = T.fileids(match=r'ovid')
pprint(files)

['ovid.amores.part.1.tess',
 'ovid.amores.part.2.tess',
 'ovid.amores.part.3.tess',
 'ovid.ars_amatoria.part.1.tess',
 'ovid.ars_amatoria.part.2.tess',
 'ovid.ars_amatoria.part.3.tess',
 'ovid.ex_ponto.part.1.tess',
 'ovid.ex_ponto.part.2.tess',
 'ovid.ex_ponto.part.3.tess',
 'ovid.ex_ponto.part.4.tess',
 'ovid.fasti.part.1.tess',
 'ovid.fasti.part.2.tess',
 'ovid.fasti.part.3.tess',
 'ovid.fasti.part.4.tess',
 'ovid.fasti.part.5.tess',
 'ovid.fasti.part.6.tess',
 'ovid.heroides.part.1.1-15.tess',
 'ovid.heroides.part.2.16-21.tess',
 'ovid.ibis.tess',
 'ovid.medicamina_faciei_femineae.tess',
 'ovid.metamorphoses.part.1.tess',
 'ovid.metamorphoses.part.2.tess',
 'ovid.metamorphoses.part.3.tess',
 'ovid.metamorphoses.part.4.tess',
 'ovid.metamorphoses.part.5.tess',
 'ovid.metamorphoses.part.6.tess',
 'ovid.metamorphoses.part.7.tess',
 'ovid.metamorphoses.part.8.tess',
 'ovid.metamorphoses.part.9.tess',
 'ovid.metamorphoses.part.10.tess',
 'ovid.metamorphoses.part.11.tess',
 'ovid.metamor

In [8]:
# A bare name matches anywhere in the filename (note the last result)
files = T.fileids(match='lucretius')
pprint(files)

['lucretius.de_rerum_natura.part.1.tess',
 'lucretius.de_rerum_natura.part.2.tess',
 'lucretius.de_rerum_natura.part.3.tess',
 'lucretius.de_rerum_natura.part.4.tess',
 'lucretius.de_rerum_natura.part.5.tess',
 'lucretius.de_rerum_natura.part.6.tess',
 'polignac.antilucretius.tess']


In [9]:
# Get all files
all_files = T.fileids()
print(f"Total files: {len(all_files)}")

Total files: 900


### Filtering by metadata

The `match` parameter is a regex that can match anywhere in the filename, so a short pattern may return more files than intended. For precise filtering, use metadata.

In [None]:
# Problem: regex match can be too broad
# Searching for "lucretius" also finds "antilucretius"

files = T.fileids(match='lucretius')
pprint(files)
# Note: polignac.antilucretius is included!

In [None]:
# Solution: filter by exact author using metadata
# This gets ONLY Lucretius, not Anti-Lucretius

lucretius_files = [
    fileid for fileid, meta in T.metadata()
    if meta.get('author') == 'Lucretius'
]
pprint(lucretius_files)

In [None]:
# Filter by genre (e.g., find all epic poetry)

epic_files = [
    fileid for fileid, meta in T.metadata()
    if meta.get('genre') == 'epic'
]
print(f"Epic texts: {len(epic_files)} files")
pprint(epic_files[:10])  # First 10

In [None]:
# Filter by date: texts before 50 BCE (negative dates = BCE)

def get_date(meta):
    """Return the text's date as an int year, or None if missing or invalid.

    Negative years are BCE. A missing 'date' key yields None rather than 0,
    so undated texts are never counted in a range that happens to span year 0.

    Args:
        meta: Metadata mapping for one file; the 'date' entry is read via .get().

    Returns:
        The date as an int, or None when it is absent or cannot be parsed.
    """
    try:
        # int(None) raises TypeError, so an absent key falls through to None
        return int(meta.get('date'))
    except (ValueError, TypeError):
        return None

# Texts from before 50 BCE
early_republic = [
    fileid for fileid, meta in T.metadata()
    if (d := get_date(meta)) is not None and d < -50
]
print(f"Texts before 50 BCE: {len(early_republic)} files")
pprint(early_republic[:10])

In [None]:
# Augustan era texts (roughly 43 BCE - 14 CE)

augustan = [
    fileid for fileid, meta in T.metadata()
    if (d := get_date(meta)) is not None and -43 <= d <= 14
]
print(f"Augustan era texts: {len(augustan)} files")
pprint(augustan[:10])

In [None]:
# See all available genres in the corpus

from collections import Counter

genres = Counter(
    meta.get('genre') for _, meta in T.metadata()
    if meta.get('genre')
)
print("Available genres:")
for genre, count in genres.most_common():
    print(f"  {genre}: {count} files")

In [None]:
# Combine filters: lyric poetry from the Augustan era

augustan_lyric = [
    fileid for fileid, meta in T.metadata()
    if meta.get('genre') == 'lyric'
    and (d := get_date(meta)) is not None 
    and -43 <= d <= 14
]
print(f"Augustan lyric poetry: {len(augustan_lyric)} files")
pprint(augustan_lyric)

In [None]:
# Metadata can be accessed without NLP processing (instant)
# Use get_metadata() to avoid loading the spaCy model

catullus = 'catullus.carmina.tess'

# Fast: get metadata directly (no NLP overhead)
meta = T.get_metadata(catullus)

print(f"Metadata for {catullus}:")
pprint(meta)

## Doc structures

In [11]:
# Define a file to work with
catullus = 'catullus.carmina.tess'

In [12]:
## Docs - spaCy Doc objects with NLP annotations

catullus_doc = next(T.docs(catullus))
print(catullus_doc[:500])

Cui dono lepidum novum libellum arido modo pumice expolitum? Corneli, tibi; namque tu solebas meas esse aliquid putare nugas, iam tum cum ausus es unus Italorum omne aevum tribus explicare chartis, doctis, Iuppiter, et laboriosis! quare habe tibi quidquid hoc libelli qualecumque, quod, o patrona virgo, plus uno maneat perenne saeclo. Passer, deliciae meae puellae, quicum ludere, quem in sinu tenere, cui primum digitum dare adpetenti et acris solet incitare morsus, cum desiderio meo nitenti carum nescio quid libet iocari (et solaciolum sui doloris, credo, ut tum gravis adquiescat ardor), tecum ludere sicut ipsa possem et tristis animi levare curas! Tam gratum est mihi quam ferunt puellae pernici aureolum fuisse malum, quod zonam solvit diu ligatam. Lugete, o Veneres Cupidinesque et quantum est hominum venustiorum! passer mortuus est meae puellae, passer, deliciae meae puellae, quem plus illa oculis suis amabat; nam mellitus erat, suamque norat ipsa tam bene quam puella matrem, nec sese 

In [13]:
## Texts - raw strings (zero NLP overhead)

catullus_text = next(T.texts(catullus))
pprint(catullus_text[:400])

('Cui dono lepidum novum libellum arido modo pumice expolitum? Corneli, tibi; '
 'namque tu solebas meas esse aliquid putare nugas, iam tum cum ausus es unus '
 'Italorum omne aevum tribus explicare chartis, doctis, Iuppiter, et '
 'laboriosis! quare habe tibi quidquid hoc libelli qualecumque, quod, o '
 'patrona virgo, plus uno maneat perenne saeclo. Passer, deliciae meae '
 'puellae, quicum ludere, quem in sinu tener')


In [14]:
## Doc Rows - citation -> text mapping

catullus_docrows = next(T.doc_rows(catullus))

print('First 10 citation -> span mappings:')
for i, (citation, span) in enumerate(catullus_docrows.items()):
    if i >= 10:
        break
    print(f"  {citation}: {span.text[:40]}...")

First 10 citation -> span mappings:
  <cat. 1.1>: Cui dono lepidum novum libellum...
  <cat. 1.2>: arido modo pumice expolitum?...
  <cat. 1.3>: Corneli, tibi; namque tu solebas...
  <cat. 1.4>: meas esse aliquid putare nugas,...
  <cat. 1.5>: iam tum cum ausus es unus Italorum...
  <cat. 1.6>: omne aevum tribus explicare chartis,...
  <cat. 1.7>: doctis, Iuppiter, et laboriosis!...
  <cat. 1.8>: quare habe tibi quidquid hoc libelli...
  <cat. 1.9>: qualecumque, quod, o patrona virgo,...
  <cat. 1.10>: plus uno maneat perenne saeclo....


## Doc units

In [15]:
# Another file for examples
catilinam = 'cicero.in_catilinam.tess'

In [16]:
## Paras (not implemented - Tesserae format doesn't have paragraphs)
# Use sents() or lines() instead

In [17]:
# Sents - spaCy Span objects

# Segmentation and tokenization done using la_core_web_lg model
catilinam_sents = T.sents(catilinam)

for i in range(1, 6):
    print(f'Sent {i}: {next(catilinam_sents)}')

Sent 1: quo usque tandem abutere, Catilina, patientia nostra?
Sent 2: quam diu etiam furor iste tuus nos eludet?
Sent 3: quem ad finem sese effrenata iactabit audacia?
Sent 4: nihilne te nocturnum praesidium Palati, nihil urbis vigiliae, nihil timor populi, nihil concursus bonorum omnium, nihil hic munitissimus habendi senatus locus, nihil horum ora voltusque moverunt?
Sent 5: patere tua consilia non sentis, constrictam iam horum omnium scientia teneri coniurationem tuam non vides?


In [18]:
# Tokens - spaCy Token objects

catilinam_tokens = T.tokens(catilinam)

for i in range(1, 10):
    print(f'Word {i}: {next(catilinam_tokens)}')

Word 1: quo
Word 2: usque
Word 3: tandem
Word 4: abutere
Word 5: ,
Word 6: Catilina
Word 7: ,
Word 8: patientia
Word 9: nostra


In [19]:
# spaCy Token has many useful attributes
catilinam_tokens = T.tokens(catilinam)
token = next(catilinam_tokens)
print(f"Available attributes: {[a for a in dir(token) if not a.startswith('_')][:15]}...")

Available attributes: ['ancestors', 'check_flag', 'children', 'cluster', 'conjuncts', 'dep', 'dep_', 'doc', 'ent_id', 'ent_id_', 'ent_iob', 'ent_iob_', 'ent_kb_id', 'ent_kb_id_', 'ent_type']...


In [None]:
# Token linguistic attributes (BASIC level: text, lemma, POS, tag)
# Note: dep_ requires AnnotationLevel.FULL

catilinam_tokens = T.tokens(catilinam)
t = next(catilinam_tokens)
print(f"text: {t.text}, lemma: {t.lemma_}, pos: {t.pos_}, tag: {t.tag_}")

In [22]:
# For custom text processing, work with the raw text or spaCy Doc
# The preprocess parameter has been removed - use spaCy pipeline components instead

# Get text as strings
for token_text in T.tokens(catilinam, as_text=True):
    # Apply your own processing
    processed = token_text.lower()
    print(processed, end=' ')
    break  # Just show first token

quo 

In [23]:
# Tokenized sents - use spaCy directly
# Get (token, lemma, tag) tuples from sentences

catilinam_sents = T.sents(catilinam)

for i in range(1, 4):
    sent = next(catilinam_sents)
    tok_sent = [(t.text, t.lemma_, t.tag_) for t in sent]
    print(f'Tok Sent {i}: {tok_sent}')
    print()

Tok Sent 1: [('quo', 'quo', 'adverb'), ('usque', 'usque', 'adverb'), ('tandem', 'tandem', 'adverb'), ('abutere', 'abutor', 'verb'), (',', ',', 'punc'), ('Catilina', 'Catilina', 'proper_noun'), (',', ',', 'punc'), ('patientia', 'patientia', 'noun'), ('nostra', 'noster', 'adjective'), ('?', '?', 'punc')]

Tok Sent 2: [('quam', 'qui', 'conjunction'), ('diu', 'diu', 'adverb'), ('etiam', 'etiam', 'adverb'), ('furor', 'furor', 'noun'), ('iste', 'iste', 'adjective'), ('tuus', 'tuus', 'adjective'), ('nos', 'nos', 'pronoun'), ('eludet', 'eludo', 'verb'), ('?', '?', 'punc')]

Tok Sent 3: [('quem', 'qui', 'pronoun'), ('ad', 'ad', 'preposition'), ('finem', 'finis', 'noun'), ('sese', 'sui', 'pronoun'), ('effrenata', 'effrenatus', 'verb'), ('iactabit', 'iacto', 'verb'), ('audacia', 'audacia', 'noun'), ('?', '?', 'punc')]



In [24]:
# Tokenized sents, simplified (just tokens as strings)

catilinam_sents = T.sents(catilinam)

for i in range(1, 4):
    sent = next(catilinam_sents)
    tok_sent = [t.text for t in sent]
    print(f'Tok Sent {i}: {tok_sent}')
    print()

Tok Sent 1: ['quo', 'usque', 'tandem', 'abutere', ',', 'Catilina', ',', 'patientia', 'nostra', '?']

Tok Sent 2: ['quam', 'diu', 'etiam', 'furor', 'iste', 'tuus', 'nos', 'eludet', '?']

Tok Sent 3: ['quem', 'ad', 'finem', 'sese', 'effrenata', 'iactabit', 'audacia', '?']



In [25]:
# POS-tagged sents - token/POS pairs

catilinam_sents = T.sents(catilinam)

for i in range(1, 3):
    sent = next(catilinam_sents)
    pos_sent = [f"{t.text}/{t.pos_}" for t in sent]
    print(f'POS Sent {i}: {" ".join(pos_sent)}')

POS Sent 1: quo/ADV usque/ADV tandem/ADV abutere/VERB ,/PUNCT Catilina/PROPN ,/PUNCT patientia/NOUN nostra/DET ?/PUNCT
POS Sent 2: quam/SCONJ diu/ADV etiam/ADV furor/NOUN iste/DET tuus/DET nos/PRON eludet/VERB ?/PUNCT


In [None]:
# spaCy Token objects by default
catilinam_tokens = T.tokens(catilinam)

catilinam_token = next(catilinam_tokens)
print(catilinam_token)
# Report the type of the token just printed; calling next() again here
# would consume a second token from the generator.
print(type(catilinam_token))

quo
<class 'spacy.tokens.token.Token'>


In [31]:
# Tokens as plain strings with as_text=True

plaintext_tokens = T.tokens(catilinam, as_text=True)

plaintext_token = next(plaintext_tokens)
print(plaintext_token)
print(type(plaintext_token))

quo
<class 'str'>


In [32]:
# Lines (citation units from the Tesserae format)

aeneid = T.fileids(match='aeneid')[0]

aeneid_lines = T.lines(aeneid)

for i in range(1, 9):
    print(f'{i}: {next(aeneid_lines)}')

1: Arma virumque cano, Troiae qui primus ab oris
2: Italiam, fato profugus, Laviniaque venit
3: litora, multum ille et terris iactatus et alto
4: vi superum saevae memorem Iunonis ob iram;
5: multa quoque et bello passus, dum conderet urbem,
6: inferretque deos Latio, genus unde Latinum,
7: Albanique patres, atque altae moenia Romae.
8: Musa, mihi causas memora, quo numine laeso,


In [33]:
# Lines with citation information preserved

aeneid_lines = T.lines(aeneid)

for i in range(1, 9):
    line = next(aeneid_lines)
    print(f'{line._.citation}: {line}')

<verg. aen. 1.1>: Arma virumque cano, Troiae qui primus ab oris
<verg. aen. 1.2>: Italiam, fato profugus, Laviniaque venit
<verg. aen. 1.3>: litora, multum ille et terris iactatus et alto
<verg. aen. 1.4>: vi superum saevae memorem Iunonis ob iram;
<verg. aen. 1.5>: multa quoque et bello passus, dum conderet urbem,
<verg. aen. 1.6>: inferretque deos Latio, genus unde Latinum,
<verg. aen. 1.7>: Albanique patres, atque altae moenia Romae.
<verg. aen. 1.8>: Musa, mihi causas memora, quo numine laeso,


In [None]:
# Sentences with citation information
# Sentences can span multiple citation lines

aeneid_doc = next(T.docs(aeneid))
line_spans = aeneid_doc.spans.get("lines", [])

# Show first few sentences with their citation ranges
for sent_no, sent in enumerate(aeneid_doc.sents):
    if sent_no >= 3:
        break
    # A citation line belongs to the sentence when their token ranges intersect
    cits = [
        ln._.citation for ln in line_spans
        if ln.start < sent.end and ln.end > sent.start
    ]
    if not cits:
        cit_range = "?"
    elif len(cits) == 1:
        cit_range = cits[0]
    else:
        cit_range = f"{cits[0]}–{cits[-1]}"
    print(f"{cit_range}")
    print(f"  {sent.text[:80]}...")
    print()

In [36]:
# Tokens within citation lines
# Access citation via the line spans

aeneid_doc = next(T.docs(aeneid))
line = aeneid_doc.spans["lines"][0]

print(f"Line citation: {line._.citation}")
print(f"Tokens in line: {[t.text for t in line]}")

Line citation: <verg. aen. 1.1>
Tokens in line: ['Arma', 'virum', 'que', 'cano', ',', 'Troiae', 'qui', 'primus', 'ab', 'oris']


## Doc description

In [38]:
# Get files by pattern; `match` is a regex, so escape the literal dot
# (an unescaped '.' would match any character)
metamorphoses = T.fileids(match=r'ovid\.metamorphoses')
pprint(metamorphoses)

['ovid.metamorphoses.part.1.tess',
 'ovid.metamorphoses.part.2.tess',
 'ovid.metamorphoses.part.3.tess',
 'ovid.metamorphoses.part.4.tess',
 'ovid.metamorphoses.part.5.tess',
 'ovid.metamorphoses.part.6.tess',
 'ovid.metamorphoses.part.7.tess',
 'ovid.metamorphoses.part.8.tess',
 'ovid.metamorphoses.part.9.tess',
 'ovid.metamorphoses.part.10.tess',
 'ovid.metamorphoses.part.11.tess',
 'ovid.metamorphoses.part.12.tess',
 'ovid.metamorphoses.part.13.tess',
 'ovid.metamorphoses.part.14.tess',
 'ovid.metamorphoses.part.15.tess']


In [39]:
# Custom text normalization example
# Use this pattern when you need specific preprocessing

def normalize_latin(text):
    """Lowercase Latin text, fold v->u and j->i, drop ASCII punctuation,
    and collapse all whitespace runs to single spaces."""
    import string

    # One translation table covers both the orthographic folding and the
    # punctuation removal (the third argument lists characters to delete).
    fold_and_strip = str.maketrans('vj', 'ui', string.punctuation)
    cleaned = text.lower().translate(fold_and_strip)
    # split() with no argument also discards leading/trailing whitespace
    return " ".join(cleaned.split())

# Example usage on raw text
sample = next(T.texts(metamorphoses[0]))[:100]
print(f"Original: {sample}")
print(f"Normalized: {normalize_latin(sample)}")

Original: In nova fert animus mutatas dicere formas corpora; di, coeptis (nam vos mutastis et illas) adspirate
Normalized: in noua fert animus mutatas dicere formas corpora di coeptis nam uos mutastis et illas adspirate


In [None]:
## Concordance

# Build a concordance: word -> list of citations where it appears
# Group by lemma (default) to see all forms of a word together

catullus_conc = T.concordance(fileids=catullus, basis="lemma")

print(f"Unique lemmas in Catullus: {len(catullus_conc)}")
print()

# Look up a specific lemma
if "amor" in catullus_conc:
    print("Citations for 'amor':")
    for cit in catullus_conc["amor"][:10]:
        print(f"  {cit}")
    if len(catullus_conc["amor"]) > 10:
        print(f"  ... and {len(catullus_conc['amor']) - 10} more")

In [None]:
# Concordance by surface text form (exact spelling)
catullus_conc_text = T.concordance(fileids=catullus, basis="text")

# Different forms of 'puella' (girl)
puella_forms = ["puella", "puellae", "puellam", "puellas", "puellis"]
print("Occurrences of 'puella' forms in Catullus:")
for form in puella_forms:
    if form in catullus_conc_text:
        count = len(catullus_conc_text[form])
        print(f"  {form}: {count} occurrences")

## KWIC (Keyword in Context)

Find words with surrounding context - useful for studying word usage patterns.

In [None]:
# Basic KWIC search - find "amor" with 5 tokens of context on each side
for hit in T.kwic("amor", fileids=catullus, window=5, limit=5):
    print(f"{hit['left']} [{hit['match']}] {hit['right']}")
    print(f"  -- {hit['citation']}")
    print()

In [None]:
# KWIC by lemma - finds all forms of a word (e.g., amo, amat, amant, amavit)
# Use by_lemma=True to match against lemmatized forms

for hit in T.kwic("amo", fileids=catullus, by_lemma=True, window=4, limit=5):
    print(f"{hit['left']} [{hit['match']}] {hit['right']}")
    print(f"  -- {hit['citation']}")
    print()

In [42]:
## Basic descriptive stats
# Count files, estimate tokens, etc.

# Quick corpus overview
files = T.fileids()
print(f"Total files: {len(files)}")

# Sample stats from one file
sample_file = files[0]
sample_text = next(T.texts(sample_file))
print(f"\nSample file: {sample_file}")
print(f"Character count: {len(sample_text)}")
print(f"Word count (approx): {len(sample_text.split())}")

Total files: 900

Sample file: ammianus.rerum_gestarum.part.14.tess
Character count: 63253
Word count (approx): 8201


### Sample output for full corpus

A full describe() method will be added in a future release.

In [43]:
## Stats for a specific file

catullus_doc = next(T.docs(catullus))

print(f'Stats for {catullus}:')
print(f'  Sentences: {len(list(catullus_doc.sents))}')
print(f'  Tokens: {len(catullus_doc)}')
print(f'  Citation lines: {len(catullus_doc.spans.get("lines", []))}')

Stats for catullus.carmina.tess:
  Sentences: 913
  Tokens: 15536
  Citation lines: 2285


## New Features in latincyreaders

The following sections demonstrate new search and filtering capabilities.

In [44]:
# search() - fast regex search across the corpus (no NLP required)
from itertools import islice

# Find lines mentioning Thebes (limit to first 5 results)
results = T.search(r'\bTheb\w+\b')
for fileid, citation, text, matches in islice(results, 5):
    print(f"{fileid} {citation}: found {matches}")
    print(f"  → {text[:60]}...")
    print()

ammianus.rerum_gestarum.part.14.tess <amm. 14.11.15>: found ['Thebaeas']
  → Emensis itaque longis intervallis et planis, cum Hadrianopol...

ammianus.rerum_gestarum.part.15.tess <amm. 15.10.9>: found ['Thebaeus']
  → Et primam Thebaeus Hercules, ad Geryonem exstinguendum (ut r...

ammianus.rerum_gestarum.part.17.tess <amm. 17.4.2>: found ['Thebas', 'Thebais']
  → Urbem priscis saeculis conditam, ambitiosa moenium strue et ...

ammianus.rerum_gestarum.part.19.tess <amm. 19.12.3>: found ['Thebaidis']
  → Materiam autem in infinitum quaestionibus extendendis dedit ...

ammianus.rerum_gestarum.part.22.tess <amm. 22.16.1>: found ['Thebaida']
  → Tres provincias Aegyptus fertur habuisse temporibus priscis,...



In [45]:
# find_lines() - find citation lines containing specific words/patterns

# Find lines with specific word forms
forms = ["Thebas", "Thebarum", "Thebis"]
for fileid, citation, text in islice(T.find_lines(forms=forms), 5):
    print(f"{citation}: {text[:70]}...")

<amm. 17.4.2>: Urbem priscis saeculis conditam, ambitiosa moenium strue et portarum c...
<amm. 22.16.2>: Igitur Thebais multas inter urbes clariores aliis Hermopolim habet, et...
<apul.fl. 22>: Crates ille Diogenis sectator, qui ut lar familiaris apud homines aeta...
<apul.met. 4.9>: Suscipit unus ex illo posteriore numero: Tune solus ignoras longe faci...
<aus. epit. 27.1>: THEBARUM regina fui, Sipyleia cautes...


In [46]:
# find_sents() - find sentences containing specific words
# Fast path: search by exact forms (uses regex, minimal NLP)

for hit in islice(T.find_sents(forms=["Caesar", "Caesarem", "Caesaris"]), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched: {hit['matches']}")
    print()

<amm. 14.1.0>: Galli Caesaris saevitia....
  Matched: ['Caesaris']

<amm. 14.1.1>: Post emensos insuperabilis expeditionis eventus, languentibus partium animis, qu...
  Matched: ['Caesaris']

<amm. 14.1.5>: sed quidquid Caesaris implacabilitati sedisset, id velut fas iusque perpensum, c...
  Matched: ['Caesaris']

<amm. 14.1.6>: Hi peragranter et dissimulanter honoratorum circulis assistendo, pervadendoque d...
  Matched: ['Caesaris']

<amm. 14.1.10>: Quibus mox Caesar acrius efferatus, velut contumaciae quoddam vexillum altius er...
  Matched: ['Caesar']



In [47]:
# find_sents() by lemma - slower but finds ALL forms
# Uses NLP to lemmatize, so it catches forms you might miss

# Find all sentences with any form of "bellum" (war)
for hit in islice(T.find_sents(lemma="bellum"), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched forms: {hit['matches']}")
    print()

<amm. 14.2.1>: Namque et Isauri, quibus est usitatum saepe pacari, saepeque inopinis excursibus...
  Matched forms: ['bella']

<amm. 14.3.1>: Eo adducta re per Isauriam, rege Persarum bellis finitimis illigato, repellenteq...
  Matched forms: ['bellis']

<amm. 14.6.4>: Eius populus ab incunabulis primis ad usque pueritiae tempus extremum, quod anni...
  Matched forms: ['bella']

<amm. 14.6.4>: deinde aetatem ingressus adultam, post multiplices bellorum aerumnas, Alpes tran...
  Matched forms: ['bellorum']

<amm. 14.6.10>: Alii nullo quaerente, vultus severitate assimulata, patrimonia sua in immensum e...
  Matched forms: ['bella']



In [48]:
# find_sents() with spaCy Matcher patterns - advanced pattern matching
# Search for ADJ + NOUN sequences (e.g., "magna voce", "pulchra puella")

pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}]
for hit in islice(T.find_sents(matcher_pattern=pattern, fileids=T.fileids(match="catullus")), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched: {hit['matches']}")
    print()

<cat. 1.1>: Cui dono lepidum novum libellum arido modo pumice expolitum?...
  Matched: ['novum libellum']

<cat. 1.8>: quare habe tibi quidquid hoc libelli qualecumque, quod, o patrona virgo, plus un...
  Matched: ['perenne saeclo']

<cat. 2.1>: Passer, deliciae meae puellae, quicum ludere, quem in sinu tenere, cui primum di...
  Matched: ['primum digitum', 'tristis animi']

<cat. 3.13>: at vobis male sit, malae tenebrae Orci, quae omnia bella devoratis;...
  Matched: ['malae tenebrae']

<cat. 3.16>: o miselle passer!...
  Matched: ['miselle passer']



In [49]:
# More complex Matcher patterns
# Find sentences with a specific lemma followed by a noun

pattern = [{"LEMMA": "magnus"}, {"POS": "NOUN"}]
for hit in islice(T.find_sents(matcher_pattern=pattern), 5):
    print(f"{hit['citation']}: {hit['matches']}")

<amm. 14.2.8>: ['magna parte']
<amm. 15.5.30>: ['magna industria']
<amm. 15.5.34>: ['magnis vocibus']
<amm. 15.7.10>: ['magna difficultate']
<amm. 15.8.18>: ['magnis viribus']


### Annotation Levels

Control NLP processing overhead with `AnnotationLevel`:

In [51]:
# AnnotationLevel controls how much NLP processing to apply

# NONE - use texts() for raw strings (fastest)
# TOKENIZE - tokenization + sentence boundaries only
# BASIC - adds lemmatization and POS tagging (default)
# FULL - full pipeline including NER and dependency parsing

# Create readers with different annotation levels
reader_fast = TesseraeReader(annotation_level=AnnotationLevel.TOKENIZE)
reader_full = TesseraeReader(annotation_level=AnnotationLevel.FULL)

print("Available annotation levels:")
for level in AnnotationLevel:
    print(f"  {level.name}: {level.value}")

Available annotation levels:
  NONE: none
  TOKENIZE: tokenize
  BASIC: basic
  FULL: full


In [52]:
# Export search results to TSV, CSV, or JSONL

results = T.find_sents(forms=["amor", "amoris", "amorem"], fileids=T.fileids(match="catullus"))
export = T.export_search_results(results, format="tsv")

print("TSV export (first 500 chars):")
print(export[:500])

TSV export (first 500 chars):
fileid	citation	matches	sentence
catullus.carmina.tess	<cat. 11.21>	amorem	nec meum respectet, ut ante, amorem, qui illius culpa cecidit velut prati ultimi flos, praetereunte postquam tactus aratro est.
catullus.carmina.tess	<cat. 30.6>	amorem	certe tute iubebas animam tradere, inique, me inducens in amorem, quasi tuta omnia mi forent.
catullus.carmina.tess	<cat. 45.7>	Amor	” hoc ut dixit, Amor, sinistra ut ante, dextra sternuit adprobationem.
catullus.carmina.tess	<cat. 55.17>	amoris	si linguam
