In [None]:
## Imports

from collections import Counter
from itertools import islice
from pprint import pprint

from latincyreaders import TesseraeReader, AnnotationLevel

In [None]:
## Set up reader
# Instantiate the reader with no arguments (library defaults).
# `T` is the shared reader object that every later cell calls.

T = TesseraeReader()

## Fileids and metadata

In [None]:
## First 10 filenames
# fileids() with no arguments is unfiltered; the slice keeps the first ten entries.

files = T.fileids()[:10]
pprint(files)

In [None]:
# Get files by pattern match (regex)
# `match` is interpreted as a regular expression (see "Filtering by metadata" below).
files = T.fileids(match='horace')
pprint(files)

In [None]:
# Get files by pattern - supports regex
# `.*` allows any characters between "vergil" and "aeneid" in the fileid.
files = T.fileids(match=r'vergil.*aeneid')
pprint(files)

In [None]:
# Get files by partial match
# Only the first ten matching fileids are kept (presumably many files match 'cicero').
files = T.fileids(match='cicero')[:10]
pprint(files)

In [None]:
# Case-insensitive regex matching
# NOTE(review): this lowercase pattern would match either way, so the cell does
# not itself demonstrate case-insensitivity; try r'OVID' to confirm the behavior.
files = T.fileids(match=r'ovid')
pprint(files)

In [None]:
# Multiple pattern examples
# Regex alternation (`|`) selects fileids that match either pattern in one call.
files = T.fileids(match=r'lucretius|catullus')
pprint(files)

In [None]:
# Get all files
# No `match` argument, so no filtering is applied.
all_files = T.fileids()
print(f"Total files: {len(all_files)}")

### Filtering by metadata

The `match` parameter uses regex, which can be too broad. For precise filtering, use metadata.

In [None]:
# Problem: regex match can be too broad
# Searching for "lucretius" also finds "antilucretius"
# (the pattern is not anchored, so it can match in the middle of a fileid)

files = T.fileids(match='lucretius')
pprint(files)
# Note: polignac.antilucretius is included!

In [None]:
# Solution: compare the 'author' metadata field for exact equality
# Only files whose author is exactly 'Lucretius' are kept (no Anti-Lucretius)

lucretius_files = list(
    fid for fid, info in T.metadata() if info.get('author') == 'Lucretius'
)
pprint(lucretius_files)

In [None]:
# Select every file whose 'genre' metadata field equals 'epic'

epic_files = [
    fid
    for fid, info in T.metadata()
    if info.get('genre') == 'epic'
]
print(f"Epic texts: {len(epic_files)} files")
# Show only the first ten to keep the output short
pprint(epic_files[:10])

In [None]:
# Filter by date: texts before 50 BCE (negative dates = BCE)

def get_date(meta):
    """Return the ``date`` field of a metadata mapping as an int, or None.

    Args:
        meta: Mapping of metadata fields for one file (only ``.get`` is used).

    Returns:
        The date as an int (this notebook treats negative values as BCE), or
        ``None`` when the field is absent or cannot be parsed as an integer.
    """
    # Deliberately no default: a missing date must not be read as year 0,
    # which would slip through range filters such as ``-43 <= d <= 14``.
    raw_date = meta.get('date')
    try:
        return int(raw_date)
    except (ValueError, TypeError):
        # ValueError: unparseable string; TypeError: None or another non-number.
        return None

# Keep fileids whose parsed date exists and is earlier than -50 (50 BCE)
early_republic = [
    fileid
    for fileid, year in ((fid, get_date(meta)) for fid, meta in T.metadata())
    if year is not None and year < -50
]
print(f"Texts before 50 BCE: {len(early_republic)} files")
pprint(early_republic[:10])

In [None]:
# Augustan era texts: parsed date between -43 and 14 inclusive (roughly 43 BCE - 14 CE)

augustan = [
    fileid
    for fileid, year in ((fid, get_date(meta)) for fid, meta in T.metadata())
    if year is not None and -43 <= year <= 14
]
print(f"Augustan era texts: {len(augustan)} files")
pprint(augustan[:10])

In [None]:
# See all available genres in the corpus

# Tally the non-empty 'genre' value of every file's metadata.
genres = Counter(
    meta.get('genre') for _, meta in T.metadata()
    if meta.get('genre')
)
print("Available genres:")
# most_common() with no argument lists every genre, most frequent first.
for genre, count in genres.most_common():
    print(f"  {genre}: {count} files")

In [None]:
# Combined filter: genre must be 'lyric' AND the parsed date must fall in -43..14

augustan_lyric = [
    fileid
    for fileid, genre, year in (
        (fid, meta.get('genre'), get_date(meta)) for fid, meta in T.metadata()
    )
    if genre == 'lyric' and year is not None and -43 <= year <= 14
]
print(f"Augustan lyric poetry: {len(augustan_lyric)} files")
pprint(augustan_lyric)

In [None]:
# Metadata can be accessed without NLP processing (instant)
# Use get_metadata() to avoid loading the spaCy model

# `catullus` is reused as the example fileid by later cells.
catullus = 'catullus.carmina.tess'

# Fast: get metadata directly (no NLP overhead)
meta = T.get_metadata(catullus)

print(f"Metadata for {catullus}:")
pprint(meta)

## Doc structures

In [None]:
# Define a file to work with
# Same fileid as in the metadata cell above; presumably repeated so this
# section can be run on its own.
catullus = 'catullus.carmina.tess'

In [None]:
## Docs - spaCy Doc objects with NLP annotations

# next() takes the first item yielded by docs() for this fileid.
catullus_doc = next(T.docs(catullus))
# NOTE(review): if this is a spaCy Doc, [:500] slices by token, not by character - confirm.
print(catullus_doc[:500])

In [None]:
## Texts - raw strings (zero NLP overhead)

# next() takes the first item yielded by texts() for this fileid.
catullus_text = next(T.texts(catullus))
# Show only the first 400 characters of the raw text.
pprint(catullus_text[:400])

In [None]:
## Doc Rows - mapping from citation to the text span for that citation

catullus_docrows = next(T.doc_rows(catullus))

print('First 10 citation -> span mappings:')
# Materialize the (citation, span) pairs and keep the first ten
first_rows = list(catullus_docrows.items())[:10]
for citation, span in first_rows:
    # Truncate each span's text to 40 characters for display
    print(f"  {citation}: {span.text[:40]}...")

## Doc units

In [None]:
# Another file for examples
# Reused as `catilinam` by the sentence and token cells that follow.
catilinam = 'cicero.in_catilinam.tess'

In [None]:
## Paras (not implemented - Tesserae format doesn't have paragraphs)
# Use sents() or lines() instead

In [None]:
# Sents - spaCy Span objects

# Segmentation and tokenization done using la_core_web_lg model
catilinam_sents = T.sents(catilinam)

# First five sentences. islice simply stops if the file has fewer,
# where a bare next() would raise StopIteration.
for i, sent in enumerate(islice(catilinam_sents, 5), start=1):
    print(f'Sent {i}: {sent}')

In [None]:
# Tokens - spaCy Token objects

catilinam_tokens = T.tokens(catilinam)

# First nine tokens. islice simply stops if the file has fewer,
# where a bare next() would raise StopIteration.
for i, token in enumerate(islice(catilinam_tokens, 9), start=1):
    print(f'Word {i}: {token}')

In [None]:
# List the first 15 public (non-underscore) attribute names of a token
catilinam_tokens = T.tokens(catilinam)
token = next(catilinam_tokens)
public_attrs = [a for a in dir(token) if not a.startswith('_')]
print(f"Available attributes: {public_attrs[:15]}...")

In [None]:
# Token linguistic attributes (BASIC level: text, lemma, POS, tag)
# Note: dep_ requires AnnotationLevel.FULL

catilinam_tokens = T.tokens(catilinam)
# Inspect only the first token produced by the iterator.
t = next(catilinam_tokens)
print(f"text: {t.text}, lemma: {t.lemma_}, pos: {t.pos_}, tag: {t.tag_}")

In [None]:
# For custom text processing, work with the raw text or spaCy Doc
# The preprocess parameter has been removed - use spaCy pipeline components instead

# Get text as strings
# (as_text=True makes tokens() yield plain strings, so str methods apply directly)
for token_text in T.tokens(catilinam, as_text=True):
    # Apply your own processing
    processed = token_text.lower()
    print(processed, end=' ')
    break  # Just show first token

In [None]:
# Tokenized sents - use spaCy directly
# Get (token, lemma, tag) tuples from sentences

catilinam_sents = T.sents(catilinam)

# First three sentences. islice stops quietly on a shorter file
# instead of raising StopIteration from a bare next().
for i, sent in enumerate(islice(catilinam_sents, 3), start=1):
    tok_sent = [(t.text, t.lemma_, t.tag_) for t in sent]
    print(f'Tok Sent {i}: {tok_sent}')
    print()

In [None]:
# Tokenized sents, simplified (just tokens as strings)

catilinam_sents = T.sents(catilinam)

# First three sentences. islice stops quietly on a shorter file
# instead of raising StopIteration from a bare next().
for i, sent in enumerate(islice(catilinam_sents, 3), start=1):
    tok_sent = [t.text for t in sent]
    print(f'Tok Sent {i}: {tok_sent}')
    print()

In [None]:
# POS-tagged sents - token/POS pairs

catilinam_sents = T.sents(catilinam)

# First two sentences. islice stops quietly on a shorter file
# instead of raising StopIteration from a bare next().
for i, sent in enumerate(islice(catilinam_sents, 2), start=1):
    pos_sent = [f"{t.text}/{t.pos_}" for t in sent]
    print(f'POS Sent {i}: {" ".join(pos_sent)}')

In [None]:
# spaCy Token objects by default
catilinam_tokens = T.tokens(catilinam)

catilinam_token = next(catilinam_tokens)
print(catilinam_token)
# Report the type of the token just printed, not of a second token pulled
# from the iterator (mirrors the as_text=True cell below).
print(type(catilinam_token))

In [None]:
# Tokens as plain strings with as_text=True

plaintext_tokens = T.tokens(catilinam, as_text=True)

# Take the first item and show both its value and its Python type.
plaintext_token = next(plaintext_tokens)
print(plaintext_token)
print(type(plaintext_token))

In [None]:
# Lines (citation units from the Tesserae format)

# First fileid matching 'aeneid'; reused as `aeneid` by the cells below.
aeneid = T.fileids(match='aeneid')[0]

aeneid_lines = T.lines(aeneid)

# First eight lines. islice stops quietly on a shorter file
# instead of raising StopIteration from a bare next().
for i, line in enumerate(islice(aeneid_lines, 8), start=1):
    print(f'{i}: {line}')

In [None]:
# Lines with citation information preserved

aeneid_lines = T.lines(aeneid)

# First eight lines, each labelled with the citation stored on `line._.citation`.
# islice stops quietly on a shorter file instead of raising StopIteration.
for line in islice(aeneid_lines, 8):
    print(f'{line._.citation}: {line}')

In [None]:
# Sentences with citation information
# A sentence may cover several citation lines, so report a first–last range

aeneid_doc = next(T.docs(aeneid))
line_spans = aeneid_doc.spans.get("lines", [])

# Only the first three sentences are shown
for i, sent in enumerate(aeneid_doc.sents):
    if i == 3:
        break
    # Collect the citation of every line span that overlaps this sentence
    overlapping = []
    for span in line_spans:
        if span.start < sent.end and span.end > sent.start:
            overlapping.append(span._.citation)
    if not overlapping:
        cit_range = "?"
    elif len(overlapping) > 1:
        cit_range = f"{overlapping[0]}–{overlapping[-1]}"
    else:
        cit_range = overlapping[0]
    print(f"{cit_range}")
    print(f"  {sent.text[:80]}...")
    print()

In [None]:
# Tokens within citation lines
# Access citation via the line spans

aeneid_doc = next(T.docs(aeneid))
# Direct indexing (no .get fallback): assumes the "lines" span group exists and is non-empty.
line = aeneid_doc.spans["lines"][0]

print(f"Line citation: {line._.citation}")
print(f"Tokens in line: {[t.text for t in line]}")

## Doc description

In [None]:
# Get files by pattern
# `match` is a regex, so the dot is escaped to match a literal "." only.
metamorphoses = T.fileids(match=r'ovid\.metamorphoses')
pprint(metamorphoses)

In [None]:
# Custom text normalization example
# Use this pattern when you need specific preprocessing

def normalize_latin(text):
    """Return a lowercased, u/i-folded, punctuation-free copy of ``text``.

    Steps: lowercase the input; fold ``v`` to ``u`` and ``j`` to ``i``; delete
    every character in ``string.punctuation`` (ASCII punctuation only); then
    collapse whitespace runs to single spaces with none at either end.
    """
    import string

    # One translation table does both the letter folding and the deletions.
    table = str.maketrans('vj', 'ui', string.punctuation)
    cleaned = text.lower().translate(table)
    # split() with no argument drops leading/trailing and repeated whitespace.
    return " ".join(cleaned.split())

# Example usage on raw text
# Take the first 100 characters of the first file in `metamorphoses`.
sample = next(T.texts(metamorphoses[0]))[:100]
print(f"Original: {sample}")
print(f"Normalized: {normalize_latin(sample)}")

In [None]:
## Concordance

# A concordance maps each word to the citations where it occurs
# basis="lemma" groups all inflected forms under one headword

catullus_conc = T.concordance(fileids=catullus, basis="lemma")

print(f"Unique lemmas in Catullus: {len(catullus_conc)}")
print()

# Look up one lemma and show at most ten of its citations
if "amor" in catullus_conc:
    amor_citations = catullus_conc["amor"]
    print("Citations for 'amor':")
    for cit in amor_citations[:10]:
        print(f"  {cit}")
    remaining = len(amor_citations) - 10
    if remaining > 0:
        print(f"  ... and {remaining} more")

In [None]:
# Concordance keyed by surface form (exact spelling) rather than lemma
catullus_conc_text = T.concordance(fileids=catullus, basis="text")

# Inflected forms of 'puella' (girl) to look up one by one
puella_forms = ["puella", "puellae", "puellam", "puellas", "puellis"]
print("Occurrences of 'puella' forms in Catullus:")
for form in puella_forms:
    # Forms absent from the concordance are skipped silently
    if form not in catullus_conc_text:
        continue
    count = len(catullus_conc_text[form])
    print(f"  {form}: {count} occurrences")

## KWIC (Keyword in Context)

Find words with surrounding context - useful for studying word usage patterns.

In [None]:
# Basic KWIC search - find "amor" with 5 tokens of context on each side
# Each hit is read by key: 'left', 'match', 'right' and 'citation'.
for hit in T.kwic("amor", fileids=catullus, window=5, limit=5):
    print(f"{hit['left']} [{hit['match']}] {hit['right']}")
    print(f"  -- {hit['citation']}")
    print()

In [None]:
# KWIC by lemma - finds all forms of a word (e.g., amo, amat, amant, amavit)
# Use by_lemma=True to match against lemmatized forms

# Same hit keys as the basic search; here with a 4-token window and at most 5 hits.
for hit in T.kwic("amo", fileids=catullus, by_lemma=True, window=4, limit=5):
    print(f"{hit['left']} [{hit['match']}] {hit['right']}")
    print(f"  -- {hit['citation']}")
    print()

## N-grams and Skipgrams

Extract contiguous token sequences (n-grams) or sequences with gaps (skipgrams) for collocation analysis and language modeling.

In [None]:
# Extract bigrams (2-word sequences) from Catullus
# By default, returns strings and filters out punctuation

# islice caps the output at the first 20 n-grams.
bigrams = list(islice(T.ngrams(n=2, fileids=catullus), 20))
print("First 20 bigrams from Catullus:")
pprint(bigrams)

In [None]:
# Trigrams (3-word sequences)
# islice caps the output at the first 10 n-grams.
trigrams = list(islice(T.ngrams(n=3, fileids=catullus), 10))
print("First 10 trigrams:")
pprint(trigrams)

In [None]:
# Get n-grams as token tuples for linguistic analysis
# as_tuples=True returns tuples of spaCy Token objects

# Only the first five n-grams are shown.
for gram in islice(T.ngrams(n=2, fileids=catullus, as_tuples=True), 5):
    # Access token attributes: text, lemma, POS
    print([(t.text, t.lemma_, t.pos_) for t in gram])

In [None]:
# Bigram frequency analysis - find most common word pairs

# Counter consumes the whole n-gram iterator and tallies each bigram.
bigram_counts = Counter(T.ngrams(n=2, fileids=catullus))
print("Most common bigrams in Catullus:")
for bigram, count in bigram_counts.most_common(10):
    print(f"  {bigram}: {count}")

In [None]:
# Skipgrams - word pairs with gaps between them
# n=2 (pairs), k=1 (allow 1 word gap)
# "the quick brown fox" → "the quick", "the brown", "quick brown", "quick fox", ...

# islice caps the output at the first 15 skipgrams.
skipgrams = list(islice(T.skipgrams(n=2, k=1, fileids=catullus), 15))
print("First 15 skipgrams (bigrams with 1 skip):")
pprint(skipgrams)

In [None]:
# N-grams by lemma - normalize inflected forms to dictionary headwords
# Useful for finding collocations regardless of grammatical case/number

# Compare: text basis (default) vs lemma basis
# Both previews take the first five bigrams of the same file so they line up.
print("Bigrams by surface text:")
text_bigrams = list(islice(T.ngrams(n=2, fileids=catullus, basis="text"), 5))
pprint(text_bigrams)

print("\nBigrams by lemma (normalized forms):")
lemma_bigrams = list(islice(T.ngrams(n=2, fileids=catullus, basis="lemma"), 5))
pprint(lemma_bigrams)

# Lemma-based frequency counts group inflected variants together
# (the n-gram iterator is created afresh here and consumed in full by Counter)
print("\nMost common lemma bigrams:")
lemma_counts = Counter(T.ngrams(n=2, fileids=catullus, basis="lemma"))
for bigram, count in lemma_counts.most_common(10):
    print(f"  {bigram}: {count}")

In [None]:
## Basic descriptive stats
# Count files, estimate tokens, etc.

# Quick corpus overview
files = T.fileids()
print(f"Total files: {len(files)}")

# Sample stats from one file
# (the first fileid in the list; raw text only, no NLP objects involved)
sample_file = files[0]
sample_text = next(T.texts(sample_file))
print(f"\nSample file: {sample_file}")
print(f"Character count: {len(sample_text)}")
# Whitespace split, so punctuation attached to words is counted with them.
print(f"Word count (approx): {len(sample_text.split())}")

### Sample output for full corpus

No corpus-wide sample output is shown here yet: a full `describe()` method will be added in a future release.

In [None]:
## Per-file statistics: sentence, token and citation-line counts

catullus_doc = next(T.docs(catullus))

print(f'Stats for {catullus}:')
# Count sentences by iterating, since .sents is an iterator rather than a list
sentence_total = sum(1 for _ in catullus_doc.sents)
print(f'  Sentences: {sentence_total}')
print(f'  Tokens: {len(catullus_doc)}')
# Fall back to an empty list when no "lines" span group is present
line_spans = catullus_doc.spans.get("lines", [])
print(f'  Citation lines: {len(line_spans)}')

## New Features in latincyreaders

The following sections demonstrate new search and filtering capabilities.

In [None]:
# search() - fast regex search across the corpus (no NLP required)

# Find lines mentioning Thebes (limit to first 5 results)
results = T.search(r'\bTheb\w+\b')
# Each result unpacks to (fileid, citation, text, matches).
for fileid, citation, text, matches in islice(results, 5):
    print(f"{fileid} {citation}: found {matches}")
    print(f"  → {text[:60]}...")
    print()

In [None]:
# find_lines() - find citation lines containing specific words/patterns

# Find lines with specific word forms
forms = ["Thebas", "Thebarum", "Thebis"]
# Each result unpacks to (fileid, citation, text); fileid is not printed here.
for fileid, citation, text in islice(T.find_lines(forms=forms), 5):
    print(f"{citation}: {text[:70]}...")

In [None]:
# find_sents() - find sentences containing specific words
# Fast path: search by exact forms (uses regex, minimal NLP)

# Each hit is read by key: 'citation', 'sentence' and 'matches'. First 5 hits only.
for hit in islice(T.find_sents(forms=["Caesar", "Caesarem", "Caesaris"]), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched: {hit['matches']}")
    print()

In [None]:
# find_sents() by lemma - slower but finds ALL forms
# Uses NLP to lemmatize, so it catches forms you might miss

# Find all sentences with any form of "bellum" (war)
# No fileids argument is passed here; only the first 5 hits are consumed.
for hit in islice(T.find_sents(lemma="bellum"), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched forms: {hit['matches']}")
    print()

In [None]:
# find_sents() with spaCy Matcher patterns - advanced pattern matching
# Search for ADJ + NOUN sequences (e.g., "magna voce", "pulchra puella")

# One dict per token position: an ADJ token immediately followed by a NOUN token.
pattern = [{"POS": "ADJ"}, {"POS": "NOUN"}]
# The search is restricted to fileids matching "catullus".
for hit in islice(T.find_sents(matcher_pattern=pattern, fileids=T.fileids(match="catullus")), 5):
    print(f"{hit['citation']}: {hit['sentence'][:80]}...")
    print(f"  Matched: {hit['matches']}")
    print()

In [None]:
# More complex Matcher patterns
# Find sentences with a specific lemma followed by a noun

pattern = [{"LEMMA": "magnus"}, {"POS": "NOUN"}]
# NOTE(review): unlike the previous cell, no fileids filter is passed - presumably
# this searches the whole corpus and may be slow; confirm.
for hit in islice(T.find_sents(matcher_pattern=pattern), 5):
    print(f"{hit['citation']}: {hit['matches']}")

### Annotation Levels

Control NLP processing overhead with `AnnotationLevel`:

In [None]:
# AnnotationLevel controls how much NLP processing to apply

# NONE - use texts() for raw strings (fastest)
# TOKENIZE - tokenization + sentence boundaries only
# BASIC - adds lemmatization and POS tagging (default)
# FULL - full pipeline including NER and dependency parsing

# Create readers with different annotation levels
# (reader_fast and reader_full are not used again in the cells visible here)
reader_fast = TesseraeReader(annotation_level=AnnotationLevel.TOKENIZE)
reader_full = TesseraeReader(annotation_level=AnnotationLevel.FULL)

print("Available annotation levels:")
# Iterate the enum to list each member's name and value.
for level in AnnotationLevel:
    print(f"  {level.name}: {level.value}")

In [None]:
# Export search results to TSV, CSV, or JSONL

# The find_sents() result is passed straight to the exporter without being iterated first.
results = T.find_sents(forms=["amor", "amoris", "amorem"], fileids=T.fileids(match="catullus"))
export = T.export_search_results(results, format="tsv")

print("TSV export (first 500 chars):")
# `export` is sliced like a string to preview its first 500 characters.
print(export[:500])