In [None]:
## Imports

from itertools import islice
from pprint import pprint

from latincyreaders import GreekTesseraeReader, AnnotationLevel

In [None]:
## Set up reader

# TOKENIZE level: fast, no OdyCy model needed
# `G` is the shared reader instance that the following cells query.
G = GreekTesseraeReader(annotation_level=AnnotationLevel.TOKENIZE)

## Fileids

In [None]:
## First 10 filenames

# fileids() without arguments gives the full listing; slice it for a short preview.
files = G.fileids()[:10]
pprint(files)

In [None]:
# Get files by pattern match (regex)
# `match` narrows the listing to file ids matching the given pattern.
files = G.fileids(match='homer')
pprint(files)

In [None]:
# Get Homer's Iliad files
# Kept in `iliad_files`; a later cell takes its first entry as the sample text.
iliad_files = G.fileids(match=r'homer.*iliad')
pprint(iliad_files)

In [None]:
# Get Homer's Odyssey files
# The pattern requires 'homer' followed (anywhere later in the id) by 'odyssey'.
odyssey_files = G.fileids(match=r'homer.*odyssey')
pprint(odyssey_files)

In [None]:
# Other authors
# Same lookup with a different pattern; `files` is simply overwritten.
files = G.fileids(match='sophocles')
pprint(files)

In [None]:
# File ids matching 'euripides'
files = G.fileids(match='euripides')
pprint(files)

In [None]:
# Get all files
# No `match` argument: the complete listing is kept in `all_files` and only its size is printed.
all_files = G.fileids()
print(f"Total files: {len(all_files)}")

## Texts by line (zero NLP overhead)

In [None]:
# texts_by_line() - fastest way to read Tesserae files
# Returns (citation, text) pairs with zero NLP processing

# Sample text for the rest of the notebook: the first Iliad file, or the first
# Homer file when the Iliad pattern matched nothing.
iliad_1 = iliad_files[0] if iliad_files else G.fileids(match='homer')[0]

# islice() takes exactly the first 10 pairs and stops, replacing the manual
# counter-and-break loop (which also pulled an 11th pair before breaking).
for citation, text in islice(G.texts_by_line(iliad_1), 10):
    print(f"{citation}: {text[:70]}...")

In [None]:
# Raw text: the first item yielded by texts() is the entire document as one string
iliad_1_text = next(G.texts(iliad_1))

# One print call, one entry per statistic (the last entry itself starts with a blank line)
print(
    f"Character count: {len(iliad_1_text)}",
    f"Word count (approx): {len(iliad_1_text.split())}",
    f"\nFirst 200 chars: {iliad_1_text[:200]}",
    sep="\n",
)

## Doc structures

In [None]:
## Docs - spaCy Doc objects

# Take the first Doc yielded for the sample file.
iliad_doc = next(G.docs(iliad_1))
# Slicing a spaCy Doc is by token, so this prints the span of the first 300 tokens.
print(iliad_doc[:300])

In [None]:
## Doc Rows - citation -> text mapping

# The first item yielded by doc_rows() is used as a citation -> span mapping.
iliad_docrows = next(G.doc_rows(iliad_1))

print('First 10 citation -> span mappings:')
# islice() bounds the preview to 10 entries without a manual counter and break.
for citation, span in islice(iliad_docrows.items(), 10):
    print(f"  {citation}: {span.text[:50]}...")

## Doc units

In [None]:
# Sents - spaCy Span objects

iliad_sents = G.sents(iliad_1)

# Preview the first 5 sentences. islice() simply ends early when the text has
# fewer, whereas repeated bare next() calls would raise StopIteration.
for i, sent in enumerate(islice(iliad_sents, 5), start=1):
    print(f'Sent {i}: {sent}')
    print()

In [None]:
# Tokens

iliad_tokens = G.tokens(iliad_1)

# Preview the first 14 tokens (same count as before); islice() ends quietly on
# a shorter text instead of letting next() raise StopIteration.
for i, token in enumerate(islice(iliad_tokens, 14), start=1):
    print(f'Word {i}: {token}')

In [None]:
# Tokens as plain strings

plaintext_tokens = G.tokens(iliad_1, as_text=True)

# Show the first 9 tokens with their Python type; islice() avoids a
# StopIteration from bare next() when fewer tokens are available.
for t in islice(plaintext_tokens, 9):
    print(f'{t} ({type(t).__name__})')

In [None]:
# Lines (citation units from the Tesserae format)

iliad_lines = G.lines(iliad_1)

# First 8 lines, each printed with the citation stored on its `_.citation`
# extension attribute; islice() stops cleanly if the text is shorter.
for line in islice(iliad_lines, 8):
    print(f'{line._.citation}: {line}')

In [None]:
# Doc stats

iliad_doc = next(G.docs(iliad_1))

# Assemble the report lines first, then emit them in one go
stats_report = [
    f'Stats for {iliad_1}:',
    f'  Sentences: {len(list(iliad_doc.sents))}',
    f'  Tokens: {len(iliad_doc)}',
    f'  Citation lines: {len(iliad_doc.spans.get("lines", []))}',
]
print("\n".join(stats_report))

## Search

Fast regex-based search across the corpus. No NLP model required.

In [None]:
from itertools import islice

In [None]:
# search() - find lines matching a regex
# Search for Achilles across all Homer

homer_files = G.fileids(match='homer')
results = G.search(r'Ἀχιλ', fileids=homer_files)

# Only the first 10 hits are shown; each hit unpacks to
# (fileid, citation, text, matches)
first_hits = islice(results, 10)
for fileid, citation, text, matches in first_hits:
    print(
        f"{citation}: found {matches}",
        f"  → {text[:70]}...",
        "",
        sep="\n",
    )

In [None]:
# find_lines() - find citation lines with specific words/patterns

# Search for Zeus across corpus (no fileids restriction), keeping the first 10 lines
zeus_lines = G.find_lines(pattern=r'Ζεὺς')
for fileid, citation, text in islice(zeus_lines, 10):
    print(f"{citation}: {text[:70]}...")

In [None]:
# find_lines() with specific forms

# Explicit word forms to look for, restricted to the Homer files
forms = [
    "μῆνιν",
    "μῆνις",
    "μήνιος",
    "μήνιδος",
]
menis_lines = G.find_lines(forms=forms, fileids=homer_files)
for fileid, citation, text in menis_lines:
    print(f"{citation}: {text[:70]}...")

In [None]:
# find_sents() - find sentences containing a pattern

# Each hit is read like a dict with 'citation', 'sentence' and 'matches' keys
achilles_sents = G.find_sents(pattern=r'Ἀχιλ', fileids=homer_files)
for hit in islice(achilles_sents, 5):
    print(
        f"{hit['citation']}: {hit['sentence'][:80]}...",
        f"  Matched: {hit['matches']}",
        "",
        sep="\n",
    )

In [None]:
# find_sents() with context

menin_sents = G.find_sents(pattern=r'μῆνιν', fileids=homer_files, context=True)
for hit in islice(menin_sents, 3):
    # Neighbouring sentences are printed only when the key is present and non-empty
    if hit.get('prev_sent'):
        print(f"  [prev] {hit['prev_sent'][:60]}...")
    print(f"  >>> {hit['sentence'][:80]}...")
    if hit.get('next_sent'):
        print(f"  [next] {hit['next_sent'][:60]}...")
    # Citation line followed by a blank separator line
    print(f"  -- {hit['citation']}", "", sep="\n")

## KWIC (Keyword in Context)

In [None]:
# Basic KWIC search
thea_hits = G.kwic("θεὰ", fileids=homer_files, window=5, limit=10)
for hit in thea_hits:
    # Left context right-aligned to 50 columns, keyword in brackets, right
    # context left-aligned; the citation follows on its own indented line
    print(
        f"{hit['left']:>50s} [{hit['match']}] {hit['right']:<50s}",
        f"{'':>50s}  {hit['citation']}",
        "",
        sep="\n",
    )

In [None]:
# KWIC for a character name
hektor_hits = G.kwic("Ἕκτωρ", fileids=homer_files, window=5, limit=10)
for hit in hektor_hits:
    # Same layout as the basic KWIC cell: padded contexts around the bracketed
    # keyword, then the citation on an indented line and a blank separator
    print(
        f"{hit['left']:>50s} [{hit['match']}] {hit['right']:<50s}",
        f"{'':>50s}  {hit['citation']}",
        "",
        sep="\n",
    )

## N-grams

In [None]:
# Bigrams from the sample Iliad file (`iliad_1`)

bigram_stream = G.ngrams(n=2, fileids=iliad_1)
bigrams = list(islice(bigram_stream, 20))  # materialise only the first 20

print("First 20 bigrams from Iliad 1:")
pprint(bigrams)

In [None]:
# Trigrams: same call with n=3, keeping the first 10
trigram_stream = G.ngrams(n=3, fileids=iliad_1)
trigrams = list(islice(trigram_stream, 10))

print("First 10 trigrams:")
pprint(trigrams)

In [None]:
# Bigram frequency analysis
from collections import Counter

# Tally every bigram yielded for the Homer files
bigram_counts = Counter()
bigram_counts.update(G.ngrams(n=2, fileids=homer_files))

print("Most common bigrams in Homer:")
top_bigrams = bigram_counts.most_common(15)
for bigram, count in top_bigrams:
    print(f"  {bigram}: {count}")

In [None]:
# Skipgrams - word pairs with gaps
# n=2 (pairs), k=1 (allow 1 word gap)

skipgram_stream = G.skipgrams(n=2, k=1, fileids=iliad_1)
skipgrams = list(islice(skipgram_stream, 15))  # first 15 only

print("First 15 skipgrams (bigrams with 1 skip):")
pprint(skipgrams)

## Word counting

In [None]:
# Simple word counting from raw text (fastest method)

# Whitespace-split every Homer line and tally the pieces in a single pass.
# Punctuation stays attached to the words because only str.split() is applied.
word_counts = Counter(
    word
    for _citation, line_text in G.texts_by_line(fileids=homer_files)
    for word in line_text.split()
)

print(
    f"Total word tokens: {sum(word_counts.values())}",
    f"Unique word types: {len(word_counts)}",
    "\nMost common words in Homer:",
    sep="\n",
)
top_words = word_counts.most_common(20)
for word, count in top_words:
    print(f"  {word:20s} {count:>6d}")

In [None]:
# Character mention counts

# Each display name maps to a regex stem that is searched in the Homer files.
characters = {
    "Achilles": r"Ἀχιλ",
    "Hector": r"Ἕκτ",
    "Odysseus": r"Ὀδυσ",
    # Accept the upsilon bare or with acute, grave or circumflex accent so that
    # Ζεύς, Ζεὺς and Ζεῦ are all counted (the grave form was previously missed).
    # NOTE(review): forms built on other stems (e.g. Διός) are still not covered.
    "Zeus": r"Ζε[υύὺῦ]",
    "Athena": r"Ἀθην",
    "Apollo": r"Ἀπόλλ",
    "Agamemnon": r"Ἀγαμέμν",
    "Patroclus": r"Πατρόκλ",
}

print("Character mentions (lines) in Homer:")
for name, pattern in characters.items():
    # Count hits lazily instead of building a throwaway list of every result.
    count = sum(1 for _ in G.search(pattern, fileids=homer_files))
    print(f"  {name:15s} {count:>4d} lines")

## Export results

In [None]:
# Export search results to TSV

# The find_sents() hits are handed straight to export_search_results().
results = G.find_sents(pattern=r'Ἀχιλ', fileids=homer_files)
export = G.export_search_results(results, format="tsv")

# Only a 500-character preview of the export is printed.
print("TSV export (first 500 chars):")
print(export[:500])

In [None]:
# Export as JSONL

# This search uses `forms=` rather than `pattern=`, and requests the "jsonl" format.
results = G.find_sents(forms=["μῆνιν"], fileids=homer_files)
export = G.export_search_results(results, format="jsonl")
# The whole export is printed, not a preview.
print(export)

## With OdyCy NLP model (BASIC/FULL annotation)

The following cells require the OdyCy model for lemmatization and POS tagging.

```bash
pip install https://huggingface.co/chcaa/grc_odycy_joint_sm/resolve/main/grc_odycy_joint_sm-any-py3-none-any.whl
```

In [None]:
# Reload with BASIC annotation level for lemmatization + POS
# A second reader is bound to `G_nlp`, so the TOKENIZE-level `G` remains available.
G_nlp = GreekTesseraeReader(annotation_level=AnnotationLevel.BASIC)

In [None]:
# POS-tagged sentences

iliad_sents = G_nlp.sents(iliad_1)

# Preview the first 3 sentences as token/POS pairs. islice() stops early when
# the text has fewer, where repeated bare next() calls would raise StopIteration.
for i, sent in enumerate(islice(iliad_sents, 3), start=1):
    pos_sent = [f"{t.text}/{t.pos_}" for t in sent]
    print(f'POS Sent {i}: {" ".join(pos_sent)}')
    print()

In [None]:
# Token linguistic attributes

iliad_sents = G_nlp.sents(iliad_1)
# Only the first sentence is inspected.
sent = next(iliad_sents)

# One row per token: surface text, lemma_, pos_ and tag_; the :20s / :8s
# format specs pad the first three columns to a fixed width.
for t in sent:
    print(f"  {t.text:20s} lemma={t.lemma_:20s} pos={t.pos_:8s} tag={t.tag_}")

In [None]:
# Concordance by lemma

homer_files = G_nlp.fileids(match='homer')
# Limited to the first two Homer files
conc = G_nlp.concordance(fileids=homer_files[:2], basis="lemma")


def _citation_count(entry):
    """Return how many citations a (lemma, citations) pair carries."""
    return len(entry[1])


print(f"Unique lemmas: {len(conc)}")
print("\nMost cited lemmas:")
# Rank all entries by citation count (descending), then keep the top 15
ranked_lemmas = sorted(conc.items(), key=_citation_count, reverse=True)
top_lemmas = ranked_lemmas[:15]
for lemma, citations in top_lemmas:
    print(f"  {lemma:20s} {len(citations):>4d} occurrences")

In [None]:
# KWIC by lemma - matches all inflected forms

theos_hits = G_nlp.kwic("θεός", fileids=homer_files[:2], by_lemma=True, window=4, limit=5)
for hit in theos_hits:
    # Contexts padded to 40 columns around the bracketed match, then the
    # citation on an indented line and a blank separator
    print(
        f"{hit['left']:>40s} [{hit['match']}] {hit['right']:<40s}",
        f"{'':>40s}  {hit['citation']}",
        "",
        sep="\n",
    )

In [None]:
# N-grams by lemma

print("Most common lemma bigrams in Homer (sample):")
# Bigrams over lemmas rather than surface forms, from the first two Homer files
lemma_bigrams = G_nlp.ngrams(n=2, fileids=homer_files[:2], basis="lemma")
lemma_counts = Counter(lemma_bigrams)
top_lemma_bigrams = lemma_counts.most_common(10)
for bigram, count in top_lemma_bigrams:
    print(f"  {bigram}: {count}")

In [None]:
# find_sents() by lemma - finds ALL inflected forms

# Search the first Homer file only and show at most 5 hits
theos_sents = G_nlp.find_sents(lemma="θεός", fileids=homer_files[:1])
for hit in islice(theos_sents, 5):
    print(
        f"{hit['citation']}: {hit['sentence'][:80]}...",
        f"  Matched forms: {hit['matches']}",
        "",
        sep="\n",
    )

## Annotation levels

In [None]:
# AnnotationLevel controls NLP processing overhead
# NONE     - texts() only (fastest, no spaCy)
# TOKENIZE - tokenization + sentence boundaries (spacy.blank('grc'))
# BASIC    - + lemmatization, POS tagging (OdyCy, disable parser/NER)
# FULL     - full pipeline including NER and deps (OdyCy)

print("Available annotation levels:")
# AnnotationLevel is iterated like an Enum here: each member's name and value are listed.
for level in AnnotationLevel:
    print(f"  {level.name}: {level.value}")