# V8 Data Collection: Coptic-English Bible Corpus

## Goal
Extract Coptic-English parallel verses from the OPUS Bible corpus to create mappings for anchor enhancement.

## Data Source
- **OPUS Bible corpus**: Coptic-English TMX file
- **Format**: Translation Memory eXchange (TMX)
- **Size**: ~1.5MB compressed

## Data Download

Download the OPUS Coptic-English Bible corpus (TMX format).

In [None]:
import subprocess
from pathlib import Path

# Ensure data directory exists.
# When the notebook is launched from notebooks/, the project root is one level up.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
RAW_DATA_DIR = PROJECT_ROOT / "data/raw"
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)

TMX_URL = "https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/cop-en.tmx.gz"
TMX_PATH = RAW_DATA_DIR / "cop-en.tmx.gz"

# Download if not already present
if not TMX_PATH.exists():
    print("Downloading Coptic-English Bible corpus...")
    # --fail: exit non-zero on HTTP errors (404/500) instead of saving the
    #         server's error page as if it were the corpus.
    # --silent --show-error: keep the progress meter out of the captured
    #         stderr so a failure message stays readable.
    result = subprocess.run(
        ["curl", "--fail", "--silent", "--show-error", "-L", TMX_URL, "-o", str(TMX_PATH)],
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        print(f"✓ Downloaded to {TMX_PATH}")
        print(f"  Size: {TMX_PATH.stat().st_size / 1024:.1f} KB")
    else:
        # Remove any partial file; otherwise the exists() check above would
        # treat it as a finished download on the next run.
        if TMX_PATH.exists():
            TMX_PATH.unlink()
        print(f"✗ Download failed: {result.stderr}")
else:
    print(f"✓ File already exists: {TMX_PATH}")
    print(f"  Size: {TMX_PATH.stat().st_size / 1024:.1f} KB")

In [None]:
import gzip
import xml.etree.ElementTree as ET
import pandas as pd
import json
from pathlib import Path
from collections import defaultdict

# Paths
# A notebook launched from notebooks/ sits one level below the project root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd if _cwd.name != 'notebooks' else _cwd.parent

# Input corpus and output file, both relative to the project root.
TMX_PATH = PROJECT_ROOT / 'data/raw/cop-en.tmx.gz'
OUTPUT_PATH = PROJECT_ROOT / 'data/processed/coptic_english_bible.json'

print(
    f'Project root: {PROJECT_ROOT}',
    f'TMX file: {TMX_PATH}',
    f'TMX exists: {TMX_PATH.exists()}',
    sep='\n',
)

## Parse TMX File

TMX format structure:
```xml
<tu>
  <tuv xml:lang="cop"><seg>Coptic text</seg></tuv>
  <tuv xml:lang="en"><seg>English text</seg></tuv>
</tu>
```

In [None]:
def parse_tmx(tmx_path):
    """Parse a gzipped TMX file and extract Coptic-English parallel verses.

    Args:
        tmx_path: Path to the gzip-compressed TMX file.

    Returns:
        A list of ``{'coptic': ..., 'english': ...}`` dicts in document
        order, one per ``<tu>`` that has non-empty text for both languages.
        Text is stripped of surrounding whitespace.
    """
    xml_lang = '{http://www.w3.org/XML/1998/namespace}lang'
    parallel_verses = []

    # Binary mode lets the XML parser honour the encoding declared in the
    # document instead of forcing UTF-8 before the parser sees it.
    with gzip.open(tmx_path, 'rb') as f:
        root = ET.parse(f).getroot()

    # Find all translation units
    for tu in root.findall('.//tu'):
        coptic_text = None
        english_text = None

        # Extract Coptic and English segments
        for tuv in tu.findall('tuv'):
            seg = tuv.find('seg')
            if seg is None:
                continue

            # itertext() also collects text inside and after inline markup
            # (<hi>, <ph>, ...); seg.text alone stops at the first child tag.
            text = ''.join(seg.itertext()).strip()
            if not text:
                continue

            # Language tags are case-insensitive and may carry a region
            # subtag ("en-US"), so compare the lower-cased primary subtag.
            lang = (tuv.get(xml_lang) or '').lower().split('-')[0]
            if lang == 'cop':
                coptic_text = text
            elif lang == 'en':
                english_text = text

        # Add to parallel verses if both exist
        if coptic_text and english_text:
            parallel_verses.append({
                'coptic': coptic_text,
                'english': english_text
            })

    return parallel_verses

# Parse the downloaded corpus and report how many verse pairs were kept.
print('Parsing TMX file...')
parallel_verses = parse_tmx(tmx_path=TMX_PATH)
print(f'Extracted {len(parallel_verses)} parallel verses')

## Inspect Data

In [None]:
# Preview the head of the corpus as numbered Coptic/English pairs.
print('First 10 Coptic-English verse pairs:\n')
for number, entry in enumerate(parallel_verses[:10], start=1):
    print(
        f"{number}. Coptic:  {entry['coptic']}",
        f"   English: {entry['english']}",
        "",
        sep='\n',
    )

In [None]:
# Statistics: whitespace-tokenised word counts over the whole corpus.
coptic_words = [word for verse in parallel_verses for word in verse['coptic'].split()]
english_words = [word for verse in parallel_verses for word in verse['english'].split()]

# Divide by at least 1 so an empty corpus reports 0.0 averages instead of
# raising ZeroDivisionError.
verse_divisor = max(len(parallel_verses), 1)

print(f'Total verses: {len(parallel_verses)}')
print(f'Total Coptic words: {len(coptic_words)}')
print(f'Unique Coptic words: {len(set(coptic_words))}')
print(f'Total English words: {len(english_words)}')
print(f'Unique English words: {len(set(english_words))}')
print(f'Avg Coptic words/verse: {len(coptic_words) / verse_divisor:.1f}')
print(f'Avg English words/verse: {len(english_words) / verse_divisor:.1f}')

## Create Word-Level Alignments

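Extract candidate Coptic-English word pairs using verse-level co-occurrence: each Coptic word is paired with the English word that appears alongside it in the most verses.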

In [None]:
def extract_word_cooccurrences(parallel_verses, min_count=3):
    """Extract Coptic-English word pairs based on verse-level co-occurrence.

    Each distinct (lower-cased, whitespace-split) Coptic word is paired with
    the English word that shares the most verses with it.

    Args:
        parallel_verses: Iterable of dicts with 'coptic' and 'english' text.
        min_count: Minimum number of shared verses for a pair to be kept.

    Returns:
        A list of ``{'coptic', 'english', 'count'}`` dicts sorted by
        descending count. The result is deterministic: ties for the best
        English word go to the alphabetically first word, and pairs with
        equal counts are ordered by their Coptic word.
    """
    cooccurrence = defaultdict(lambda: defaultdict(int))

    for verse in parallel_verses:
        # Sets: a word counts once per verse no matter how often it repeats.
        coptic_words = set(verse['coptic'].lower().split())
        english_words = set(verse['english'].lower().split())

        # Count co-occurrences
        for cop_word in coptic_words:
            for eng_word in english_words:
                cooccurrence[cop_word][eng_word] += 1

    # Extract high-confidence pairs
    word_pairs = []
    for cop_word, eng_counts in cooccurrence.items():
        # Highest count wins. The word itself is the tie-breaker, so the
        # choice does not depend on set iteration order (which changes
        # between runs under string hash randomization).
        best_word, best_count = min(
            eng_counts.items(), key=lambda item: (-item[1], item[0])
        )
        if best_count >= min_count:
            word_pairs.append({
                'coptic': cop_word,
                'english': best_word,
                'count': best_count
            })

    # Sort by frequency, then by Coptic word for a reproducible order
    word_pairs.sort(key=lambda pair: (-pair['count'], pair['coptic']))
    return word_pairs

# Build the word-pair list, keeping only pairs seen together in at least 5 verses.
print('Extracting word-level co-occurrences...')
word_pairs = extract_word_cooccurrences(parallel_verses=parallel_verses, min_count=5)
print(f'Extracted {len(word_pairs)} Coptic-English word pairs')

In [None]:
# List the 50 highest-count pairs as a ranked, column-aligned table.
print('Top 50 Coptic-English word pairs by frequency:\n')
for rank, entry in enumerate(word_pairs[:50], start=1):
    coptic, english, count = entry['coptic'], entry['english'], entry['count']
    print(f"{rank:2d}. {coptic:20s} → {english:20s} (count: {count})")

## Save Data

In [None]:
# Write the parallel verses as pretty-printed JSON, keeping Coptic characters unescaped.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH.write_text(
    json.dumps(parallel_verses, ensure_ascii=False, indent=2),
    encoding='utf-8',
)
print(f'Saved {len(parallel_verses)} parallel verses to {OUTPUT_PATH}')

# Write the word pairs to a second JSON file under data/processed.
WORD_PAIRS_PATH = PROJECT_ROOT / 'data/processed/coptic_english_word_pairs.json'
WORD_PAIRS_PATH.write_text(
    json.dumps(word_pairs, ensure_ascii=False, indent=2),
    encoding='utf-8',
)
print(f'Saved {len(word_pairs)} word pairs to {WORD_PAIRS_PATH}')

## Summary

Successfully extracted Coptic-English parallel data from the OPUS Bible corpus.

**Next Steps**:
1. Download Černý's Etymological Dictionary
2. Extract Egyptian-Coptic cognate pairs
3. Map Egyptian → Coptic → English to create enhanced anchors