In [None]:
# Install the Python packages imported by the translation cell below.
!pip install transformers torch sentencepiece tqdm

# Fetch the openbible repository and make it the working directory, so the
# relative paths used later (metadata.json, <version>/bible.json) resolve inside it.
!git clone https://github.com/churchstudio-org/openbible.git
%cd ./openbible

# Run the repository's metadata script with the raw-content base URL, then
# print metadata.json (presumably produced/refreshed by that script — confirm).
!python scripts/list_metadata.py https://github.com/churchstudio-org/openbible/raw/main/
!cat metadata.json

In [None]:
# -*- coding: utf-8 -*-
import os
import json
import shutil
import torch
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer
from google.colab import files  # If you're running this on Colab

# -------------------------------------------
# 0. Select the compute device (CUDA when available, CPU otherwise)
# -------------------------------------------
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# -------------------------------------------
# 1. Read the version metadata from metadata.json
# -------------------------------------------
with open("metadata.json", encoding="utf-8") as metadata_file:
    metadata = json.loads(metadata_file.read())

# -------------------------------------------
# 2. Helper: Translate a batch of verses
# -------------------------------------------
def translate_batch(verses, tokenizer, model, batch_size=8):
    """Translate a list of verses in batches using the provided model and tokenizer.

    Args:
        verses: List of source-language strings to translate.
        tokenizer: Callable that encodes a list of strings into model inputs
            and offers ``decode`` for generated sequences.
        model: Model exposing ``generate(**inputs)`` and a ``device`` attribute.
        batch_size: Maximum number of verses per ``generate`` call; must be >= 1.

    Returns:
        A list of translated strings, in the same order as ``verses``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently drop
        # every verse; a zero step would fail with an unrelated-looking error.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Send the inputs to wherever the model lives rather than depending on a
    # module-level global being defined and in sync with the model.
    target_device = model.device

    translations = []
    for start in range(0, len(verses), batch_size):
        batch = verses[start:start + batch_size]
        # padding=True lets verses of different lengths share one tensor.
        inputs = tokenizer(
            batch, return_tensors="pt", padding=True, truncation=True
        ).to(target_device)
        generated = model.generate(**inputs)
        translations.extend(
            tokenizer.decode(sequence, skip_special_tokens=True)
            for sequence in generated
        )
    return translations

# -------------------------------------------
# 3. Helper: Split Bible JSON into per-book files
# -------------------------------------------
# File-name stem for each book, indexed by the book's position in the Bible
# JSON (66 entries, Genesis through Revelation).
BOOK_NAMES = [
    # Old Testament
    "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua",
    "Judges", "Ruth", "Samuel1", "Samuel2", "Kings1", "Kings2",
    "Chronicles1", "Chronicles2", "Ezra", "Nehemiah", "Esther", "Job",
    "Psalms", "Proverbs", "Ecclesiastes", "SongOfSolomon", "Isaiah",
    "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",
    "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai",
    "Zechariah", "Malachi",
    # New Testament
    "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "Corinthians1",
    "Corinthians2", "Galatians", "Ephesians", "Philippians", "Colossians",
    "Thessalonians1", "Thessalonians2", "Timothy1", "Timothy2", "Titus",
    "Philemon", "Hebrews", "James", "Peter1", "Peter2", "John1", "John2",
    "John3", "Jude", "Revelation",
]


def split_bible_by_book(bible_data, version_dir):
    """Write every book of ``bible_data`` to ``<version_dir>/books/<Name>.json``.

    Args:
        bible_data: Sequence of books; the book at position ``n`` is saved
            under the name ``BOOK_NAMES[n]``. Each book is serialised as-is.
        version_dir: Directory of the version; the ``books`` sub-directory is
            created inside it when missing.

    Raises:
        IndexError: If ``bible_data`` holds more books than ``BOOK_NAMES``.
    """
    output_dir = os.path.join(version_dir, "books")
    os.makedirs(output_dir, exist_ok=True)

    for position, chapters in enumerate(bible_data):
        file_name = BOOK_NAMES[position] + ".json"
        target = os.path.join(output_dir, file_name)
        with open(target, "w", encoding="utf-8") as book_file:
            json.dump(chapters, book_file, ensure_ascii=False, indent=2)

# -------------------------------------------
# 4. Process each metadata entry
# -------------------------------------------
generated_versions = []  # Directories of the versions translated during this run

# For every metadata entry that names a version, a model and a source, translate
# the source Bible verse by verse and store the result under <cwd>/<version>/.
for entry in metadata:
    version = entry.get("version")
    model_name = entry.get("model")
    source_version = entry.get("source")

    if not version:
        continue  # Skip if no version specified

    version_dir = os.path.join(os.getcwd(), version)
    bible_path = os.path.join(version_dir, "bible.json")

    # Skip if already generated
    if os.path.exists(bible_path):
        print(f"✅ Skipping {version} (already exists)")
        continue

    # If no model is provided, we can't translate
    if not model_name:
        print(f"⚠️ No model specified for version '{version}', skipping.")
        continue

    # If no source is provided, we can't translate
    if not source_version:
        print(f"⚠️ No source specified for version '{version}', skipping.")
        continue

    # "source" is indexed with [0] as if it were a list of version names, but
    # for a plain string that would pick only its first character; accept both
    # shapes and use the first name when a list is given.
    if isinstance(source_version, str):
        source_name = source_version
    else:
        source_name = source_version[0]
    source_path = os.path.join(source_name, "bible.json")

    # Read the source before loading the model so a missing source is
    # reported without first paying for a model download.
    if not os.path.exists(source_path):
        print(f"⚠️ Source Bible '{source_path}' not found for version '{version}', skipping.")
        continue
    with open(source_path, "r", encoding="utf-8") as f:
        source = json.load(f)

    # Load the model and tokenizer for this version
    print(f"\n🌐 Translating version '{version}' using model '{model_name}'...")
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    # Translate the entire Bible: the source is walked as books -> chapters,
    # and each chapter is handed to translate_batch as a list of verses.
    bible_translated = []
    for book in tqdm(source, desc=f"Translating {version}", unit="book"):
        book_translated = [
            translate_batch(chapter, tokenizer, model) for chapter in book
        ]
        bible_translated.append(book_translated)

    # Save translated Bible JSON
    os.makedirs(version_dir, exist_ok=True)
    with open(bible_path, "w", encoding="utf-8") as f:
        json.dump(bible_translated, f, ensure_ascii=False, indent=2)

    print(f"💾 Saved bible.json in {version_dir}")

    # Split into per-book files
    split_bible_by_book(bible_translated, version_dir)
    print(f"📚 Split Bible into per-book JSON files in {version_dir}/books")

    generated_versions.append(version_dir)

    # Drop this version's model before the next entry loads its own, so two
    # models are never resident on the device at the same time.
    del model, tokenizer
    if device.type == "cuda":
        torch.cuda.empty_cache()

# -------------------------------------------
# 5. Download generated versions as zip files (Colab)
# -------------------------------------------
# Zip each freshly generated version directory into the current working
# directory and hand the archive to the Colab download helper.
for finished_dir in generated_versions:
    archive_stem, _ = os.path.splitext(f"{os.path.basename(finished_dir)}.zip")
    shutil.make_archive(archive_stem, 'zip', finished_dir)
    files.download(f"{archive_stem}.zip")

print("✅ All processing finished.")


cuda
['In the beginning God created the heaven and the earth.', 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.']


Livros: 100%|██████████| 66/66 [38:29<00:00, 35.00s/it]

Tradução final salva em kjv_pt.json ✅



