In [5]:
import pymupdf
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from io import BytesIO

# Identify the language a piece of text is written in
def detect_language(text):
    """Return whatever langdetect's ``detect`` reports for *text*.

    Args:
        text: The string to analyse.

    Returns:
        The value returned by ``detect(text)``; any exception raised by
        ``detect`` propagates unchanged.
    """
    language_code = detect(text)
    return language_code

# Loaded (tokenizer, model) pairs keyed by model name, so each MarianMT
# checkpoint is loaded only once per process instead of once per call.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for *model_name*, loading it on first use."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


# Function to translate text using Hugging Face MarianMT model
def translate_text(text, src_lang, tgt_lang):
    """Translate *text* from *src_lang* to *tgt_lang* with a MarianMT model.

    The text is translated line by line (in small batches) rather than as a
    single sequence: one long input would be cut off at the model's maximum
    length by ``truncation=True`` and only its beginning would be translated.
    Line breaks of the input are preserved in the output; blank lines are
    passed through untouched.

    NOTE(review): translating per line means a sentence that is hard-wrapped
    across lines is translated in pieces — confirm this trade-off is
    acceptable for the documents being processed.

    Args:
        text: The text to translate.
        src_lang: Source language code used in the model name.
        tgt_lang: Target language code used in the model name.

    Returns:
        The translated text. *text* is returned unchanged when it is empty or
        whitespace-only, or when *src_lang* equals *tgt_lang* (no model is
        loaded in those cases).
    """
    # Nothing to translate: avoid loading a model (and avoid building a
    # same-language model name such as 'opus-mt-fr-fr').
    if not text or not text.strip() or src_lang == tgt_lang:
        return text

    tokenizer, model = _load_marian(f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}')

    batch_size = 16  # bounds memory use of a single generate() call
    lines = text.splitlines()
    pending = [(pos, line) for pos, line in enumerate(lines) if line.strip()]

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        inputs = tokenizer(
            [line for _, line in batch],
            return_tensors="pt",
            padding=True,
            truncation=True,
        )
        translated_ids = model.generate(**inputs)
        decoded = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)
        for (pos, _), translated in zip(batch, decoded):
            lines[pos] = translated

    return "\n".join(lines)

# Function to extract text and images from PDF
def extract_text_and_images(pdf_path):
    """Extract the plain text and the placed images of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one dict per page, in page order. Each dict has:
          - "text": the page text from ``page.get_text("text")``
          - "images": a list of dicts with keys "index" (position of the
            image in ``page.get_images``), "image" (a PIL image), "ext"
            (the extension reported by ``extract_image``) and "bbox"
            (an ``(x0, y0, x1, y1)`` tuple in PyMuPDF page coordinates).
        An image drawn several times on a page yields one entry per
        placement; an image the page references but never draws is skipped.
    """
    extracted_data = []

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(pdf_path) as doc:
        for page in doc:
            page_images = []

            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]

                # img[4] is the image's bits-per-component, not a rectangle;
                # the on-page rectangles must be asked for explicitly.
                placements = page.get_image_rects(xref)
                if not placements:
                    continue

                base_image = doc.extract_image(xref)
                image = Image.open(BytesIO(base_image["image"]))

                for rect in placements:
                    page_images.append({
                        "index": img_index,
                        "image": image,
                        "ext": base_image["ext"],
                        "bbox": tuple(rect),
                    })

            extracted_data.append({
                "text": page.get_text("text"),
                "images": page_images,
            })

    return extracted_data

# Function to translate extracted text and create a new PDF
def create_translated_pdf(extracted_data, tgt_lang, output_pdf_path):
    """Write a letter-sized PDF with the translated text and the page images.

    For every entry of *extracted_data* the images are drawn first, then the
    translated text is laid out on top, wrapped to the printable width and
    starting below the top margin. Text that does not fit continues on
    additional pages.

    Args:
        extracted_data: List of per-page dicts with a "text" string and an
            "images" list; each image dict provides "image" (a PIL image)
            and "bbox" (``x0, y0, x1, y1``).
        tgt_lang: Target language code passed to ``translate_text``.
        output_pdf_path: Path of the PDF file to create.
    """
    # Imported here so the block is self-contained; both packages are
    # already dependencies of this file.
    from langdetect.lang_detect_exception import LangDetectException
    from reportlab.lib.utils import ImageReader, simpleSplit

    page_width, page_height = letter
    margin = 72
    font_name, font_size = "Helvetica", 12
    leading = font_size * 1.2
    text_width = page_width - 2 * margin

    c = canvas.Canvas(output_pdf_path, pagesize=letter)

    for page_data in extracted_data:
        # Adding images to the PDF
        for img_data in page_data["images"]:
            try:
                x0, y0, x1, y1 = img_data["bbox"]
            except (TypeError, ValueError):
                # No usable rectangle for this image, so it cannot be placed.
                continue

            # The bbox has its origin at the top-left with y growing downwards,
            # while ReportLab's origin is bottom-left, so y is flipped.
            # NOTE(review): assumes the source page is as tall as the output
            # page — confirm for non-letter inputs.
            c.drawImage(
                ImageReader(img_data["image"]),  # PIL image used directly, no re-encoding
                x0,
                page_height - y1,
                x1 - x0,
                y1 - y0,
            )

        text = page_data["text"]
        translated_text = ""
        if text.strip():
            try:
                # langdetect may report regional codes such as 'zh-cn'; the
                # model names use the bare language code.
                src_lang = detect_language(text).split("-")[0]
            except LangDetectException:
                # Language could not be determined: keep the original text.
                translated_text = text
            else:
                translated_text = translate_text(text, src_lang, tgt_lang)

        # Adding text to the PDF
        c.setFont(font_name, font_size)
        y = page_height - margin
        if translated_text:
            for line in simpleSplit(translated_text, font_name, font_size, text_width):
                if y < margin:
                    # Out of room: continue on a fresh page (font state is
                    # reset by showPage, so it is set again).
                    c.showPage()
                    c.setFont(font_name, font_size)
                    y = page_height - margin
                c.drawString(margin, y, line)
                y -= leading

        c.showPage()

    c.save()

# Main function to handle the process
def main(pdf_path, tgt_lang, output_pdf_path="translated_document.pdf"):
    """Translate a PDF and save the result as a new PDF.

    Args:
        pdf_path: Path of the PDF to translate.
        tgt_lang: Target language code.
        output_pdf_path: Where to write the translated PDF. Defaults to
            "translated_document.pdf" in the current working directory.
    """
    extracted_data = extract_text_and_images(pdf_path)
    create_translated_pdf(extracted_data, tgt_lang, output_pdf_path)
    print(f"Translated PDF saved to {output_pdf_path}")

ModuleNotFoundError: No module named 'pymupdf'

In [31]:
# Example usage: translate one PDF into the target language.
# Replace the placeholder path with a real file before running this cell.
pdf_path, tgt_lang = 'path_to_your_pdf.pdf', 'fr'  # 'fr' is the target language code
main(pdf_path, tgt_lang)
