In [9]:
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
import fitz  # PyMuPDF
from googletrans import Translator


pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Extract text from image
def extract_text_from_image(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    pages = convert_from_path(pdf_path)
    text = ""
    for page in pages:
        text += pytesseract.image_to_string(page)
    return text

# Translate text
def translate_text(text, src_lang='en', tgt_lang='fr'):
    translator = Translator()
    translated = translator.translate(text, src=src_lang, dest=tgt_lang)
    return translated.text

# Replace text in image
def replace_text_in_image(image_path, translated_text):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    draw.text((10, 10), translated_text, font=font, fill="black")
    # Save the translated image in the same directory
    output_image_path = "translated_image.png"
    image.save(output_image_path)
    print(f"Translated image saved to {output_image_path}")

# Replace text in PDF
def replace_text_in_pdf(pdf_path, src_lang='en', tgt_lang='fr'):
    doc = fitz.open(pdf_path)
    translator = Translator()

    for page in doc:
        blocks = page.get_text("blocks")
        for block in blocks:
            block_text = block[4]
            translated_text = translator.translate(block_text, src=src_lang, dest=tgt_lang).text
            # Clear the original text block by overlaying a white rectangle
            rect = fitz.Rect(block[:4])
            page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

            # Insert translated text at the same position
            page.insert_text((block[0], block[1]), translated_text, fontsize=12, fontname="helv", color=(0, 0, 0), render_mode=0)

    # Save the translated PDF in the same directory
    output_pdf_path = "translated_document.pdf"
    doc.save(output_pdf_path)
    print(f"Translated PDF saved to {output_pdf_path}")

# Main function
def main(file_path, file_type, src_lang='en', tgt_lang='fr'):
    if file_type == 'image':
        text = extract_text_from_image(file_path)
        translated_text = translate_text(text, src_lang, tgt_lang)
        replace_text_in_image(file_path, translated_text)
    elif file_type == 'pdf':
        replace_text_in_pdf(file_path, src_lang, tgt_lang)
    else:
        raise ValueError("Unsupported file type. Use 'image' or 'pdf'.")


In [10]:
main(r"C:\Users\sengu\Downloads\Get_Started_With_Smallpdf.pdf", 'pdf', src_lang='en', tgt_lang='fr')

Translated PDF saved to translated_document.pdf


In [29]:
# Import libraries
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageDraw, ImageFont
import fitz  # PyMuPDF
from transformers import MarianMTModel, MarianTokenizer

# Specify the path to the Tesseract executable
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Extract text from image
def extract_text_from_image(image_path):
    image = Image.open(image_path)
    text = pytesseract.image_to_string(image)
    return text

# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text("text")
    return text

# Translate text using Hugging Face MarianMT model
def translate_text(text, src_lang, tgt_lang):
    # Load pre-trained MarianMT model and tokenizer
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

    # Perform translation
    translated_ids = model.generate(**inputs)

    # Decode translated text
    try:
        translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    except IndexError:
        translated_text = "Translation failed. Please check the input text and try again."
    return translated_text

# Replace text in image
def replace_text_in_image(image_path, translated_text):
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    draw.text((10, 10), translated_text, font=font, fill="black")
    # Save the translated image in the same directory
    output_image_path = "translated_image.png"
    image.save(output_image_path)
    print(f"Translated image saved to {output_image_path}")

# Normalize color values to 0-1 range
def normalize_color(color):
    if isinstance(color, int):
        return fitz.utils.getColor(color)
    elif isinstance(color, (tuple, list)) and len(color) == 3:
        return tuple(c / 255 for c in color)
    else:
        return (0, 0, 0)  # Default to black if color format is unknown

# Replace text in PDF
def replace_text_in_pdf(pdf_path, src_lang='en', tgt_lang='fr'):
    doc = fitz.open(pdf_path)
    translator = Translator()

    for page in doc:
        blocks = page.get_text("blocks")
        for block in blocks:
            block_text = block[4]
            translated_text = translator.translate(block_text, src=src_lang, dest=tgt_lang).text
            # Clear the original text block by overlaying a white rectangle
            rect = fitz.Rect(block[:4])
            page.draw_rect(rect, color=(1, 1, 1), fill=(1, 1, 1))

            # Insert translated text at the same position
            page.insert_text((block[0], block[1]), translated_text, fontsize=12, fontname="helv", color=(0, 0, 0), render_mode=0)

    # Save the translated PDF in the same directory
    output_pdf_path = "translated_document.pdf"
    doc.save(output_pdf_path)
    print(f"Translated PDF saved to {output_pdf_path}")

# Main function
def main(file_path, file_type, src_lang='en', tgt_lang='fr'):
    if file_type == 'image':
        text = extract_text_from_image(file_path)
        translated_text = translate_text(text, src_lang, tgt_lang)
        replace_text_in_image(file_path, translated_text)
    elif file_type == 'pdf':
        replace_text_in_pdf(file_path, src_lang, tgt_lang)
    else:
        raise ValueError("Unsupported file type. Use 'image' or 'pdf'.")

In [30]:
main(r"C:\Users\sengu\Downloads\fossils_tour-2.pdf", 'pdf', src_lang='en', tgt_lang='fr')

IndexError: list index out of range