In [2]:
import fitz  # PyMuPDF
from transformers import MarianMTModel, MarianTokenizer

def extract_text(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

def translate_text(text, target_language):
    model_name = f"Helsinki-NLP/opus-mt-en-{target_language}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translated = model.generate(**tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512))
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return translated_text

def replace_text_in_pdf(pdf_path, original_text, translated_text, output_pdf_path):
    doc = fitz.open(pdf_path)
    for page in doc:
        for text_instance in page.search_for(original_text):
            page.insert_text(text_instance[:2], translated_text)
            page.delete(text_instance)
    doc.save(output_pdf_path)

# Paths
pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\data\smallpdf.pdf"
output_pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\output_data\translated_pdf.pdf"
target_language = "fr"  # e.g., French

# Extract text from the PDF
original_text = extract_text(pdf_path)

# Translate the extracted text
translated_text = translate_text(original_text, target_language)

# Replace text in the original PDF with the translated text
replace_text_in_pdf(pdf_path, original_text, translated_text, output_pdf_path)

print(f"Translated PDF saved at: {output_pdf_path}")

Translated PDF saved at: D:\DS WorkFlow\Github\Gen_AI\Langchain\data\translated_pdf.pdf


In [9]:
import fitz  # PyMuPDF
from transformers import MarianMTModel, MarianTokenizer

def extract_text(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

def translate_text(text, target_language):
    model_name = f"Helsinki-NLP/opus-mt-en-{target_language}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translated = model.generate(**tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512))
    translated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)[0]
    return translated_text

def create_translated_pdf(pdf_path, translated_text, target_language, output_pdf_path):
    doc = fitz.open()
    for page in fitz.open(pdf_path):
        try:
            page_text = page.get_text()
            translated_page_text = page_text.replace(original_text, translated_text)
            translated_lines = translated_page_text.split('\n')
            for line_index, line in enumerate(translated_lines):
                # Calculate y-coordinate for each line
                y = page.rect.height - (line_index + 1) * 12  # Assuming font size is 12
                page.insert_text((50, y), line)
        except ValueError as e:
            print(f"Error processing page: {e}")
            # Handle the error gracefully
    if doc.page_count > 0:
        doc.save(output_pdf_path)
        print(f"Translated PDF saved at: {output_pdf_path}")
    else:
        print("No pages were successfully translated.")


# Paths
pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\data\smallpdf.pdf"
target_language = "fr"  # e.g., French
output_pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\output_data\translated_pdf.pdf"

# Extract text from the PDF
original_text = extract_text(pdf_path)
print("Original text extracted:", original_text)

# Translate the extracted text
translated_text = translate_text(original_text, target_language)
print("Translated text:", translated_text)

# Create and save the translated PDF
create_translated_pdf(pdf_path, translated_text, target_language, output_pdf_path)

Original text extracted: Welcome to Smallpdf
Digital Documents—All In One Place
Access Files Anytime, Anywhere 
Enhance Documents in One Click 
Collaborate With Others 
With the new Smallpdf experience, you can 
freely upload, organize, and share digital 
documents. When you enable the ‘Storage’ 
option, we’ll also store all processed files here. 
You can access files stored on Smallpdf from 
your computer, phone, or tablet. We’ll also 
sync files from the Smallpdf Mobile App to our 
online portal
When you right-click on a file, we’ll present 
you with an array of options to convert, 
compress, or modify it. 
Forget mundane administrative tasks. With 
Smallpdf, you can request e-signatures, send 
large files, or even enable the Smallpdf G Suite 
App for your entire organization. 
Ready to take document management to the next level? 

Translated text: Bienvenue à Smallpdf Documents numériques — Tous en un seul endroit Fichiers d'accès à tout moment, n'importe où Améliorer les documents 

In [11]:
# from reportlab.lib.pagesizes import letter
# from reportlab.pdfgen import canvas
# import fitz

# def extract_text_and_formatting(pdf_path):
#     document = fitz.open(pdf_path)
#     pages = {}
#     for page_num in range(len(document)):
#         page = document.load_page(page_num)
#         blocks = page.get_text("dict")["blocks"]
#         page_content = ""
#         for block in blocks:
#             if block["type"] == 0:  # text block
#                 for line in block["lines"]:
#                     for span in line["spans"]:
#                         page_content += span["text"]
#                         if span["flags"] & 4:  # space after
#                             page_content += " "
#                     page_content += "\n"  # new line after each line
#         pages[page_num + 1] = page_content.strip()  # Add 1 to page_num to start page numbering from 1
#     return pages

# def translate_text(text, target_language):
#     # Simulate translation by simply appending the target language code to each line
#     translated_text = {}
#     for page_num, page_content in text.items():
#         translated_text[page_num] = page_content + f" ({target_language})"
#     return translated_text

# def create_translated_pdf(pdf_path, translated_text, output_pdf_path):
#     original_doc = fitz.open(pdf_path)
#     new_doc = fitz.open()
#     for original_page in original_doc:
#         new_page = new_doc.new_page(width=letter[0], height=letter[1])
#         new_page.show_pdf_page(new_page.rect, original_page)
#         page_text = translated_text.get(original_page.number, "")
#         if page_text:
#             # Write translated text on the new page
#             can = canvas.Canvas(new_doc, new_page.rect)
#             can.setFont("Helvetica", 12)
#             for line_num, line in enumerate(page_text.split('\n')):
#                 can.drawString(50, new_page.rect.height - 50 - (line_num * 14), line)  # Adjust position as needed
#             can.save()
#     new_doc.save(output_pdf_path)
#     print(f"Translated PDF saved at: {output_pdf_path}")

# # Paths and settings
# pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\data\smallpdf.pdf"
# target_language = "fr"  # Target language code
# output_pdf_path = r"D:\DS WorkFlow\Github\Gen_AI\Langchain\output_data\translated_pdf.pdf"

# # Extract text from original PDF
# extracted_text = extract_text_and_formatting(pdf_path)

# # Translate extracted text
# translated_text = translate_text(extracted_text, target_language)

# # Create translated PDF
# create_translated_pdf(pdf_path, translated_text, output_pdf_path)

AttributeError: 'Page' object has no attribute 'is_pdf'