In [2]:
pip install transformers sentencepiece torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install python-docx




In [4]:
pip install pdfplumber

Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
Collecting pdfminer.six==20251107 (from pdfplumber)
  Downloading pdfminer_six-20251107-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-5.2.0-py3-none-win_amd64.whl.metadata (67 kB)
Downloading pdfplumber-0.11.8-py3-none-any.whl (60 kB)
Downloading pdfminer_six-20251107-py3-none-any.whl (5.6 MB)
   ---------------------------------------- 0.0/5.6 MB ? eta -:--:--
   ----- ---------------------------------- 0.8/5.6 MB 6.1 MB/s eta 0:00:01
   ----------------------------------- ---- 5.0/5.6 MB 15.6 MB/s eta 0:00:01
   ---------------------------------------- 5.6/5.6 MB 14.5 MB/s  0:00:00
Downloading pypdfium2-5.2.0-py3-none-win_amd64.whl (3.1 MB)
   ---------------------------------------- 0.0/3.1 MB ? eta -:--:--
   ------------------------------------- -- 2.9/3.1 MB 21.4 MB/s eta 0:00:01
   ---------------------------------------- 3.1/3.1 MB 

In [5]:
pip install fpdf

Note: you may need to restart the kernel to use updated packages.


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import docx
import pdfplumber
from fpdf import FPDF

# -------------------------------
# FONCTIONS DE BASE
# -------------------------------

# Lire un fichier Word
def read_word(file_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        file_path: Path of the ``.docx`` file to open.

    Returns:
        The text of every paragraph in the document body, joined with
        newline characters (no trailing newline).
    """
    paragraph_texts = []
    for paragraph in docx.Document(file_path).paragraphs:
        paragraph_texts.append(paragraph.text)
    return "\n".join(paragraph_texts)

# Lire un fichier PDF
def read_pdf(file_path):
    """Return the text of a PDF file, one page after the other.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The extracted text of every page, each page followed by a newline.
        A page that yields no text contributes only its newline.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may give None for a page without extractable text
        # (e.g. a scanned image, depending on the pdfplumber version);
        # treat that as an empty page instead of failing on None + str.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "".join(page_text + "\n" for page_text in page_texts)

# Traduire le texte avec MarianMT
def translate_text(text, src_lang="fr", tgt_lang="en"):
    """Translate ``text`` line by line with a Helsinki-NLP MarianMT model.

    Args:
        text: Text to translate; lines are separated by ``"\\n"``.
        src_lang: Source language code used in the model name.
        tgt_lang: Target language code used in the model name.

    Returns:
        The translation, with every input line (blank ones included)
        mapped to one output line terminated by ``"\\n"``.

    Notes:
        The tokenizer and model are loaded once per language pair and then
        reused. A line that does not fit in the model's input window is cut
        at sentence punctuation and translated piece by piece; a single
        sentence that is still too long is truncated by the tokenizer
        rather than crashing generation.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    tokenizer, model = _load_marian(model_name)
    # Use the tokenizer's declared limit, capped at 512 because some
    # tokenizers report a huge placeholder value when no limit is configured.
    max_tokens = min(getattr(tokenizer, "model_max_length", 512) or 512, 512)

    translated_lines = []
    for line in text.split("\n"):
        if line.strip() == "":
            # Keep blank lines so the paragraph layout survives translation.
            translated_lines.append("")
            continue
        pieces = _split_to_fit(line, tokenizer, max_tokens)
        batch = tokenizer(
            pieces,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_tokens,
        )
        generated = model.generate(**batch)
        translated_lines.append(
            " ".join(
                tokenizer.decode(ids, skip_special_tokens=True) for ids in generated
            )
        )

    return "".join(line + "\n" for line in translated_lines)


# Already-loaded (tokenizer, model) pairs, keyed by model name.
_MARIAN_CACHE = {}


def _load_marian(model_name):
    """Return the (tokenizer, model) pair for ``model_name``, loading it only once."""
    if model_name not in _MARIAN_CACHE:
        _MARIAN_CACHE[model_name] = (
            MarianTokenizer.from_pretrained(model_name),
            MarianMTModel.from_pretrained(model_name),
        )
    return _MARIAN_CACHE[model_name]


def _split_to_fit(line, tokenizer, max_tokens):
    """Split ``line`` after sentence punctuation into pieces of at most ``max_tokens`` tokens."""
    import re

    if len(tokenizer.encode(line)) <= max_tokens:
        return [line]

    pieces = []
    current = ""
    for sentence in re.split(r"(?<=[.!?;:])\s+", line):
        candidate = f"{current} {sentence}" if current else sentence
        if current and len(tokenizer.encode(candidate)) > max_tokens:
            # Adding this sentence would overflow: close the current piece.
            pieces.append(current)
            current = sentence
        else:
            current = candidate
    if current:
        pieces.append(current)
    return pieces

# Générer un document PDF
def save_to_pdf(text, output_path):
    """Write ``text`` to a single-column PDF, one paragraph per input line.

    Args:
        text: Text to render; lines are separated by ``"\\n"``.
        output_path: Path of the PDF file to create.

    Notes:
        The built-in "Arial" core font only covers Latin-1. Common
        typographic characters are mapped to plain equivalents and any
        other character outside Latin-1 is written as ``?`` so that the
        export does not fail with an encoding error.
    """
    # Typographic characters that are frequent in prose but not in Latin-1.
    to_plain = str.maketrans({
        "\u2018": "'", "\u2019": "'",
        "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-",
        "\u2026": "...",
        "\u0153": "oe", "\u0152": "OE",
    })

    pdf = FPDF()
    pdf.add_page()
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_font("Arial", size=12)

    for line in text.split("\n"):
        safe_line = (
            line.translate(to_plain).encode("latin-1", "replace").decode("latin-1")
        )
        # Start every paragraph at the left margin: with width 0 the cell
        # spans from the cursor to the right margin, and fpdf2 (unlike the
        # legacy fpdf package) does not move the cursor back by default.
        pdf.set_x(pdf.l_margin)
        pdf.multi_cell(0, 10, safe_line)
    pdf.output(output_path)

# -------------------------------
# UTILISATION
# -------------------------------

# 1. Load the source document.
file_path = "document.pdf"  # or document.docx
output_path = "translated_document.pdf"

# Compare the extension case-insensitively so "REPORT.PDF" or "Notes.DOCX"
# are accepted as well.
file_name_lower = file_path.lower()
if file_name_lower.endswith(".pdf"):
    original_text = read_pdf(file_path)
elif file_name_lower.endswith(".docx"):
    original_text = read_word(file_path)
else:
    raise ValueError("Format non supporté")

# 2. Translate from French to English.
translated_text = translate_text(original_text, src_lang="fr", tgt_lang="en")

# 3. Save the translation as a PDF.
save_to_pdf(translated_text, output_path)

print(f"Traduction terminée ! Fichier généré : {output_path}")
