In [1]:
!pip install pymupdf transformers

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.m

In [3]:
# Import necessary libraries
import os
import re
from pathlib import Path
import fitz  # PyMuPDF
import torch
from transformers import MarianMTModel, MarianTokenizer

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as one normalized string.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        str: The page texts joined with newlines, with every run of blank
        lines collapsed to a single paragraph break and every run of spaces
        collapsed to one space.
    """
    # The context manager closes the document even when a page fails to load,
    # so the file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        page_texts = [
            doc.load_page(page_num).get_text()
            for page_num in range(len(doc))
        ]

    complete_text = "\n".join(page_texts)

    # Normalize whitespace while keeping paragraph boundaries intact
    complete_text = re.sub(r'\n\s*\n', '\n\n', complete_text)
    complete_text = re.sub(r' +', ' ', complete_text)

    return complete_text

def detect_language(text):
    """Guess the language of ``text`` from the script of its characters.

    Args:
        text: The string to inspect; ``None`` or an empty string is allowed.

    Returns:
        str: ``'ar'`` if any Arabic-script character is present, otherwise
        ``'en'`` if any ASCII Latin letter is present (this does not
        distinguish English from French or other Latin-script languages),
        otherwise ``'unknown'``.
    """
    if not text:
        return 'unknown'

    # Besides the base Arabic block, cover Arabic Supplement, Arabic
    # Extended-A and the two Presentation Forms blocks: text extracted from
    # PDFs is often made of presentation-form glyphs rather than base letters.
    # The last range stops at U+FEFC so the byte-order mark U+FEFF is excluded.
    arabic_chars = (
        r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF'
        r'\uFB50-\uFDFF\uFE70-\uFEFC]'
    )
    if re.search(arabic_chars, text):
        return 'ar'
    # Latin letters (covers French, English, etc.)
    if re.search(r'[a-zA-Z]', text):
        return 'en'
    return 'unknown'

def _split_to_token_budget(text, count_tokens, budget, separators=("\n\n", "\n", " ")):
    """Split ``text`` into stripped pieces that each fit within ``budget`` tokens.

    Pieces are packed greedily at the coarsest separator; a piece that is
    still too large is split again at the next finer separator, and as a last
    resort cut in half by characters.
    """
    text = text.strip()
    if not text:
        return []
    # A single character cannot be split further, so accept it as-is
    if count_tokens(text) <= budget or len(text) <= 1:
        return [text]

    if not separators:
        middle = len(text) // 2
        return (
            _split_to_token_budget(text[:middle], count_tokens, budget, ())
            + _split_to_token_budget(text[middle:], count_tokens, budget, ())
        )

    separator, finer = separators[0], separators[1:]
    chunks = []
    current = ""
    for piece in text.split(separator):
        if not piece.strip():
            continue
        candidate = f"{current}{separator}{piece}" if current else piece
        if count_tokens(candidate) <= budget:
            current = candidate
            continue
        # The piece does not fit next to what was collected: flush first
        if current:
            chunks.append(current.strip())
            current = ""
        if count_tokens(piece) <= budget:
            current = piece
        else:
            chunks.extend(_split_to_token_budget(piece, count_tokens, budget, finer))
    if current:
        chunks.append(current.strip())
    return chunks


def _translate_with_model(text, model_name, device):
    """Translate ``text`` chunk by chunk with the Marian model ``model_name``."""
    print(f"Loading translation model: {model_name}")
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    max_length = tokenizer.model_max_length
    # Keep the same 10-token safety margin below the tokenizer limit
    budget = max(1, max_length - 10)
    chunks = _split_to_token_budget(
        text, lambda piece: len(tokenizer.encode(piece)), budget
    )

    translated_chunks = []
    for i, chunk in enumerate(chunks):
        print(f"Translating chunk {i+1}/{len(chunks)}...")
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
        # Inputs must live on the same device as the model
        inputs = {name: tensor.to(device) for name, tensor in encoded.items()}
        generated = model.generate(**inputs)
        translated_chunks.append(
            tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
        )

    return "\n\n".join(translated_chunks)


def translate_text(text, source_lang, target_lang='fr'):
    """Translate text using an Opus-MT (Marian) model.

    The text is split into chunks that fit the tokenizer's maximum length, so
    no part of the input is dropped by truncation; paragraphs that are too
    long on their own are split at line breaks, then at spaces. The model is
    run on the GPU when CUDA is available.

    Args:
        text: The text to translate.
        source_lang: Source language code used in the model name
            ``Helsinki-NLP/opus-mt-{source_lang}-{target_lang}``.
        target_lang: Target language code (default ``'fr'``).

    Returns:
        str: The translated chunks joined by blank lines. If the text is
        empty, or every translation attempt fails, the original ``text`` is
        returned unchanged (best effort: errors are printed, not raised).
    """
    # Nothing to translate: avoid downloading and loading a model for nothing
    if not text or not text.strip():
        return text

    candidate_sources = [source_lang]
    if source_lang == 'ar':
        # Fallback kept from the original implementation (Arabic dialect code)
        candidate_sources.append("apc")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    for attempt, lang in enumerate(candidate_sources):
        if attempt:
            print("Attempting to use multilingual model...")
        model_name = f"Helsinki-NLP/opus-mt-{lang}-{target_lang}"
        try:
            return _translate_with_model(text, model_name, device)
        except Exception as e:  # deliberately broad: translation is best effort
            print(f"Translation error: {e}")

    print("All translation attempts failed. Returning original text.")
    return text

def process_document(input_file, output_file=None):
    """Process a PDF document: extract text, detect language, translate, save.

    Args:
        input_file: Path of the PDF to process.
        output_file: Path of the translated text file. When omitted, the
            input path with ``_fr`` appended to its stem and a ``.txt``
            suffix is used.

    Returns:
        The path the translation was written to: ``output_file`` as given, or
        the generated ``Path`` when it was omitted.

    Side effects:
        Writes the raw extracted text to ``extracted_text.txt`` in the same
        directory as the output file, creating that directory if needed.
    """
    print(f"Processing {input_file}...")

    # Resolve the output location first so the intermediate file can be
    # written beside it instead of to a hard-coded Kaggle-only directory.
    if not output_file:
        input_path = Path(input_file)
        output_file = input_path.with_stem(f"{input_path.stem}_fr").with_suffix('.txt')
    output_dir = Path(output_file).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Extract text from PDF
    extracted_text = extract_text_from_pdf(input_file)

    # Save the extracted text for inspection
    extracted_path = output_dir / "extracted_text.txt"
    with open(extracted_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)
    print(f"Extracted text saved to {extracted_path}")

    # Detect language
    source_lang = detect_language(extracted_text)
    print(f"Detected language: {source_lang}")

    if source_lang == 'unknown':
        print("Could not detect language. Using 'ar' as default.")
        source_lang = 'ar'

    # Translate to French, unless the detector reports French already
    if source_lang != 'fr':
        print(f"Translating from {source_lang} to French...")
        translated_text = translate_text(extracted_text, source_lang)
    else:
        translated_text = extracted_text
        print("Document already in French. No translation needed.")

    # Save translated text
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(translated_text)

    print(f"Translation saved to {output_file}")
    return output_file

# Select CUDA when the runtime exposes a GPU; otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Fixed input/output locations for the Kaggle environment
input_pdf = "/kaggle/input/note-commune/---06-1.pdf"
output_file = "/kaggle/working/translated_document_fr.txt"

# If the expected PDF is missing, list every PDF found under the input
# directory and ask the user for the right path
if not os.path.exists(input_pdf):
    print(f"File not found: {input_pdf}")
    print("Available files in input directory:")
    input_dir = "/kaggle/input"
    for root, dirs, files in os.walk(input_dir):
        for pdf_name in filter(lambda name: name.endswith('.pdf'), files):
            print(os.path.join(root, pdf_name))

    input_pdf = input("Enter the correct path to the PDF file: ")

# Run the pipeline only when the (possibly corrected) path exists
if not os.path.exists(input_pdf):
    print(f"File not found: {input_pdf}")
else:
    process_document(input_pdf, output_file)

    # Show the first 1000 characters of the translation as a preview
    print("\nPreview of translated document:")
    with open(output_file, "r", encoding="utf-8") as f:
        preview = f.read(1000)
    print(preview + "...")

Using device: cuda
Processing /kaggle/input/note-commune/---06-1.pdf...
Extracted text saved to /kaggle/working/extracted_text.txt
Detected language: ar
Translating from ar to French...
Loading translation model: Helsinki-NLP/opus-mt-ar-fr


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/918k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/824k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/311M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (556 > 512). Running this sequence through the model will result in indexing errors


Translating chunk 1/2...


model.safetensors:   0%|          | 0.00/311M [00:00<?, ?B/s]

Translating chunk 2/2...
Translation saved to /kaggle/working/translated_document_fr.txt

Preview of translated document:
Sur ce sujet, il convient de préciser: 1. En ce qui concerne la vente de parcelles de terrain par les bailleurs de biens immobiliers, il s'agit de la vente de parcelles de terrain par les propriétaires de biens immobiliers: les salariés de la section 1 du chapitre 5 du chapitre 1 du Code de la valeur ajoutée sont soumis à la valeur ajoutée par les bailleurs de fonds immobiliers et, conformément aux dispositions du chapitre 58 du Code de la valeur ajoutée, la vente de parcelles de terrain par les bailleurs de fonds de la section 7 s'entend de la répartition des parcelles de terrain par la rubrique 1. En ce qui concerne la vente de parcelles de terrain par les bailleurs de fonds de la section 5 du chapitre 1 du Code de la valeur ajoutée, les travailleurs de la section 1 du chapitre 1 du Code de la valeur ajoutée sont soumis à la perte de la valeur ajoutée de la vente 