# OCR a PDF in Portuguese and Split It into Phrases with English Translations

#### Enter the PDF URL in the cell below, then click Runtime > Run All

In [None]:
# Address of the PDF to download and OCR; replace it with your own document's URL
url = 'http://objdigital.bn.br/objdigital2/acervo_digital/div_obrasgerais/drg177349/drg177349.pdf'

#OCR

In [None]:
!pip install pdf2image
!pip install PyPDF2
!pip install pytesseract
!apt-get install tesseract-ocr
!pip install sentencepiece
!pip install --upgrade transformers
!apt-get install poppler-utils 

In [None]:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import sys
from pdf2image import convert_from_path
import os
import io
import cv2
import numpy as np
from PyPDF2 import PdfReader
import requests
from urllib.parse import urlparse
from transformers import MarianTokenizer, MarianMTModel
import concurrent.futures
import re
import json

In [None]:
# Download the PDF. The timeout stops a dead connection from hanging the cell,
# and raise_for_status() fails loudly on an HTTP error instead of letting an
# error page be saved (and later parsed) as if it were the PDF.
response = requests.get(url, timeout=60)
response.raise_for_status()

# Extract the filename from the URL
parsed = urlparse(url)
filename = os.path.basename(parsed.path)

# Save the downloaded bytes under that name in the current working directory
with open(filename, 'wb') as f:
    f.write(response.content)

In [None]:
# Document name: the last component of the URL path, with its extension removed
url_basename = os.path.basename(parsed.path)
doc_name = os.path.splitext(url_basename)[0]

In [None]:
def preprocess_image(image):
    """Turn a colour page scan into a black-and-white image for OCR.

    Args:
        image: Colour image array, expected in BGR channel order (the colour
            conversions below use the BGR constants).

    Returns:
        The single-channel result of Otsu thresholding (pixel values 0 or 255).
    """
    # Stretch contrast on the luma (Y) channel only: gain 1.9, offset -100,
    # clipped back into the 0..255 range
    luma, chroma_r, chroma_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2YCrCb))
    stretched = np.clip(luma * 1.9 - 100, 0, 255).astype(np.uint8)
    contrasted = cv2.cvtColor(
        cv2.merge((stretched, chroma_r, chroma_b)), cv2.COLOR_YCrCb2BGR
    )

    # Attenuate red (x0.8) and green (x0.9) to tone down reds and yellows
    blue, green, red = cv2.split(contrasted)
    red = np.clip(red * 0.8, 0, 255).astype(np.uint8)
    green = np.clip(green * 0.9, 0, 255).astype(np.uint8)
    toned_down = cv2.merge((blue, green, red))

    # Collapse to one channel
    gray = cv2.cvtColor(toned_down, cv2.COLOR_BGR2GRAY)

    # NOTE(review): a median blur with kernel size 1 presumably leaves the
    # image unchanged; confirm whether a larger kernel was intended
    smoothed = cv2.medianBlur(gray, 1)

    # Binarise; with THRESH_OTSU the threshold is chosen automatically
    _, binary = cv2.threshold(smoothed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return binary


In [None]:
# Download Portuguese training data for tesseract.
# Dedicated variable names are used so the notebook-level `url` and `response`
# of the PDF download are not overwritten (re-running the PDF cell afterwards
# would otherwise fetch this file instead of the document).
tessdata_url = 'https://github.com/tesseract-ocr/tessdata/raw/main/por.traineddata'
tessdata_response = requests.get(tessdata_url, timeout=60)
# Fail loudly rather than saving an HTTP error page as the language model
tessdata_response.raise_for_status()

with open('por.traineddata', 'wb') as f:
    f.write(tessdata_response.content)

In [None]:
# Move to the required folder
# NOTE(review): the destination is hard-coded to the "4.00" tessdata directory;
# presumably this matches the Tesseract version installed on the runtime — confirm.
!mv por.traineddata /usr/share/tesseract-ocr/4.00/tessdata/por.traineddata

In [None]:
# OCR the downloaded PDF page by page and append the recognised text to a .txt file
pdf_path = f'/content/{doc_name}.pdf'
output_filename = f"/content/{doc_name}.txt"

# Only the page count is needed from the reader, so the file handle is closed right away
with open(pdf_path, "rb") as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    total_pages = len(pdf_reader.pages)

# Batching keeps the memory from overloading - especially on larger pdf files
batch_size = 10
num_batches = (total_pages + batch_size - 1) // batch_size

# Create folder for images
sub_dir = str(f"/content/{doc_name}/")
os.makedirs(sub_dir, exist_ok=True)

# Loop through, preprocessing images and OCRing
for batch in range(num_batches):
    # start_page / end_page are a 0-based half-open range [start_page, end_page)
    start_page = batch * batch_size
    end_page = min((batch + 1) * batch_size, total_pages)

    # convert_from_path numbers pages from 1 and includes last_page, so the
    # 0-based half-open range maps to first_page=start_page + 1, last_page=end_page.
    # (Passing start_page / end_page - 1 skipped the final page of the PDF.)
    pages = convert_from_path(pdf_path, first_page=start_page + 1, last_page=end_page)

    for i, page in enumerate(pages):
        # 1-based page number of this image within the PDF
        pg_cntr = start_page + i + 1
        filename = f"pg_{pg_cntr}_{doc_name}.jpg"
        page.save(sub_dir + filename)

        # Load the saved image and preprocess it
        img = cv2.imread(sub_dir + filename)
        preprocessed_img = preprocess_image(img)

        # Save preprocessed image
        preprocessed_filename = "preprocessed_" + filename
        cv2.imwrite(sub_dir + preprocessed_filename, preprocessed_img)

        # The file is opened in append mode once per page, so each finished page
        # is written out before the next page is processed. Re-running this cell
        # appends to the existing file; delete it first to avoid duplicated text.
        with io.open(output_filename, 'a+', encoding='utf8') as f:
            f.write(pytesseract.image_to_string(sub_dir + preprocessed_filename, lang='por') + "\n")

#Separate OCRed Text Into Phrases

In [None]:
# Folder that is scanned for OCR output
path = "/content/"

# Names of every entry in that folder
files = os.listdir(path)

# Keep just the names that end in ".txt"
txt_files = list(filter(lambda name: name.endswith(".txt"), files))

In [None]:
# Number of characters read from a text file per iteration
chunk_size = 10000
# Characters that close a phrase
delimiters = [';', '—', '.', '?', '!', ':']

# Define patterns for initials and titles
INITIAL_PATTERN = re.compile(r'^[A-Z]\.$')
TITLE_PATTERN = re.compile(r'^[A-Z][a-z]+\s[A-Z][a-z]+$')

# A word hyphenated across a line break. [^\W\d_] matches any Unicode letter,
# so words containing accented Portuguese letters are re-joined as well.
HYPHEN_BREAK_PATTERN = re.compile(r'([^\W\d_])-\n([^\W\d_])')


def _is_initial(text, pos):
    """Return True when the delimiter at text[pos] closes a one-letter initial such as "J.".

    INITIAL_PATTERN describes exactly two characters (capital letter + period),
    so it is matched against the two-character slice ending at pos. The capital
    must also stand alone, i.e. not be the last letter of a longer word.
    """
    if pos < 1 or not INITIAL_PATTERN.match(text[pos - 1:pos + 1]):
        return False
    return pos == 1 or not text[pos - 2].isalpha()


def _split_phrases(text):
    """Split text after every delimiter that really ends a phrase.

    Returns:
        A (phrases, remainder) tuple; remainder is the trailing text that no
        delimiter has closed yet.
    """
    found = []
    phrase_start = 0
    for pos, char in enumerate(text):
        if char not in delimiters:
            continue

        # NOTE(review): the slice handed to TITLE_PATTERN still ends with the
        # delimiter, while the pattern must end in a lowercase letter, so this
        # test looks like it can never succeed. Left as-is until the intended
        # "title" rule is confirmed.
        is_title = pos > 1 and TITLE_PATTERN.match(text[phrase_start:pos + 1])
        if is_title or _is_initial(text, pos):
            continue

        # The phrase includes its closing delimiter
        found.append(text[phrase_start:pos + 1])
        phrase_start = pos + 1
    return found, text[phrase_start:]


phrases = []
for example in txt_files:
    with open(path + example, 'r', encoding='utf-8') as f:
        # Text read so far that no delimiter has closed yet. It is prepended to
        # the next chunk so a phrase is never cut at a chunk boundary.
        carry = ''
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break

            # Re-join words that were hyphenated across a line break
            text = HYPHEN_BREAK_PATTERN.sub(r'\1\2', carry + chunk)

            # Keep trailing newlines unconverted: if the chunk ends in the
            # middle of a "-\n" break, the next chunk can still complete it
            body = text.rstrip('\n')
            trailing_newlines = text[len(body):]

            # Remaining newlines become spaces, then the text is split
            found, rest = _split_phrases(body.replace('\n', ' '))
            phrases.extend(found)
            carry = rest + trailing_newlines

        # Add the last (unterminated) phrase of the file to the list
        last_phrase = carry.replace('\n', ' ')
        if last_phrase:
            phrases.append(last_phrase)


In [None]:
# Display how many raw phrases the splitting step produced
len(phrases)

#Clean up characters that shouldn't have been recognized and drop empty phrases

In [None]:
# Characters to strip: everything except ASCII letters and digits, accented
# Latin letters, a small set of punctuation marks and the space character.
# À-ÖØ-öø-ÿ covers the Latin-1 accented letters (including À, ü and Ü) while
# skipping the × and ÷ signs that sit inside that code-point range.
# NOTE(review): '-' is not in the allowed set, so hyphenated words such as
# "disse-me" are fused into "disseme" — confirm this is intended.
pattern = re.compile(r'[^a-zA-ZÀ-ÖØ-öø-ÿ0-9,.?!:;()"\' ]')

# A phrase is only worth keeping if it still contains at least one letter
letter_pattern = re.compile(r'[a-zA-ZÀ-ÖØ-öø-ÿ]')

# Clean each phrase in the list of phrases
cleaned_phrases = []
for phrase in phrases:
    cleaned_phrase = pattern.sub('', phrase)

    # Drop phrases left without any letter (page numbers, OCR noise, ...)
    if not letter_pattern.search(cleaned_phrase):
        continue

    # Trim the ends, then collapse runs of whitespace inside the phrase
    cleaned_phrase = re.sub(r'\s+', ' ', cleaned_phrase.strip())

    cleaned_phrases.append(cleaned_phrase)

In [None]:
# Display how many phrases remain after cleaning
len(cleaned_phrases)

#Translate each of the phrases and export to json every 1000 examples

In [None]:
# Language pair this notebook is written for (source, target); these two names
# are not read by any other code visible in this file
src_language, tgt_language = 'pt', 'en'

# One checkpoint name is shared by the tokenizer and the translation model
model_name = 'Helsinki-NLP/opus-mt-ROMANCE-en'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Start with an empty list of translated phrases
translated_phrases = list()

def translate(text):
    """Translate one phrase with the module-level Marian tokenizer and model.

    Args:
        text: The source-language phrase.

    Returns:
        A two-item list ``[text, translation]`` pairing the input phrase with
        its decoded translation.
    """
    # truncation=True caps the input at the tokenizer's maximum length, so an
    # over-long phrase is shortened instead of making generate() fail (one
    # failing phrase would otherwise abort the whole translation run)
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    outputs = model.generate(**inputs)
    # A single phrase was passed in, so only the first generated sequence is decoded
    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return [text, translation]

# Define the number of threads to use
# NOTE(review): the next cell assigns num_threads again before the thread pool
# is created, so this value appears to be unused — confirm.
num_threads = 6

In [None]:
translated_phrases = []
num_threads = 8
# Number of [source, translation] pairs stored in each JSON file
phrases_per_file = 1000


def _write_batch(batch, file_index):
    """Write one batch of [source, translation] pairs to its numbered JSON file."""
    # UTF-8 with ensure_ascii=False keeps accented characters readable in the file
    with open(f'ptbr_phrases_{doc_name}_{file_index}.json', 'w', encoding='utf-8') as f:
        json.dump(batch, f, ensure_ascii=False)


# Process the phrases using a thread pool
with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Submit the translation tasks and store the Future objects in a list
    futures = [executor.submit(translate, text) for text in cleaned_phrases]

    # Collect results in submission order so the output follows the document
    # order; the tasks still run concurrently because all were submitted above
    count = 0
    for future in futures:
        translated_phrases.append(future.result())
        count += 1

        # Write translations to a file every `phrases_per_file` phrases
        if count % phrases_per_file == 0:
            _write_batch(translated_phrases[-phrases_per_file:], count // phrases_per_file)

    # Write only the phrases that have not been written yet; nothing is written
    # when the total is an exact multiple of `phrases_per_file`
    remainder = count % phrases_per_file
    if remainder:
        _write_batch(translated_phrases[-remainder:], count // phrases_per_file + 1)

