<a href="https://colab.research.google.com/github/forbiddenvelocity/ocr-pdf-text/blob/main/multipleLanguages.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install easyocr
!pip install langdetect
!pip install pymupdf
!pip install rapidfuzz


Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->easyocr)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==

In [None]:
import cv2
import easyocr
from langdetect import detect, LangDetectException
import torch
from PIL import Image
from google.colab import files
import fitz  # PyMuPDF
from rapidfuzz import distance
from nltk.metrics import edit_distance

# One EasyOCR reader per supported language (English, Hindi, Telugu), created
# in that order. Each reader uses the GPU only when torch reports that CUDA is
# available.
reader_en, reader_hi, reader_te = (
    easyocr.Reader([language_code], gpu=torch.cuda.is_available())
    for language_code in ('en', 'hi', 'te')
)

# Function to extract text using EasyOCR
def extract_text_from_image(image, reader):
    """Run OCR on ``image`` with ``reader`` and return the text as one string.

    ``reader.readtext`` is called with ``detail=0`` and ``paragraph=True``;
    the pieces it returns are joined with single spaces.

    Args:
        image: Whatever ``reader.readtext`` accepts; callers in this file pass
            both file paths and image arrays.
        reader: Object exposing a ``readtext`` method (an EasyOCR reader).

    Returns:
        The recognised text pieces joined by ``" "``.
    """
    text_blocks = reader.readtext(image, paragraph=True, detail=0)
    separator = " "
    return separator.join(text_blocks)

# Function to detect the language of the extracted text
def detect_language(text):
    """Return the language code that ``detect`` reports for ``text``.

    Returns:
        The value returned by ``detect(text)``, or ``None`` when ``detect``
        raises ``LangDetectException``.
    """
    try:
        language_code = detect(text)
    except LangDetectException:
        language_code = None
    return language_code

# Function to detect tables in an image using OpenCV
def detect_tables(image, scale=30, min_width=100, min_height=100):
    """Crop table-like regions out of an image using line morphology.

    The image is converted to grayscale, inverted and adaptively thresholded.
    Long horizontal and vertical strokes are then isolated with an
    erode/dilate pass using one-pixel-thick rectangular kernels, the two line
    masks are combined, and the bounding box of every external contour that
    is large enough is cropped from the grayscale image.

    Args:
        image: BGR image array (the caller passes the result of
            ``cv2.imread``).
        scale: Divisor applied to the image width/height to obtain the length
            of the horizontal/vertical kernels. Defaults to 30.
        min_width: A bounding box must be strictly wider than this (pixels).
        min_height: A bounding box must be strictly taller than this (pixels).

    Returns:
        A list of grayscale crops, one per accepted bounding box.
    """
    img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    img_bin = cv2.adaptiveThreshold(~img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, -2)

    # Kernel lengths are clamped to 1: for images smaller than `scale` pixels
    # the integer division would give 0, which getStructuringElement rejects.
    horizontal_size = max(1, img_bin.shape[1] // scale)
    horizontal_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (horizontal_size, 1))
    # erode/dilate return new arrays, so img_bin itself is left untouched.
    horizontal = cv2.dilate(cv2.erode(img_bin, horizontal_structure), horizontal_structure)

    vertical_size = max(1, img_bin.shape[0] // scale)
    vertical_structure = cv2.getStructuringElement(cv2.MORPH_RECT, (1, vertical_size))
    vertical = cv2.dilate(cv2.erode(img_bin, vertical_structure), vertical_structure)

    # Combine horizontal and vertical lines. A bitwise OR keeps the mask at
    # 0/255; uint8 `+` would wrap 255 + 255 around to 254 where lines cross.
    mask = cv2.bitwise_or(horizontal, vertical)

    # Find contours
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    table_images = []

    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        if w > min_width and h > min_height:  # Filtering small boxes
            table_images.append(img[y:y+h, x:x+w])

    return table_images

# Function to save extracted text to a file
def save_text_to_file(text, file_name):
    """Write ``text`` to ``file_name`` as UTF-8, replacing any existing content.

    Args:
        text: String to write.
        file_name: Destination path, opened in ``'w'`` mode.
    """
    with open(file_name, mode='w', encoding='utf-8') as output:
        output.write(text)

# Function to convert PDF to images
def pdf_to_images(pdf_path):
    """Render every page of a PDF to a PIL image.

    Each page is rasterised with ``page.get_pixmap()`` using its default
    settings and the pixel buffer is copied into an ``"RGB"`` PIL image.
    NOTE(review): this relies on the default pixmap being RGB without an
    alpha channel - confirm if ``get_pixmap`` arguments are ever added.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A list of PIL images, one per page, in page order.
    """
    images = []
    # Opening the document as a context manager closes it (and releases the
    # underlying file) even if rendering a page fails; previously it was
    # never closed.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap()
            images.append(Image.frombytes("RGB", [pix.width, pix.height], pix.samples))
    return images

# Upload your files
# `files` is imported from google.colab above. The processing loop further down
# iterates over uploaded.keys() and uses each key as a file name.
uploaded = files.upload()

# Function to calculate Character Error Rate (CER)
def calculate_cer(predicted_text, ground_truth_text):
    """Return the character error rate of ``predicted_text``.

    The rate is the Levenshtein distance between the two strings divided by
    the length of ``ground_truth_text``.

    Args:
        predicted_text: Text produced by OCR.
        ground_truth_text: Reference text.

    Returns:
        The error rate as a float, or ``float('inf')`` when the ground truth
        is empty (the rate is undefined there; this matches the guarded
        ``calculate_cer`` defined later in this file instead of raising
        ``ZeroDivisionError``).
    """
    if not ground_truth_text:
        return float('inf')
    return distance.Levenshtein.distance(predicted_text, ground_truth_text) / len(ground_truth_text)

# Function to calculate Word Error Rate (WER)
def calculate_wer(predicted_text, ground_truth_text):
    """Return the word error rate of ``predicted_text``.

    Both texts are split on whitespace; the rate is the edit distance between
    the two word lists divided by the number of ground-truth words.

    Args:
        predicted_text: Text produced by OCR.
        ground_truth_text: Reference text.

    Returns:
        The error rate as a float, or ``float('inf')`` when the ground truth
        contains no words (empty or whitespace-only), where the rate is
        undefined and the division would otherwise raise
        ``ZeroDivisionError``.
    """
    ground_truth_words = ground_truth_text.split()
    if not ground_truth_words:
        return float('inf')
    predicted_words = predicted_text.split()
    return edit_distance(predicted_words, ground_truth_words) / len(ground_truth_words)

# Function to load ground truth text from a file
def load_ground_truth(file_name):
    """Read ``file_name`` as UTF-8 and return its content stripped.

    Args:
        file_name: Path of the text file holding the reference text.

    Returns:
        The file content with leading and trailing whitespace removed.
    """
    with open(file_name, encoding='utf-8') as handle:
        contents = handle.read()
    return contents.strip()

# The reference text does not change between pages, so it is read once here
# rather than on every page. A missing file only disables the CER/WER report
# instead of aborting after the OCR work has already been done.
try:
    ground_truth_text = load_ground_truth('ground_truth.txt')  # Replace with your ground truth file
except FileNotFoundError:
    print("Ground truth file 'ground_truth.txt' not found. Skipping CER/WER.")
    ground_truth_text = None

for file_name in uploaded:
    print(f"Processing file: {file_name}")

    # PDFs are rendered page by page; anything else is opened as one image.
    if file_name.lower().endswith('.pdf'):
        images = pdf_to_images(file_name)
    else:
        images = [Image.open(file_name)]

    for page_num, image in enumerate(images):
        # Each page is written to disk so that the readers and cv2.imread
        # below all work from the same PNG file.
        image_path = f"{file_name}_page_{page_num + 1}.png"
        image.save(image_path)

        # OCR the page once per language. Insertion order (en, hi, te) is
        # also the order in which the texts are concatenated for detection.
        texts_by_language = {
            'en': ("English", extract_text_from_image(image_path, reader_en)),
            'hi': ("Hindi", extract_text_from_image(image_path, reader_hi)),
            'te': ("Telugu", extract_text_from_image(image_path, reader_te)),
        }

        detected_language = detect_language(
            " ".join(text for _, text in texts_by_language.values())
        )

        if detected_language in texts_by_language:
            language_name, extracted_text = texts_by_language[detected_language]
            print(f"Detected language: {language_name}")
        else:
            print("Language not detected or supported. Defaulting to English.")
            extracted_text = texts_by_language['en'][1]

        print("Extracted Text:\n", extracted_text)

        text_file_name = f"{image_path}_extracted_text.txt"
        save_text_to_file(extracted_text, text_file_name)

        # Error rates are undefined without reference text, so they are only
        # reported when the ground truth was found and is not empty.
        if ground_truth_text:
            cer = calculate_cer(extracted_text, ground_truth_text)
            wer = calculate_wer(extracted_text, ground_truth_text)

            print(f"CER: {cer:.2f}")
            print(f"WER: {wer:.2f}")

        # NOTE(review): table crops are OCR'd with the English and Hindi
        # readers only; the Telugu reader is not applied here - confirm that
        # this is intended.
        table_images = detect_tables(cv2.imread(image_path))
        for i, table_image in enumerate(table_images):
            table_image_pil = Image.fromarray(table_image)
            display(table_image_pil)
            table_text = extract_text_from_image(table_image, reader_en) + "\n" + extract_text_from_image(table_image, reader_hi)
            print(f"Table {i+1} Text:\n", table_text)

            table_text_file_name = f"{image_path}_table_{i+1}_text.txt"
            save_text_to_file(table_text, table_text_file_name)


In [None]:
import zipfile
import os

# Location of the uploaded archive and the directory it is unpacked into
zip_path = '/content/GROTOAP2-sample.zip'
extract_dir = '/content/grotoap2_sample'

# Make sure the target directory exists (no error if it already does)
os.makedirs(extract_dir, exist_ok=True)

# Unpack every member of the archive into the target directory
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_dir)


In [None]:
!pip install easyocr
!pip install rapidfuzz
!pip install lxml
!pip install pdf2image
!apt-get install poppler-utils


Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 45 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.4 [186 kB]
Fetched 186 kB in 2s (115 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 121925 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.4_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.4) ...
Setting up poppler-utils (22.02.0-2ubuntu0.4) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Display the contents of download_pdfs.sh
!cat /content/grotoap2_sample/grotoap2/download_pdfs.sh

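# Run the download_pdfs.sh script
# NOTE(review): the captured output below ends with "pdflinks*.txt: No such file or directory", i.e. the script found no link files after its `cd` into the dataset directory, so this run downloaded no PDFs.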
!bash /content/grotoap2_sample/grotoap2/download_pdfs.sh


#!/bin/bash

OIFS="$IFS"
IFS=' '

# Navigate to the correct directory
cd /content/grotoap2_sample/grotoap2/dataset || exit

for linksfile in pdflinks*.txt
do
    while read -r line
    do
        read -ra params <<< "${line}"
        source=${params[1]}
        dest=${params[0]}.pdf
        destcopy=$dest.copy
        if [ ! -f $dest ]; then
            wget $source -O $destcopy
            mv $destcopy $dest
            sleep 2
        fi
    done < "${linksfile}"
done

IFS="$OIFS"
/content/grotoap2_sample/grotoap2/download_pdfs.sh: line 9: pdflinks*.txt: No such file or directory


In [None]:
import xml.etree.ElementTree as ET

# Function to inspect and print full XML content
def inspect_full_xml(file_path):
    """Parse the XML file at ``file_path`` and dump its root element.

    The whole tree is written to standard output by ``ET.dump``; nothing is
    returned.
    """
    # Print the full XML structure
    ET.dump(ET.parse(file_path).getroot())

# Inspect a few XML files
# dataset_dir is also assigned in the evaluation cell further down; it is set
# here too so this cell does not raise NameError when the notebook is run from
# top to bottom. Keep both assignments in sync.
dataset_dir = '/content/grotoap2_sample/grotoap2/dataset/00'

# os.listdir returns names in arbitrary order; sorting makes "the first five"
# the same files on every run.
xml_files = sorted(
    os.path.join(dataset_dir, file)
    for file in os.listdir(dataset_dir)
    if file.endswith('.cxml')
)

for xml_file in xml_files[:5]:
    print(f"Inspecting {xml_file}:")
    inspect_full_xml(xml_file)
    print("-" * 40)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
<WordNext Value="6298" />
<WordNumChars Value="" />
<Character>
<CharacterID Value="36200" />
<CharacterCorners>
<Vertex x="489.1" y="171.4" />
<Vertex x="494.3" y="178.3" />
</CharacterCorners>
<CharacterNext Value="36201" />
<GT_Text Value="N" />
</Character>
<Character>
<CharacterID Value="36201" />
<CharacterCorners>
<Vertex x="494.3" y="171.4" />
<Vertex x="496.1" y="178.3" />
</CharacterCorners>
<CharacterNext Value="36202" />
<GT_Text Value="," />
</Character>
</Word>
<Word>
<WordID Value="6298" />
<WordCorners>
<Vertex x="498.2" y="171.4" />
<Vertex x="503.3" y="178.3" />
</WordCorners>
<WordNext Value="6299" />
<WordNumChars Value="" />
<Character>
<CharacterID Value="36202" />
<CharacterCorners>
<Vertex x="498.2" y="171.4" />
<Vertex x="501.3" y="178.3" />
</CharacterCorners>
<CharacterNext Value="36203" />
<GT_Text Value="e" />
</Character>
<Character>
<CharacterID Value="36203" />
<CharacterCorners>
<Vertex x=

In [None]:
import xml.etree.ElementTree as ET
import easyocr
from rapidfuzz import distance
from pdf2image import convert_from_path
import cv2
import numpy as np
import os

# Initialize EasyOCR reader
# English-only reader, explicitly created with gpu=False.
# NOTE(review): the readers created earlier in this file pass
# gpu=torch.cuda.is_available(); confirm that CPU-only is intentional here.
reader = easyocr.Reader(['en'], gpu=False)

# Function to calculate CER
def calculate_cer(predicted_text, ground_truth_text):
    """Return the character error rate of ``predicted_text``.

    The rate is the Levenshtein distance between the two strings divided by
    the length of ``ground_truth_text``. An empty ground truth yields
    ``float('inf')`` to signal that the rate cannot be computed.
    """
    reference_length = len(ground_truth_text)
    if reference_length == 0:
        # Infinity marks the result as an error rather than a real rate
        return float('inf')
    edits = distance.Levenshtein.distance(predicted_text, ground_truth_text)
    return edits / reference_length

# Path to the dataset directory
dataset_dir = '/content/grotoap2_sample/grotoap2/dataset/00'

# Counter to limit processing to the first 10 images/PDFs
counter = 0
limit = 10


def ground_truth_from_cxml(xml_path):
    """Rebuild the first page's ground-truth text from a ``.cxml`` file.

    Characters are concatenated inside their ``<Word>`` element and the words
    are joined with single spaces. Joining every character with a space
    instead produces text like "v o n K ä n e l", which cannot match OCR
    output and inflates the CER.

    Only the first ``<Page>`` element is read, because only the first PDF page
    is OCR'd; when the file has no ``<Page>`` element the whole document is
    used.
    NOTE(review): the ``<Page>`` element name is assumed from the TrueViz
    layout - confirm against the dataset files.

    Args:
        xml_path: Path to the ``.cxml`` ground-truth file.

    Returns:
        The ground-truth text, or an empty string when no characters are
        found.
    """
    root = ET.parse(xml_path).getroot()
    first_page = root.find('.//Page')
    scope = root if first_page is None else first_page

    words = []
    for word in scope.iter('Word'):
        letters = ''.join(
            gt_text.attrib.get('Value', '')
            for gt_text in word.findall('Character/GT_Text')
        )
        if letters:
            words.append(letters)
    return ' '.join(words)


# Iterate through the dataset
# The third os.walk item is named file_names so that it does not rebind
# `files`, which an earlier cell imports from google.colab.
for root_dir, dir_names, file_names in os.walk(dataset_dir):
    if counter >= limit:
        break
    # os.walk yields entries in arbitrary order; sorting (in place for the
    # directories, so the walk itself follows it) keeps runs reproducible.
    dir_names.sort()
    for file in sorted(file_names):
        if counter >= limit:
            break
        if not file.endswith('.cxml'):
            continue

        # Extract the ground truth text from the XML file
        xml_path = os.path.join(root_dir, file)
        ground_truth_text = ground_truth_from_cxml(xml_path)

        # Skip if ground truth text is empty
        if not ground_truth_text:
            print(f"Skipping {file} due to empty ground truth text.")
            continue

        # Corresponding PDF path (downloaded previously)
        file_id = os.path.splitext(file)[0]
        pdf_path = os.path.join(root_dir, f'{file_id}.pdf')

        if not os.path.exists(pdf_path):
            print(f"PDF file does not exist: {pdf_path}")
            continue

        try:
            # Render only the first page; converting the whole PDF just to
            # keep images[0] wastes time and memory.
            images = convert_from_path(pdf_path, first_page=1, last_page=1)
            if not images:
                continue
            first_page_image = cv2.cvtColor(np.array(images[0]), cv2.COLOR_RGB2BGR)

            # Perform OCR on the first page image
            ocr_result = reader.readtext(first_page_image, detail=0)
        except Exception as e:
            # Best-effort batch run: report the failing file and move on
            print(f"Error processing file {pdf_path}: {e}")
            continue

        extracted_text = ' '.join(ocr_result)

        # Calculate CER
        cer = calculate_cer(extracted_text, ground_truth_text)
        print(f"CER: {cer:.2f}")

        # Display the results
        print(f"Ground Truth: {ground_truth_text}")
        print(f"Extracted Text: {extracted_text}")

        counter += 1




CER: 0.94
Ground Truth: v o n K ä n e l e t a l . B M C P s y c h i a t r y 2 0 1 1 , 1 1 : 9 8 h t t p : / / w w w . b i o m e d c e n t r a l . c o m / 1 4 7 1 - 2 4 4 X / 1 1 / 9 8 R E S E A R C H A R T I C L E O p e n A c c e s s D i s t r e s s r e l a t e d t o m y o c a r d i a l i n f a r c t i o n a n d c a r d i o v a s c u l a r o u t c o m e : a r e t r o s p e c t i v e o b s e r v a t i o n a l s t u d y R o l a n d v o n K ä n e l 1 , 2 * , R o m a n H a r i 1 , J e a n - P a u l S c h m i d 2 , H u g o S a n e r 2 a n d S t e f a n B e g r é 1 A b s t r a c t B a c k g r o u n d : D u r i n g a c u t e c o r o n a r y s y n d r o m e s p a t i e n t s p e r c e i v e i n t e n s e d i s t r e s s . W e h y p o t h e s i z e d t h a t r e t r o s p e c t i v e r a t i n g s o f p a t i e n t s ’ M I - r e l a t e d f e a r o f d y i n g , h e l p l e s s n e s s , o r p a i n , a l l a s s e s s e d w i t h i n t h e f i r s t y e a r p o s t - M I , a r e a s s o c i a 

KeyboardInterrupt: 