<a href="https://colab.research.google.com/github/deekshak77/OCR_TASK/blob/main/OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Python dependencies. %pip installs into the environment of the running kernel.
# The three pinned versions are the ones this notebook was last run with.
%pip install -q pdfplumber==0.11.5 pytesseract==0.3.13 pdf2image==1.17.0 pandas openpyxl
# System binaries: pdf2image shells out to poppler, and pytesseract is only a
# wrapper around the tesseract executable, so both must be installed for the
# OCR fallback to work.
!apt-get install -y -qq poppler-utils tesseract-ocr



Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.5-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pdfplumber
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from google.colab import files

def extract_tables_from_pdf(pdf_path, page_number):
    """Extract every table pdfplumber detects on one page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: 1-based number of the page to read.

    Returns:
        A list with one DataFrame per detected table, built directly from the
        rows pdfplumber returns (no row is promoted to a header). The list is
        empty when no table is found on the page.

    Raises:
        ValueError: If page_number is less than 1.
        IndexError: If page_number is greater than the number of pages.
    """
    # Without this guard, page 0 or a negative page would become a negative
    # list index and silently read a page counted from the end of the document.
    if page_number < 1:
        raise ValueError(f"page_number must be >= 1, got {page_number}")

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        if page_number > page_count:
            raise IndexError(
                f"page_number {page_number} is out of range; "
                f"{pdf_path} has {page_count} page(s)"
            )
        page = pdf.pages[page_number - 1]
        return [pd.DataFrame(table) for table in page.extract_tables()]

def perform_ocr_on_page(pdf_path, page_number):
    """Rasterize one PDF page and return the text Tesseract reads from it.

    Args:
        pdf_path: Path of the PDF file to render.
        page_number: Page passed to pdf2image as both first and last page.

    Returns:
        The OCR text of each rendered image, joined with newlines. An empty
        string is returned when no image is rendered.
    """
    rendered_pages = convert_from_path(
        pdf_path,
        first_page=page_number,
        last_page=page_number,
    )
    page_texts = [pytesseract.image_to_string(img) for img in rendered_pages]
    return "\n".join(page_texts)

# PDFs to process, mapped to the 1-based page number to extract from each.
pdf_files = {
    "cardio_structured.pdf": 6,
    "prot_sap_102.pdf": 50,
    "prot_sap_1.pdf": 14
}

output_file = "/content/extracted_tables.xlsx"

# Excel limits worksheet names to 31 characters; longer names can make the
# workbook unreadable in some applications.
MAX_SHEET_NAME_LENGTH = 31


def _sheet_name(pdf_name, suffix):
    """Return "<pdf_name>_<suffix>", shortened to fit Excel's sheet-name limit.

    Names that already fit are returned unchanged. Otherwise the ".pdf"
    extension is dropped and the file stem is trimmed; the suffix is kept
    intact because it identifies the page and table. Two long file names that
    share a prefix can shorten to the same sheet name.
    """
    name = f"{pdf_name}_{suffix}"
    if len(name) <= MAX_SHEET_NAME_LENGTH:
        return name
    stem = pdf_name[:-4] if pdf_name.lower().endswith(".pdf") else pdf_name
    room = max(MAX_SHEET_NAME_LENGTH - len(suffix) - 1, 0)
    return f"{stem[:room]}_{suffix}"[:MAX_SHEET_NAME_LENGTH]


uploaded = files.upload()

# Collect (sheet name, DataFrame) pairs first and open the workbook only when
# there is something to write: openpyxl cannot save a workbook with no sheets,
# so closing an empty writer would raise.
sheets = []

for pdf_name, page_number in pdf_files.items():
    print(f"Processing {pdf_name} - Page {page_number}...")
    pdf_path = f"/content/{pdf_name}"

    if pdf_name not in uploaded:
        print(f"Error: {pdf_name} not found. Please upload the file.")
        continue

    try:
        tables = extract_tables_from_pdf(pdf_path, page_number)

        if tables:
            for i, table in enumerate(tables):
                sheets.append((_sheet_name(pdf_name, f"Page{page_number}_Table{i+1}"), table))
        else:
            # No table detected: fall back to OCR. The page text is stored as
            # a single row with one column per line of text.
            extracted_text = perform_ocr_on_page(pdf_path, page_number)
            df_text = pd.DataFrame([extracted_text.split("\n")])
            sheets.append((_sheet_name(pdf_name, f"Page{page_number}_OCR"), df_text))

    except Exception as e:
        # Best effort: report the failure and continue with the next PDF.
        print(f"Error processing {pdf_name} - Page {page_number}: {e}")

if sheets:
    # The context manager saves and closes the workbook even if a write fails.
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        for sheet_name, frame in sheets:
            frame.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Extraction completed! Saved as {output_file}")
    files.download(output_file)
else:
    print("Nothing was extracted; no Excel file was written.")


Saving cardio_structured.pdf to cardio_structured.pdf
Saving prot_sap_1.pdf to prot_sap_1.pdf
Saving prot_sap_102.pdf to prot_sap_102.pdf
Saving sample.xlsx to sample.xlsx
Processing cardio_structured.pdf - Page 6...




Processing prot_sap_102.pdf - Page 50...
Processing prot_sap_1.pdf - Page 14...
Extraction completed! Saved as /content/extracted_tables.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>