# Improve data extraction

In [1]:
import cv2
import json
import pytesseract

from pdf2image import convert_from_path
from textblob import TextBlob

In [2]:
filepath = "./imt/files/86971.pdf"

# 1. OCR

In [9]:

images = convert_from_path(filepath)
text = "\n".join(pytesseract.image_to_string(img) for img in images)

with open("./output.txt", "w") as f:
    f.write(text)

In [3]:
import pdfplumber

filepath2 = "./tables.pdf"


def extract_pdf_content_by_pdfplumber(filepath) -> list[dict]:
    page_results = []
    
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            extracted_tables = []
            tables = page.extract_tables()
            page_text = page.extract_text()
            for i, table in enumerate(tables):
                table_content = []
                # Convert table to a list of dictionaries
                headers = table[0]  # First row as headers
                rows = table[1:]  # Remaining rows
                
                for row in rows:
                    row_text = ", ".join(f"{headers[i]}: {row[i]}" for i in range(len(row)))
                    table_content.append(row_text)

                table_str = f"Table {i} content:\n" + "\n".join(table_content)
                extracted_tables.append(table_str)
            
            tables_str = "\n\n".join(extracted_tables)

            page_results.append({
                "tables": tables_str,
                "text": page_text
            })
    

    return page_results

result = extract_pdf_content_by_pdfplumber(filepath)

print(json.dumps(result, indent=2))

[
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  },
  {
    "tables": "",
    "text": ""
  }
]


In [4]:
result

[{'tables': 'Table 0 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz\n\nTable 1 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz',
  'text': 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz'},
 {'tables': 'Table 0 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz\n\nTable 1 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz',
  'text': 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz\nText here,\nNext text'}]

In [11]:
images = convert_from_path(filepath2)
ocr_pages = [pytesseract.image_to_string(img) for img in images]
ocr_pages

['Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\n',
 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\nText here,\n\nNext text\n\n']

In [12]:
pages = [
    {
        "pdfplumber": pdfplumber_result,
        "ocr_text": orc_text
    }
    for pdfplumber_result, orc_text in zip(result, ocr_pages)
]
pages

[{'pdfplumber': {'tables': 'Table 0 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz\n\nTable 1 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz',
   'text': 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz'},
  'ocr_text': 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\nId Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n\n3 Maikel Fritz\n\n'},
 {'pdfplumber': {'tables': 'Table 0 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz\n\nTable 1 content:\nId: 1, Name: Enrico, Surname: Goerlitz\nId: 2, Name: Teddy, Surname: Bear\nId: 3, Name: Maikel, Surname: Fritz',
   'text': 'Id Name Surname\n1 Enrico Goerlitz\n2 Teddy Bear\n3 Maikel Fritz\nId Name Surname\n1 Enrico Goerlitz\