In [6]:
import os
import json
import spacy
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Shared spaCy pipeline (small English model) used for Named Entity Recognition.
nlp = spacy.load("en_core_web_sm")

# Labels the document classifier is meant to tell apart.
document_categories = ["invoice", "contract", "report", "receipt"]

# Toy training set. Each example sentence is written next to its label so the
# two parallel lists derived below always stay aligned.
_training_examples = [
    ("Invoice number 12345 from Vendor XYZ for amount $500", "invoice"),
    ("This contract between Company A and Company B is valid for 2 years", "contract"),
    ("The monthly financial report shows a profit of $10,000", "report"),
    ("Receipt of payment for order #789, total $100", "receipt"),
]
training_texts = [sentence for sentence, _ in _training_examples]
training_labels = [label for _, label in _training_examples]

# TF-IDF features feeding a multinomial Naive Bayes classifier; the pipeline
# is fitted once, when this cell runs.
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)
model.fit(training_texts, training_labels)

def extract_text_from_pdf(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    Each page is rendered to an image with ``convert_from_path`` and handed to
    ``pytesseract.image_to_string``. Page texts are separated by a newline and
    leading/trailing whitespace is stripped from the final string.
    """
    page_images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(page) for page in page_images]
    return "\n".join(page_texts).strip()

def extract_named_entities(text, all_mentions=False):
    """Extract named entities from text using spaCy.

    Args:
        text: Text to analyse. Empty or ``None`` input yields an empty dict
            without running the spaCy pipeline.
        all_mentions: When ``False`` (the default, matching the original
            behaviour) each label maps to a single string; if a label occurs
            several times only its last mention survives. When ``True`` each
            label maps to a list of every mention, in document order, so no
            entity is dropped.

    Returns:
        dict keyed by entity label (``ent.label_``). Values are ``str`` when
        ``all_mentions`` is false and ``list[str]`` when it is true.
    """
    if not text:
        return {}

    doc = nlp(text)

    if not all_mentions:
        # Same label seen again overwrites the earlier mention (last one wins).
        return {ent.label_: ent.text for ent in doc.ents}

    grouped = {}
    for ent in doc.ents:
        grouped.setdefault(ent.label_, []).append(ent.text)
    return grouped

def classify_document(text):
    """Return the category the trained ``model`` predicts for ``text``.

    The text is wrapped in a one-element list because ``model.predict`` is
    called on a batch; the single prediction is then unwrapped.
    """
    predictions = model.predict([text])
    return predictions[0]

# File extensions routed to image OCR (matched case-insensitively).
_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff")


def process_document(file_path):
    """Process document: extract text, identify entities, and classify.

    The extraction strategy is chosen from the file extension:

    * ``.pdf`` -> ``extract_text_from_pdf``
    * an extension in ``_IMAGE_EXTENSIONS`` -> OCR on the opened image
    * anything else -> read as a UTF-8 text file

    Args:
        file_path: Path to the document, as ``str`` or any ``os.PathLike``.

    Returns:
        dict with the keys ``"file"`` (the path as a string),
        ``"extracted_text"``, ``"named_entities"`` and ``"document_category"``.
    """
    # Normalise PathLike objects so the suffix checks and the JSON-friendly
    # "file" entry both work on a plain string.
    path = os.fspath(file_path)
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        text = extract_text_from_pdf(path)
    elif lowered.endswith(_IMAGE_EXTENSIONS):
        # Context manager releases the underlying file handle once OCR is done.
        with Image.open(path) as image:
            text = pytesseract.image_to_string(image)
    else:
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

    return {
        "file": path,
        "extracted_text": text,
        "named_entities": extract_named_entities(text),
        "document_category": classify_document(text),
    }

# Demo run: push one document through the whole pipeline and pretty-print
# the resulting dict as JSON.
file_path = "sample_documents.pdf"  # Replace with your document
result = process_document(file_path)
result_json = json.dumps(result, indent=4)
print(result_json)


{
    "file": "sample_documents.pdf",
    "extracted_text": "Invoice Sample\n\nInvoice No: 12345\nDate: 2024-02-01\nVendor: ABC Supplies\nAmount: $1,250.00\n\nDue Date: 2024-02-15\n\nReceipt Sample\n\nStore: XYZ Retail\nDate: 2024-02-10\nTotal: $50.99\n\nPayment Method: Credit Card\n\nContract Sample\n\nThis contract is made between Company A and Company B.\n\nThe agreement is valid for a period of 3 years.\n\nSigned on 2024-01-15.",
    "named_entities": {
        "ORG": "Company A and Company B.",
        "DATE": "2024-01-15",
        "MONEY": "50.99",
        "PERSON": "XYZ Retail\nDate"
    },
    "document_category": "contract"
}


In [5]:
# System package: installs the Poppler command-line utilities.
# NOTE(review): presumably needed by pdf2image's convert_from_path used in the
# pipeline cell — confirm against the pdf2image documentation.
!apt-get install -y poppler-utils


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 21 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.6 [186 kB]
Fetched 186 kB in 1s (266 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 124973 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.6_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.6) ...
Setting up poppler-utils (22.02.0-2ubuntu0.6) ...
Processing triggers for man-db (2.10.2-1) ...


In [2]:
!pip install pytesseract pdf2image spacy scikit-learn pillow
!python -m spacy download en_core_web_sm


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pytesseract, pdf2image
Successfully installed pdf2image-1.17.0 pytesseract-0.3.13
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by s

In [3]:
# System package: installs the Tesseract OCR engine (apt also pulls in the
# English and OSD language data, per the output below).
# NOTE(review): presumably the binary that pytesseract invokes — confirm.
!apt-get install -y tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 21 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (3,503 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-