In [None]:
import PyPDF2
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the trained classifier and the fitted TF-IDF vectorizer from disk.
# Both files are written by the training cell of this notebook (its joblib.dump
# calls), so that cell must have been run at least once before this one.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load .pkl files
# that you created yourself or otherwise trust.
model = joblib.load('pdf_type_classifier.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of a PDF, one page per line group.

    Pages are joined with a newline so that the last word of one page and
    the first word of the next page stay separate tokens (plain
    concatenation fused them into a single bogus word).

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        str: The extracted page texts in page order, separated by a newline.
        Pages for which ``extract_text()`` yields nothing are skipped, so a
        document without any extractable text gives an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Extraction stays inside the `with` block, while the file is open.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    # Drop empty pages so they do not contribute blank separator lines.
    return "\n".join(text for text in page_texts if text)

def predict_pdf_type(pdf_path):
    """Classify a PDF file and return its predicted document type.

    The PDF's text is extracted with ``extract_text_from_pdf``, converted to
    features by the notebook-level ``vectorizer`` and passed to the
    notebook-level ``model``.

    Args:
        pdf_path: Path of the PDF file to classify.

    Returns:
        The first (and only) element of ``model.predict`` for this document.
    """
    document_text = extract_text_from_pdf(pdf_path)
    # The vectorizer takes a collection of documents, hence the one-item list.
    predictions = model.predict(vectorizer.transform([document_text]))
    return predictions[0]

# Example usage:
# pdf_path = 'sample.pdf'
# print("Predicted document type:", predict_pdf_type(pdf_path))

In [1]:
# This is a script framework for training a model that classifies PDF documents.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib

# Toy training corpus of (document text, document type) pairs.
# Replace it with text extracted from your own PDFs and their labels.
training_examples = [
    ("Invoice for services rendered in June 2023...", "invoice"),
    ("This is a research paper about machine learning...", "research_paper"),
    ("Your bank statement for May 2023...", "bank_statement"),
    ("Meeting agenda for project kickoff...", "meeting_agenda"),
]
texts = [text for text, _ in training_examples]
labels = [label for _, label in training_examples]

# Fit the TF-IDF vectorizer on the corpus and build the feature matrix.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train a multinomial naive Bayes classifier on the TF-IDF features
# (fit() returns the fitted estimator itself).
model = MultinomialNB().fit(X, labels)

# Persist the classifier and the vectorizer for the prediction cell to load.
joblib.dump(model, 'pdf_type_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
import os

def extract_texts_and_labels_from_directories(base_dir):
    """Build a labelled text corpus from a directory tree of PDFs.

    Every sub-directory of ``base_dir`` is treated as one class: its name is
    the label, and each ``.pdf`` file directly inside it (extension matched
    case-insensitively) is one sample. Entries of ``base_dir`` that are not
    directories are ignored, and so are non-file entries whose name happens
    to end in ``.pdf``.

    Args:
        base_dir: Directory whose immediate sub-directories are named after
            the document types.

    Returns:
        tuple[list, list]: ``(texts, labels)`` of equal length, where
        ``texts[i]`` is the result of ``extract_text_from_pdf`` for a file
        and ``labels[i]`` is the name of the sub-directory it was found in.
        Samples are ordered by label, then by file name.
    """
    texts = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the corpus
    # order reproducible from run to run.
    for label in sorted(os.listdir(base_dir)):
        label_dir = os.path.join(base_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for entry in sorted(os.listdir(label_dir)):
            if not entry.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(label_dir, entry)
            # A directory named "something.pdf" would make open() fail.
            if not os.path.isfile(pdf_path):
                continue
            texts.append(extract_text_from_pdf(pdf_path))
            labels.append(label)
    return texts, labels

# Example usage:
# base_dir = 'path/to/base_directory'
# texts, labels = extract_texts_and_labels_from_directories(base_dir)

In [2]:
pip install PyPDF2 scikit-learn joblib

Collecting PyPDF2
  Obtaining dependency information for PyPDF2 from https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl.metadata
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.
