In [None]:
import PyPDF2
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from pdf2image import convert_from_path
import os
import pytesseract

def extract_texts_and_labels_from_directories(base_dir):
    """Collect training documents from a directory-per-label layout.

    Every immediate subdirectory of ``base_dir`` is treated as one class: the
    subdirectory name becomes the label, and each entry inside it whose name
    ends in ``.pdf`` (case-insensitive) contributes one document. Entries
    directly under ``base_dir`` that are not directories are ignored, as are
    non-PDF names; nested subdirectories are not descended into.

    Args:
        base_dir: Path of the directory that holds one subdirectory per label.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts[i]`` is
        whatever ``extract_text_from_pdf`` returned for the i-th PDF and
        ``labels[i]`` is the name of the subdirectory it was found in.
        Entries are ordered by label name, then by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``base_dir`` (or one of
            its subdirectories) cannot be listed.
    """
    texts = []
    labels = []
    # os.listdir yields entries in arbitrary, filesystem-dependent order;
    # sorting makes the returned corpus identical from run to run.
    for label in sorted(os.listdir(base_dir)):
        label_dir = os.path.join(base_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for pdf_file in sorted(os.listdir(label_dir)):
            if not pdf_file.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(label_dir, pdf_file)
            texts.append(extract_text_from_pdf(pdf_path))
            labels.append(label)
    return texts, labels

# Root of the labelled training set: one subdirectory per document type, each
# holding sample PDFs. Set the PDF_SAMPLES_DIR environment variable to point
# the notebook at another location; the original path remains the default.
base_dir = os.environ.get('PDF_SAMPLES_DIR', '/Users/Guy/repo/Test-Code/PDFSamples')

# The training data is loaded further down, once extract_text_from_pdf is defined.


def extract_text_from_pdf(pdf_path):
    """Return the text of a PDF, falling back to OCR when no text layer is found.

    The embedded text layer is read first with PyPDF2. If that yields nothing
    but whitespace -- either because the PDF is image-only or because PyPDF2
    raised while reading it -- every page is rendered to an image with
    pdf2image and passed through pytesseract.

    Failures in either stage are reported with ``print`` and never propagate,
    so a single unreadable file does not abort a batch run.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The extracted text. This is an empty string when both stages fail or
        find nothing.
    """
    page_texts = []
    try:
        # Try extracting text using PyPDF2 (works for text-based PDFs)
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
    except Exception as e:
        # Handled separately from the OCR stage so that a PyPDF2 failure
        # still leaves the OCR fallback below a chance to run.
        print(f"Error processing {pdf_path}: {e}")

    # Join pages with a newline so the last word of one page is not glued to
    # the first word of the next.
    text = "\n".join(page_texts)

    # If no text was found, try OCR
    if not text.strip():
        ocr_texts = []
        try:
            for image in convert_from_path(pdf_path):
                ocr_texts.append(pytesseract.image_to_string(image))
        except Exception as e:
            print(f"Error processing {pdf_path}: {e}")
        # Pages recognised before a failure are kept.
        text += "".join(ocr_texts)
    return text

def predict_pdf_type(pdf_path):
    """Classify a single PDF and return its predicted document type.

    The PDF's text is obtained via ``extract_text_from_pdf``, converted to a
    one-row feature matrix by the module-level ``vectorizer``, and scored by
    the module-level ``model``.

    Args:
        pdf_path: Filesystem path of the PDF to classify.

    Returns:
        The first (and only) entry of ``model.predict`` for this document.
    """
    document = extract_text_from_pdf(pdf_path)
    # transform() expects a collection of documents, hence the one-item list.
    predictions = model.predict(vectorizer.transform([document]))
    return predictions[0]

# Build the training corpus: one extracted text and one directory-name label
# per sample PDF found under base_dir.
texts, labels = extract_texts_and_labels_from_directories(base_dir)

# Vectorize the text data: fit a TF-IDF vocabulary on the training texts and
# turn them into the feature matrix X. The fitted vectorizer stays at module
# level because predict_pdf_type reads it to transform new documents.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Train a classifier: multinomial naive Bayes over the TF-IDF features, with
# the label-directory names as targets. predict_pdf_type reads this global too.
model = MultinomialNB()
model.fit(X, labels)

# Save the model and vectorizer. The paths are relative, so both files are
# written to the current working directory. The classifier was trained on this
# vectorizer's features, so the two artifacts belong together.
joblib.dump(model, 'pdf_type_classifier.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [11]:
# Classify every PDF found directly inside the prediction directory and print
# one line per file. Set the PDF_PREDICT_DIR environment variable to scan a
# different location; the original path remains the default.
predict_dir = os.environ.get('PDF_PREDICT_DIR', '/Users/Guy/repo/Test-Code/PDFPredict')
if os.path.isdir(predict_dir):
    pdf_files = [f for f in os.listdir(predict_dir) if f.lower().endswith('.pdf')]
    for pdf_file in pdf_files:
        pdf_path = os.path.join(predict_dir, pdf_file)
        file_type = predict_pdf_type(pdf_path)
        print(f"{pdf_file} is predicted as: {file_type}")
else:
    # Say so explicitly instead of finishing silently with no output.
    print(f"Prediction directory not found: {predict_dir}")

Beneficiary Change Sample.pdf is predicted as: Beneficiary Change Request
Disabiltiy Comp Sample.pdf is predicted as: Disability Compensation
Appeals Sample.pdf is predicted as: Appeals Notification
