diff --git a/app/pdf_processor.py b/app/pdf_processor.py index d9833e0..7eaacb1 100644 --- a/app/pdf_processor.py +++ b/app/pdf_processor.py @@ -4,9 +4,9 @@ import pytesseract from pdf2image import convert_from_path from PIL import Image +from rapidfuzz import fuzz, process # For fuzzy matching - -def extract_text_from_pdf(file_path, pattern): +def extract_text_from_pdf(file_path, patterns): try: pdf_file = fitz.open(file_path) trans_numbers = [] @@ -15,17 +15,45 @@ def extract_text_from_pdf(file_path, pattern): # Convert each PDF page to an image images = convert_from_path(file_path, dpi=300) # High DPI for better OCR + # for number, image in enumerate(images): + # # Convert image to text using Tesseract OCR + # text = pytesseract.image_to_string(image) + + # for pattern in patterns: + # regex = rf'{re.escape(pattern)}\s+(\d+)' + # matches = re.findall(regex, text) + + # if matches: + # trans_numbers.append(matches[0]) + # page_numbers.append(number) + # break # Stop checking once a pattern matches on a page + for number, image in enumerate(images): # Convert image to text using Tesseract OCR text = pytesseract.image_to_string(image) - # Search for pattern in extracted text - regex = rf'{re.escape(pattern)}\s+(\d+)' - matches = re.findall(regex, text) + for pattern in patterns: + # Use fuzzy matching to find a close match + best_match, score, _ = process.extractOne(pattern, text.split("\n"), scorer=fuzz.partial_ratio) + + if score > 80: # If similarity score is high + print(1) + print(2) + print(best_match) + match = re.findall('\d{3,}', best_match) # Match 3 or more digits + print(match) + if match: + trans_numbers.append(match[0]) + page_numbers.append(number) + break # Stop checking once a match is found + + # # Search for pattern in extracted text + # regex = rf'{re.escape(pattern)}\s+(\d+)' + # matches = re.findall(regex, text) - if matches: - trans_numbers.append(matches[0]) - page_numbers.append(number) + # if matches: + # trans_numbers.append(matches[0]) + # page_numbers.append(number) if page_numbers and trans_numbers: saved_files = [] @@ -40,7 +68,7 @@ def extract_text_from_pdf(file_path, pattern): return {"message": "Files saved successfully", "files": saved_files} else: - return {"error": f"No '{pattern}' number found in the document"} + return {"error": f"No matches found in the document"} except Exception as e: return {"error": f"An error occurred during PDF processing: {str(e)}"} diff --git a/app/routes.py b/app/routes.py index 4b73e8f..8f159ce 100644 --- a/app/routes.py +++ b/app/routes.py @@ -20,7 +20,8 @@ def index(): def upload_file(): try: file = request.files["file"] - pattern = request.form.get("pattern", r'Trans\s+(\d+)') + patterns = request.form.getlist("patterns") or ["Outbound delivery", "ABC", "AMS Outb Delivery", "ReplDLv HC w Trans", "Cash Sale"] # Default patterns + # pattern = request.form.get("pattern", r'Trans\s+(\d+)') download_format = request.form.get("download_format", "pdf") if not file or file.filename == "": @@ -33,7 +34,7 @@ def upload_file(): file_path = os.path.join(UPLOAD_FOLDER, f"{timestamp}_{file.filename}") file.save(file_path) - result = extract_text_from_pdf(file_path, pattern) + result = extract_text_from_pdf(file_path, patterns) if "error" in result: return jsonify({"error": result["error"]}), 400 diff --git a/templates/upload_file.html b/templates/upload_file.html index 9a1514f..86e03ac 100644 --- a/templates/upload_file.html +++ b/templates/upload_file.html @@ -36,10 +36,10 @@

PDF text Extraction

-
+