Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions app/pdf_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from rapidfuzz import fuzz, process # For fuzzy matching


def extract_text_from_pdf(file_path, pattern):
def extract_text_from_pdf(file_path, patterns):
try:
pdf_file = fitz.open(file_path)
trans_numbers = []
Expand All @@ -15,17 +15,45 @@ def extract_text_from_pdf(file_path, pattern):
# Convert each PDF page to an image
images = convert_from_path(file_path, dpi=300) # High DPI for better OCR

# for number, image in enumerate(images):
# # Convert image to text using Tesseract OCR
# text = pytesseract.image_to_string(image)

# for pattern in patterns:
# regex = rf'{re.escape(pattern)}\s+(\d+)'
# matches = re.findall(regex, text)

# if matches:
# trans_numbers.append(matches[0])
# page_numbers.append(number)
# break # Stop checking once a pattern matches on a page

for number, image in enumerate(images):
# Convert image to text using Tesseract OCR
text = pytesseract.image_to_string(image)

# Search for pattern in extracted text
regex = rf'{re.escape(pattern)}\s+(\d+)'
matches = re.findall(regex, text)
for pattern in patterns:
# Use fuzzy matching to find a close match
best_match, score, _ = process.extractOne(pattern, text.split("\n"), scorer=fuzz.partial_ratio)

if score > 80: # If similarity score is high
print(1)
print(2)
print(best_match)
match = re.findall('\d{3,}', best_match) # Match 3 or more digits
print(match)
if match:
trans_numbers.append(match[0])
page_numbers.append(number)
break # Stop checking once a match is found

# # Search for pattern in extracted text
# regex = rf'{re.escape(pattern)}\s+(\d+)'
# matches = re.findall(regex, text)

if matches:
trans_numbers.append(matches[0])
page_numbers.append(number)
# if matches:
# trans_numbers.append(matches[0])
# page_numbers.append(number)

if page_numbers and trans_numbers:
saved_files = []
Expand All @@ -40,7 +68,7 @@ def extract_text_from_pdf(file_path, pattern):

return {"message": "Files saved successfully", "files": saved_files}
else:
return {"error": f"No '{pattern}' number found in the document"}
return {"error": f"No matches found in the document"}

except Exception as e:
return {"error": f"An error occurred during PDF processing: {str(e)}"}
5 changes: 3 additions & 2 deletions app/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def index():
def upload_file():
try:
file = request.files["file"]
pattern = request.form.get("pattern", r'Trans\s+(\d+)')
patterns = request.form.getlist("patterns") or ["Outbound delivery", "ABC", "AMS Outb Delivery", "ReplDLv HC w Trans", "Cash Sale"] # Default patterns
# pattern = request.form.get("pattern", r'Trans\s+(\d+)')
download_format = request.form.get("download_format", "pdf")

if not file or file.filename == "":
Expand All @@ -33,7 +34,7 @@ def upload_file():
file_path = os.path.join(UPLOAD_FOLDER, f"{timestamp}_{file.filename}")
file.save(file_path)

result = extract_text_from_pdf(file_path, pattern)
result = extract_text_from_pdf(file_path, patterns)

if "error" in result:
return jsonify({"error": result["error"]}), 400
Expand Down
4 changes: 2 additions & 2 deletions templates/upload_file.html
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@ <h2 class="text-center mb-4">PDF text Extraction</h2>
<input type="file" class="form-control" name="file" multiple required>
</div>

<div class="mb-4">
<!-- <div class="mb-4">
<label for="pattern" class="form-label">Search Pattern:</label>
<input type="text" class="form-control" name="pattern" id="pattern" placeholder="Type your pattern" value="Trans" required>
</div>
</div> -->

<!-- dropdown for download format -->
<div class="mb-4">
Expand Down