In [14]:
import os
from PyPDF2 import PdfReader
import docx
import tiktoken

def limit_to_350_tokens(text, tokenizer, max_tokens=350):
    """Return *text* truncated to at most *max_tokens* tokens.

    The text is encoded with ``tokenizer.encode``, only the first
    ``max_tokens`` tokens are kept, and the kept tokens are converted back
    to a string with ``tokenizer.decode``.

    Args:
        text: The string to truncate.
        tokenizer: Any object exposing ``encode(str)`` returning a sliceable
            token sequence and ``decode(tokens)`` returning a string.
        max_tokens: Maximum number of tokens to keep. Defaults to 350, the
            limit this function has always applied.

    Returns:
        The decoded string for the (possibly shortened) token sequence.

    Raises:
        ValueError: If ``max_tokens`` is negative.
    """
    if max_tokens < 0:
        # A negative bound would make the slice below drop tokens from the
        # end instead of limiting the length, so reject it explicitly.
        raise ValueError(f"max_tokens must be non-negative, got {max_tokens}")

    tokens = tokenizer.encode(text)
    # Slicing is a no-op when the sequence is already short enough.
    return tokenizer.decode(tokens[:max_tokens])

def extract_pdf_text(file_path, tokenizer):
    """Return the leading text of a PDF, capped at 350 tokens.

    Pages are read in order and their extracted text is concatenated
    (without any separator) until the accumulated text reaches at least
    350 tokens or the document ends. The result is then passed through
    ``limit_to_350_tokens`` so the returned excerpt never exceeds the cap.

    Args:
        file_path: Path of the PDF file, handed to ``PdfReader``.
        tokenizer: Object exposing ``encode``; used to measure how many
            tokens have been collected so far.

    Returns:
        The extracted excerpt; an empty string when no page yields text.
    """
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if not page_text:
            # Nothing was added, so the token count cannot have changed;
            # skip the (comparatively expensive) re-tokenization.
            continue
        text += page_text
        # Stop reading further pages once enough text has been gathered.
        if len(tokenizer.encode(text)) >= 350:
            break
    return limit_to_350_tokens(text, tokenizer)

def extract_docx_text(file_path, tokenizer):
    """Return the text of a .docx file, capped at 350 tokens.

    The text of every entry in the document's ``paragraphs`` collection is
    joined with single spaces and the result is truncated by
    ``limit_to_350_tokens``.

    Args:
        file_path: Path of the .docx file, handed to ``docx.Document``.
        tokenizer: Tokenizer forwarded to ``limit_to_350_tokens``.

    Returns:
        The truncated excerpt as a string.
    """
    # NOTE(review): only `paragraphs` is read here; text stored elsewhere in
    # the document (presumably tables, headers, etc.) is not included - confirm.
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    combined = " ".join(paragraph_texts)
    return limit_to_350_tokens(combined, tokenizer)

def test_process_files_in_folder(folder_path, tokenizer):
    """Walk a folder tree and report how many PDF/DOCX files yield text.

    Every file under ``folder_path`` (recursively) whose extension is
    ``.pdf`` or ``.docx`` (case-insensitive) has an excerpt extracted with
    ``extract_pdf_text`` or ``extract_docx_text``. A file counts as "empty"
    when its excerpt is blank after stripping whitespace. Files with any
    other extension are only counted as skipped.

    The summary is printed to standard output; nothing is returned.
    "Accuracy" is the percentage of supported files with a non-empty
    excerpt, or 0 when no supported file was found.

    Args:
        folder_path: Root directory to scan.
        tokenizer: Tokenizer forwarded to the extraction helpers.
    """
    supported_extensions = ('.pdf', '.docx')
    supported_count = 0
    ignored_count = 0
    blank_paths = []

    for current_dir, _subdirs, names in os.walk(folder_path):
        for name in names:
            suffix = os.path.splitext(name)[1].lower()

            # Anything that is not a PDF or DOCX is merely tallied.
            if suffix not in supported_extensions:
                ignored_count += 1
                continue

            supported_count += 1
            full_path = os.path.join(current_dir, name)

            # Extract the excerpt in memory with the matching helper.
            if suffix == '.pdf':
                excerpt = extract_pdf_text(full_path, tokenizer)
            else:
                excerpt = extract_docx_text(full_path, tokenizer)

            # Remember the full path of files that produced no text.
            if not excerpt.strip():
                blank_paths.append(full_path)

    blank_count = len(blank_paths)
    if supported_count > 0:
        accuracy = ((supported_count - blank_count) / supported_count) * 100
    else:
        accuracy = 0

    print("Process Files Testing Report")
    print(f"Total .pdf and .docx files in folder: {supported_count}")
    print(f"Total empty .pdf and .docx files: {blank_count}")
    print(f"Accuracy: {accuracy:.2f}%")
    if blank_count > 0:
        print("List of empty files with full paths:")
        for blank_path in blank_paths:
            print(f"  - {blank_path}")
    print(f"Total skipped non-PDF/DOCX files: {ignored_count}")
    print("\nNote: Files that are not .pdf or .docx were skipped in processing and are excluded from accuracy checks.")

# Driver: obtain the "cl100k_base" encoding from tiktoken and run the
# extraction report over the folder below.
tokenizer = tiktoken.get_encoding("cl100k_base")
# Relative path, so it is resolved against the current working directory.
folder_path = "../hngr-isps"
# Prints the report to standard output.
test_process_files_in_folder(folder_path, tokenizer)

Process Files Testing Report
Total .pdf and .docx files in folder: 328
Total empty .pdf and .docx files: 53
Accuracy: 83.84%
List of empty files with full paths:
  - ../hngr-isps/ISP's for Josh Price/Lambrecht, Sarah--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh Price/Overgaard, Sara--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh Price/Baker, Shane--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh Price/Kim, John--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh Price/Herzog, Heidi--Final ISP.pdf
  - ../hngr-isps/2021 ISP's for Josh-Copy/Oh, Sarah-Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Fletcher, Sarah--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Roembach-Clark, Haley--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Smith, Lauren--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Wilcox, Mary Ann--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Phillips, Rebekah--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Trim, Betsy--Final ISP.pdf
  - ../hngr-isps/ISP's for Josh(2)/Ges