# Key Words Counter

* It counts how many times each keyword from a given list appears in every PDF report in a folder.

* It reads text using PyMuPDF, which performs better than OCR on PDF documents.

In [1]:
import re
import os
import fitz  # PyMuPDF
import pandas as pd

# Directory that holds the PDF reports to scan
folder_path = r"C:\Users\ismet\OneDrive - University of Strathclyde\Desktop\TechPro\3 - DL\Ismet\13 - CV - OCR\4 - BRUI\Reports"

# Uncertainty vocabulary: each entry is matched as an exact word form
# (no lemmatization), so inflected variants are listed separately.
uncertainty_related_words = [
    "fear",
    "indecision",
    "instability",
    "jittery",
    "nervousness",
    "precarious",
    "tense",
    "tension",
    "uncertain",
    "uncertainly",
    "uncertainty",
    "unclear",
    "unknown",
    "unpredictable",
    "unsettled",
    "unstable",
    "volatile",
    "volatility",
    "worry",
]

# One dict per processed PDF is appended here
results_list = []

# Build one whole-word pattern per keyword up front so the regexes are not
# rebuilt for every PDF. re.escape keeps any punctuation inside a keyword
# (e.g. '-') literal.
keyword_patterns = {
    keyword: re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
    for keyword in uncertainty_related_words
}

# Iterate through files in the folder; sorting makes the order of the output
# rows reproducible, since os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    # Compare the extension case-insensitively so "REPORT.PDF" is not skipped
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(folder_path, filename)

    # Pull the text of every page, then close the document before analysing it
    with fitz.open(file_path) as pdf_file:
        text = "".join(page.get_text() for page in pdf_file)

    # Preprocess text for cleaner analysis
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs (newlines included) into one space
    text = ''.join(ch for ch in text if ch.isprintable())  # Remove non-printable characters
    text = text.lower()  # Convert entire text to lowercase

    # Count whole-word occurrences of each uncertainty term in this document
    keyword_counts = {
        keyword: len(pattern.findall(text))
        for keyword, pattern in keyword_patterns.items()
    }

    # One row per PDF: the file name followed by the individual keyword counts
    results_list.append({"File": filename, **keyword_counts})

# Convert results list to DataFrame. The columns are stated explicitly
# ("File" first, then one column per keyword) so the sheet still gets its
# header row when no PDF was processed and results_list is empty.
results = pd.DataFrame(results_list, columns=["File", *uncertainty_related_words])

# Get current working directory
current_dir = os.getcwd()

# Construct output file path
excel_file_path = os.path.join(current_dir, "Keyword_counts.xlsx")

# Save results to Excel file (index=False leaves out the DataFrame row index)
results.to_excel(excel_file_path, index=False)

print("Excel file successfully created:", excel_file_path)


Excel file successfully created: C:\Users\ismet\OneDrive - University of Strathclyde\Desktop\TechPro\3 - DL\Ismet\13 - CV - OCR\4 - BRUI\Keyword_counts.xlsx
