# In [None]:
#Pre process the new report
import fitz
import re
import os

def extract_sections_and_sentences(pdf_path, start_phrases, end_phrases, keywords):
    """Extract marked sections and keyword-bearing sentences from a PDF.

    The text of every page is split on line breaks, so each "paragraph"
    handled below is a single line of page text. Two things are collected:

    * Sections: a section opens on a line containing any of
      ``start_phrases`` and closes on the next line containing any of
      ``end_phrases`` (that closing line is included). A line containing
      both opens and closes a one-line section.
    * Sentences: every sentence (split after ``.``, ``!`` or ``?`` followed
      by whitespace) that contains any of ``keywords``.

    All matching is case-insensitive substring matching.

    Args:
        pdf_path: Path of the PDF file to read.
        start_phrases: Phrases that open a section.
        end_phrases: Phrases that close the currently open section.
        keywords: Keywords that select individual sentences.

    Returns:
        A single string: all sections followed by all selected sentences,
        joined with blank lines (``"\\n\\n"``).
    """
    # Lower-case the search terms once instead of once per line.
    start_markers = [phrase.lower() for phrase in start_phrases]
    end_markers = [phrase.lower() for phrase in end_phrases]
    keyword_markers = [keyword.lower() for keyword in keywords]

    sections = []
    sentences = []

    # Section state lives outside the page loop so that a section which
    # continues onto the next page is not cut off at the page break.
    section_lines = []
    in_section = False

    pdf_document = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_document.page_count):
            page_text = pdf_document.load_page(page_num).get_text()

            for paragraph in re.split(r'\n|\r\n', page_text):
                lowered = paragraph.lower()

                if in_section:
                    # A start phrase inside an open section is ordinary
                    # content; restarting here would discard what has been
                    # collected so far.
                    section_lines.append(paragraph)
                elif any(marker in lowered for marker in start_markers):
                    # The opening line is stored exactly once.
                    section_lines = [paragraph]
                    in_section = True

                # An end phrase only matters while a section is open;
                # otherwise it would add a stray one-line "section".
                if in_section and any(marker in lowered for marker in end_markers):
                    sections.append("\n".join(section_lines))
                    section_lines = []
                    in_section = False

                # NOTE(review): keywords are matched as substrings, so a
                # short keyword such as "he" also matches inside "the".
                # Left unchanged on purpose; confirm whether whole-word
                # matching is wanted before tightening it.
                for sentence in re.split(r'(?<=[.!?])\s', paragraph):
                    lowered_sentence = sentence.lower()
                    if any(marker in lowered_sentence for marker in keyword_markers):
                        sentences.append(sentence)
    finally:
        # Release the file handle even if text extraction fails part-way.
        pdf_document.close()

    # Keep a section that was still open when the document ended.
    if in_section:
        sections.append("\n".join(section_lines))

    # Combine sections and sentences
    return "\n\n".join(sections + sentences)

def process_pdfs_in_folder(folder_path, start_phrases, end_phrases, keywords, output_folder):
    """Run section/sentence extraction on every PDF in a folder.

    For each PDF file directly inside ``folder_path`` the extracted text is
    written, UTF-8 encoded, to ``output_folder`` under the same base name
    with a ``.txt`` extension. ``output_folder`` is created if missing.

    Args:
        folder_path: Folder containing the PDF files.
        start_phrases: Passed through to ``extract_sections_and_sentences``.
        end_phrases: Passed through to ``extract_sections_and_sentences``.
        keywords: Passed through to ``extract_sections_and_sentences``.
        output_folder: Folder that receives the ``.txt`` files.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    # Case-insensitive extension test so "REPORT.PDF" is not skipped;
    # sorted so the processing order does not depend on the file system.
    pdf_files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(folder_path, name))
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        # splitext swaps only the final extension; str.replace would also
        # rewrite any ".pdf" occurring earlier in the file name.
        base_name, _ = os.path.splitext(pdf_file)
        output_file = os.path.join(output_folder, base_name + '.txt')

        combined_text = extract_sections_and_sentences(pdf_path, start_phrases, end_phrases, keywords)

        # Save the combined text to the output file
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(combined_text)

        print("Sections and sentences extracted and saved to", output_file)

# Action: extract text from every PDF in 'New Report' into
# 'New Report Pre Processed'.
folder_path = 'New Report'
# A line containing one of these (case-insensitive substring) opens a section.
start_phrases = ["recommendation", "lessons learned", "advice to planning authorities"]
# A line containing one of these (case-insensitive substring) closes a section.
end_phrases = ["reference", "appendix", "annex", "list of", "conclusion", "bibliography", "works cited",
               "introduction", "board member statements", "executive summary", "abbreviations and acronyms"]
# Sentences containing any of these are kept. NOTE(review): matching is a
# case-insensitive substring test, so "he" also matches "the" and "I" matches
# any sentence containing the letter "i" -- confirm this is intended.
keywords = ["he", "she", "they", "I", "user", "operator", "manager", "management", "team", "lead", "leader",
            "inspector", "mechanic", "engineer", "driver", "pilot", "crew", "worker", "contractor", "operative"]
output_folder = 'New Report Pre Processed'
process_pdfs_in_folder(folder_path, start_phrases, end_phrases, keywords, output_folder)
import nltk
from nltk.corpus import stopwords
import string
import os

# Download the NLTK stopwords corpus; clean_text reads its English word list
# through stopwords.words('english').
nltk.download('stopwords')

def clean_text(text):
    """Normalise text for classification.

    The text is lower-cased, every character in ``string.punctuation`` is
    deleted, the result is split on whitespace, and tokens found in the
    NLTK English stopword list are dropped.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Deleting punctuation (rather than replacing it with a space) fuses
    # the characters on either side, e.g. "don't" becomes "dont".
    punctuation_table = str.maketrans('', '', string.punctuation)
    tokens = text.lower().translate(punctuation_table).split()

    english_stopwords = set(stopwords.words('english'))
    return ' '.join(token for token in tokens if token not in english_stopwords)

def clean_text_files_in_folder(input_folder, output_folder):
    """Clean every ``.txt`` file of one folder into another folder.

    Each file directly inside ``input_folder`` whose name ends in ``.txt``
    is read as UTF-8, passed through ``clean_text``, and written as UTF-8
    to ``output_folder`` under the same file name. ``output_folder`` is
    created first if it does not exist.

    Args:
        input_folder: Folder holding the text files to clean.
        output_folder: Folder that receives the cleaned files.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    for entry in os.listdir(input_folder):
        if not entry.endswith('.txt'):
            continue

        source_path = os.path.join(input_folder, entry)
        output_file_path = os.path.join(output_folder, entry)

        with open(source_path, 'r', encoding='utf-8') as source:
            raw_text = source.read()

        # Clean before opening the target so a failure in clean_text does
        # not leave a truncated output file behind.
        cleaned = clean_text(raw_text)

        with open(output_file_path, 'w', encoding='utf-8') as target:
            target.write(cleaned)

        print(f"Text cleaned and saved to {output_file_path}")

# Action: clean every .txt file produced by the extraction step and write the
# results, under the same file names, to 'New Report Cleaned'.
input_folder = 'New Report Pre Processed'
output_folder = 'New Report Cleaned'

clean_text_files_in_folder(input_folder, output_folder)

#Begin classification

import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm

# Folder whose sub-directories each hold one fine-tuned classification model.
models_folder = "Models"

# Locate the cleaned report text. Only one report is classified per run: the
# first ".txt" file, in sorted name order so the choice is reproducible.
text_folder = "New Report Cleaned"
text_file_path = None
for filename in sorted(os.listdir(text_folder)):
    if filename.endswith(".txt"):
        text_file_path = os.path.join(text_folder, filename)
        break
if text_file_path is None:
    # Fail with a clear message instead of the TypeError from open(None).
    raise FileNotFoundError(f"No .txt file found in '{text_folder}'")
with open(text_file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Tokenize once; the same encoded input is fed to every model.
# NOTE(review): truncation=True with max_length=512 means only the beginning
# of the report reaches the models -- confirm this is acceptable.
# NOTE(review): the "bert-base-uncased" tokenizer is used for all models;
# confirm every model in the models folder was fine-tuned with it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# One entry per model: its name, predicted class index and class probabilities.
predictions = []

# Iterate through models in sorted order so the output file is reproducible.
for model_name in sorted(os.listdir(models_folder)):
    model_path = os.path.join(models_folder, model_name)
    # Anything that is not a directory cannot be a saved model; skip it.
    if not os.path.isdir(model_path):
        continue

    # Load the fine-tuned BERT model
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.softmax(outputs.logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()

    predictions.append({
        "model_name": model_name,
        "predicted_class": predicted_class,
        "class_probabilities": probabilities.tolist()[0]
    })

# Define the output file path
output_file_path = "Predictions/Predictions.txt"

# Create the target directory if needed so the write below cannot fail on a
# missing "Predictions" folder.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Write the model name and predicted class for each model; the class
# probabilities are kept in `predictions` but are not written to the file.
with open(output_file_path, "w", encoding="utf-8") as output_file:
    for prediction in predictions:
        output_file.write(f"Model Name: {prediction['model_name']}\n")
        output_file.write(f"Predicted Class: {prediction['predicted_class']}\n")

print(f"Predictions saved to {output_file_path}")  # original author's note: 194 seconds







# Recorded cell output:
# Sections and sentences extracted and saved to New Report Pre Processed\v2_2023_Relatório de Investigação de Incidente - P-19 (versão para revisão da atual GESTÂO) (1).txt
# Text cleaned and saved to New Report Cleaned\v2_2023_Relatório de Investigação de Incidente - P-19 (versão para revisão da atual GESTÂO) (1).txt
#
# [nltk_data] Downloading package stopwords to
# [nltk_data]     C:\Users\ypb20167\AppData\Roaming\nltk_data...
# [nltk_data]   Package stopwords is already up-to-date!
#
# Predictions saved to Predictions/Predictions.txt
