# Importing Libraries

In [1]:
from PyPDF2 import PdfReader
import os
from openai import OpenAI

# Function that:

1. Opens PDF
2. Finds the pages that contain the word "Budget"
3. Extracts the text and chunks it
4. Puts it in a .txt file

In [2]:
def extract_budget_related_chunks(pdf_path, keyword='Budget'):
    """
    Extract text chunks from the PDF pages that contain ``keyword``.

    Every page of the PDF is read with ``PdfReader``. Pages that yield no
    text, or whose text does not contain ``keyword`` (case-sensitive substring
    match), are skipped. The text of each remaining page is split on blank
    lines; every piece is stripped of surrounding whitespace and pieces that
    end up empty are dropped, so the result never contains blank chunks.

    Parameters
    ----------
    pdf_path
        Path of the PDF file; passed unchanged to ``PdfReader``.
    keyword : str, optional
        Text a page must contain to be included. Defaults to ``'Budget'``.

    Returns
    -------
    list of str
        Non-empty, stripped chunks in page order.

    Notes
    -----
    Exceptions raised while opening or reading the PDF are not caught here;
    the caller decides how to handle unreadable files.
    """
    chunks = []
    reader = PdfReader(pdf_path)
    for page in reader.pages:
        text = page.extract_text()
        # Skip pages without extractable text and pages that never mention the keyword.
        if not text or keyword not in text:
            continue
        # Simple chunking: paragraphs are separated by an empty line.
        for piece in text.split('\n\n'):
            piece = piece.strip()
            # Leading/trailing/consecutive blank lines would otherwise produce
            # empty chunks that only add noise to the output.
            if piece:
                chunks.append(piece)
    return chunks

# Folder that is scanned for PDFs and the text file that receives every chunk.
pdf_folder = '../Datasets/PDFs/'
output_file_path = 'budget_chunks.txt'

all_chunks = []

# Process each PDF. os.listdir() returns names in arbitrary order, so the names
# are sorted to make the output file reproducible across runs and machines.
for pdf_file in sorted(os.listdir(pdf_folder)):
    # Compare the extension case-insensitively so files named *.PDF are included.
    if pdf_file.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder, pdf_file)
        try:
            pdf_chunks = extract_budget_related_chunks(pdf_path)
            all_chunks.extend(pdf_chunks)
        except Exception as e:
            # Best effort: report the PDF that could not be read and continue
            # with the remaining files instead of aborting the whole run.
            print(f"Error processing {pdf_file}: {e}")

# Save all chunks to a single file, one chunk per paragraph.
with open(output_file_path, 'w', encoding='utf-8') as file:
    for chunk in all_chunks:
        file.write(chunk + '\n\n')  # Write each chunk and separate by empty lines

print("Done processing PDFs and extracting text chunks.")


Error processing 20230330_jobcenter_kreis_kleve_-_amip_2023.pdf: EOF marker not found
Error processing JCDA_AMIP2023_Website.pdf: EOF marker not found


Overwriting cache for 0 1437


Done processing PDFs and extracting text chunks.


# Data Augmentation with GPT-3.5

Generate 100 synthetic text samples with GPT-3.5. The prompt shows the model example passages taken from different chunks of the original PDFs (two examples in the prompt below) and asks it to write a new text with similar budget information.

In [None]:
# Number of synthetic texts to request from the model.
NUM_GENERATED_TEXTS = 100

# Prompt sent with every request: two real budget passages followed by the
# instruction to write a similar German text.
budget_prompt = """
    Bitte generiere einen Text, der auf den folgenden Beispielen basiert:

    Beispiel 1: Für das Jahr 2022 werden dem JC CB 9.976 TEUR Eingliederungsleistungen zur Verfügung stehen.

    Beispiel 2: Gemäß der aktuellen Schätzwerttabelle vom 18.10.2018 steigt das Budget für das Jobcenter Region Hannover für Verwaltungskosten auf 100,4 Mio. € um 12,6 % (Erhöhung um 11,2 Mio. €). Für die Eingliederungsleistungen erhöht sich das Budget um 16,9 Mio. € auf 97,5 Mio. € (Steigerung um 21 %).

    Generiere bitte einen weiteren Text, der ähnliche Budgetinformationen in einem deutschen Kontext beschreibt:
    """

# Never store the API key in the notebook: it would be saved with the file and
# leak as soon as the notebook is shared. Read it from the environment instead;
# a missing variable fails immediately with a KeyError.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"],
)

generated_texts = []  # List to store generated text contents

# One independent chat completion per sample; each request repeats the same prompt.
for _ in range(NUM_GENERATED_TEXTS):

    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": budget_prompt,
            }
        ],
        model="gpt-3.5-turbo",
    )
    generated_content = chat_completion.choices[0].message.content  # Get chat answer
    # The message content may be None; keep only real text so that later cells
    # can treat every entry as a string.
    if generated_content:
        generated_texts.append(generated_content)  # Add to the list

# Save the generated texts to a .txt file

In [None]:
# Write every generated text followed by a newline. The encoding is set
# explicitly because the texts are German (umlauts, the euro sign) and the
# platform default encoding is not guaranteed to represent them; this also
# matches how budget_chunks.txt is written.
with open('generated_texts.txt', 'w', encoding='utf-8') as f:
    for generated_text in generated_texts:
        f.write(generated_text)
        f.write('\n')