In [None]:
import json
import spacy
from transformers import pipeline

# Input/output locations used by this cell.
CHUNKS_PATH = "chunks.json"
DATASET_PATH = "output_dataset_6.json"

# Labels that are followed by a newline in the raw text. Their trailing ":"
# would make them look like section headers, so label and value are joined
# onto one line as "<label> - <value>" before the text is split.
INLINE_LABEL_REWRITES = (
    ("Einstiegsgehalt lt. KV:\n", "Einstiegsgehalt lt. KV - "),
    ("BMS:\n", "BMS - "),
    ("BHS:\n", "BHS - "),
)


def extract_combined_sections(text):
    """Split ``text`` into lines and pair each header line with the line after it.

    A header is a non-empty, stripped line that ends with ":". Each header is
    joined with the next non-empty line as ``"<header> <next line>"``.

    Args:
        text: Raw text of one entry; ``None`` or ``""`` yields an empty list.

    Returns:
        List of combined section strings, in document order.
    """
    if not text:
        return []

    for old, new in INLINE_LABEL_REWRITES:
        text = text.replace(old, new)

    # Plain newline split (no spaCy tokenization involved); blank lines are dropped.
    sections = [line.strip() for line in text.split("\n") if line.strip()]

    combined_sections = []
    pending = ""
    for section in sections:
        if pending.endswith(":"):
            # The previous line was a header: attach this line to it.
            combined_sections.append(pending + " " + section)
            pending = ""
        else:
            # NOTE(review): a pending line that is not a header is discarded here,
            # and so is the final line if nothing follows it. Only
            # "header + next line" pairs survive - confirm this is intended.
            pending = section
    return combined_sections


def make_question(profession, combined_section):
    """Return the German question that matches the content of ``combined_section``.

    The checks form a single first-match-wins chain. Previously the
    "Ausbildungsform:" check was a separate ``if`` ahead of the chain, so its
    question was always overwritten by one of the later branches.

    Args:
        profession: Profession name interpolated into the question.
        combined_section: One string produced by ``extract_combined_sections``.

    Returns:
        The question string.
    """
    if "Ausbildungsform:" in combined_section:
        return f"Was ist die Ausbildungsform für einen/eine {profession}?"
    if "Gehalt:" in combined_section:
        return f"Was ist das Gehalt eines/einer {profession}?"
    if "Berufsbereiche:" in combined_section:
        return f"In welchen Berufsbereichen arbeitet ein/eine {profession}?"
    if combined_section.startswith("*"):
        return f"Was sind besondere Aspekte im Zusammenhang mit einem/einer {profession}?"
    # No specific pattern detected: fall back to a generic question.
    return f"Welche zusätzlichen Infos gibt es zu einer/einem {profession}?"


def build_qa_pairs(profession, text):
    """Build ``{"question", "answer"}`` dicts for one profession entry.

    Args:
        profession: Profession name used in the generated questions.
        text: Raw text of the entry.

    Returns:
        One dict per combined section, in document order.
    """
    return [
        {"question": make_question(profession, section), "answer": section}
        for section in extract_combined_sections(text)
    ]


# In a notebook kernel __name__ is "__main__", so this runs exactly as the cell
# did before; the guard only keeps the helpers above importable without
# touching the filesystem.
if __name__ == "__main__":
    # Load the German spaCy model. It is not used by the code in this cell.
    nlp = spacy.load("de_core_news_sm")

    # Load data from JSON file.
    # Assumes a list of objects shaped like {"id": ..., "profession": ..., "text": ...}
    # - TODO confirm against the producer of chunks.json.
    with open(CHUNKS_PATH, "r", encoding="utf-8") as file:
        data_list = json.load(file)

    # Generate dataset
    dataset = []
    for data in data_list:
        dataset.extend(
            build_qa_pairs(data.get("profession", ""), data.get("text", ""))
        )

    # Save the dataset to a JSON file
    with open(DATASET_PATH, "w", encoding="utf-8") as output_file:
        json.dump(dataset, output_file, ensure_ascii=False, indent=2)

    print("Dataset generation complete.")


Dataset generation complete.


In [36]:
import json
from collections import defaultdict

# Read back the question/answer pairs stored in output_dataset_6.json
with open("output_dataset_6.json", "r", encoding="utf-8") as src:
    dataset = json.load(src)

# Answers collected per question; questions keep their first-seen order
grouped_answers = defaultdict(list)

for record in dataset:
    # This exact answer string is swapped for a fixed "no salary listed" notice
    # before grouping (the entry in `dataset` is updated in place).
    if record["answer"] == "Einstiegsgehalt: Gehalt:":
        record["answer"] = "Zu diesem Beruf ist leider kein Gehalt angeführt"
    grouped_answers[record["question"]].append(record["answer"])

# One entry per question, with all of its answers joined by single spaces
combined_dataset = [
    {"question": question, "combined_answers": " ".join(answers)}
    for question, answers in grouped_answers.items()
]

# Write the combined entries to a new JSON file
with open("combined_questions_3.json", "w", encoding="utf-8") as dst:
    json.dump(combined_dataset, dst, ensure_ascii=False, indent=2)

print("Combining answers complete.")


Combining answers complete.
