In [None]:
import requests
import os
import re
import time

class DDBAPI_GET_XMLS:
    """Small client for the Deutsche Digitale Bibliothek (DDB) REST API.

    It fetches an item's JSON record and downloads every ``.xml`` file hosted
    under the API's ``/binary/`` endpoint that the record's source XML links to.
    """

    # Name of the environment variable consulted when no key is passed in.
    API_KEY_ENV_VAR = "DDB_API_KEY"

    # NOTE(security): a credential committed to source control should be
    # treated as leaked. It is kept only so that existing callers which build
    # the client without arguments keep working. Prefer passing ``api_key`` or
    # setting DDB_API_KEY, and rotate this key.
    _FALLBACK_API_KEY = "OYSi9Dygc0XZ0Nvq2vgPxe4oXNmomCtWWZHM7CVd3Fo7iC0qKge1748029090188"

    # Value returned by get_xmls_only when an item cannot be processed.
    _FAILURE = (None, 0, '', '', '')

    def __init__(self, api_key=None):
        """Store the API key and build the request headers.

        Args:
            api_key: DDB API key. When omitted, the DDB_API_KEY environment
                variable is used; if that is unset or empty, the built-in
                fallback key is used.
        """
        if api_key is None:
            api_key = os.environ.get(self.API_KEY_ENV_VAR) or self._FALLBACK_API_KEY
        self.api_key = api_key
        self.headers = {
            "Authorization": f'OAuth oauth_consumer_key="{self.api_key}"',
            "Accept": "application/json",
        }

    def safe_get(self, url, headers=None, retries=3, backoff=5, timeout=20):
        """GET ``url`` with retries, a timeout, and error reporting.

        Args:
            url: Address to fetch.
            headers: Optional request headers.
            retries: Maximum number of attempts.
            backoff: Seconds to sleep between attempts.
            timeout: Per-request timeout in seconds.

        Returns:
            The response of the first attempt that succeeds without an HTTP
            error status, or ``None`` once every attempt has failed.
        """
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
                response.raise_for_status()  # raise HTTPError for bad responses
                return response
            except requests.exceptions.Timeout:
                print(f"⏳ Timeout on attempt {attempt}/{retries} for {url}")
            except requests.exceptions.RequestException as e:
                print(f"⚠️ Request failed on attempt {attempt}/{retries}: {e}")

            # No point sleeping after the final attempt.
            if attempt < retries:
                print(f"🔁 Retrying in {backoff} seconds...")
                time.sleep(backoff)

        print(f"❌ Failed to fetch {url} after {retries} attempts.")
        return None

    def get_xmls_only(self, item_id, base_dir="ddb"):
        """Download all page XML files of one item into ``base_dir/item_id``.

        Files are saved as ``page_1.xml``, ``page_2.xml``, ... in the order
        their links appear in the item's source record.

        Args:
            item_id: DDB item identifier.
            base_dir: Directory under which the per-item folder is created.

        Returns:
            ``(folder, link_count, issued, publisher, title)``. ``link_count``
            is the number of XML links found, not the number successfully
            saved, so callers must tolerate missing page files. On any
            failure the result is ``(None, 0, '', '', '')``.
        """
        item_url = f"https://api.deutsche-digitale-bibliothek.de/items/{item_id}"
        response = self.safe_get(item_url, headers=self.headers)
        if not response or response.status_code != 200:
            print(f"❌ Failed to fetch item {item_id}")
            return self._FAILURE

        try:
            data = response.json()
            mets_xml = data["source"]["record"]["$"]
            cho = data['edm']['RDF']['ProvidedCHO']
            issued = cho['issued']
            publisher = cho['publisher']['$']
            title = cho['title']['$']

            # Collect every https link ending in .xml, then keep only those
            # served from the API's /binary/ endpoint.
            xml_links = re.findall(r'https://[^\s"]+\.xml', mets_xml)
            xml_links = [
                url for url in xml_links
                if url.startswith("https://api.deutsche-digitale-bibliothek.de/binary/")
            ]

            # Create output folder
            folder = os.path.join(base_dir, item_id)
            os.makedirs(folder, exist_ok=True)
            print(f"\n📥 Downloading XMLs to: {folder}")

            for i, xml_url in enumerate(xml_links, 1):
                # The binary downloads are requested without the auth headers.
                xml_resp = self.safe_get(xml_url, timeout=30)
                if xml_resp and xml_resp.status_code == 200:
                    xml_path = os.path.join(folder, f"page_{i}.xml")
                    with open(xml_path, "wb") as f:
                        f.write(xml_resp.content)
                    print(f"✔ Saved {xml_path}")
                else:
                    print(f"❌ Failed to download XML: {xml_url}")

            return folder, len(xml_links), issued, publisher, title

        except Exception as e:
            # Best-effort boundary: a malformed record or an I/O error skips
            # this item instead of aborting the whole batch.
            print(f"❗ Error with item {item_id}: {e}")
            return self._FAILURE



In [None]:
import xml.etree.ElementTree as ET

def xml2text(xml_path):
    """Extract the text of an XML page file, one output line per TextLine.

    Every ``TextLine`` element below the root contributes the ``CONTENT``
    attributes of its direct ``String`` children, joined with single spaces.
    Lines that are empty or whitespace-only are dropped, and the remaining
    lines are joined with newlines.
    """
    root = ET.parse(xml_path).getroot()

    # ElementTree spells a namespaced tag as "{uri}local". Reuse whatever
    # namespace the root element carries, or an empty one if it has none.
    root_tag = root.tag
    uri = root_tag[1:root_tag.find("}")] if root_tag.startswith("{") else ''
    prefixes = {'alto': uri}

    collected = []
    for line_el in root.findall('.//alto:TextLine', prefixes):
        joined = " ".join(
            word_el.attrib.get("CONTENT", "")
            for word_el in line_el.findall("alto:String", prefixes)
        )
        if joined.strip():
            collected.append(joined)

    return "\n".join(collected)

In [None]:
# **** |id|date|page number|chunk|response| ****
import csv
import os

class DatasetCollector:
    """Append result rows, or bare item IDs, to a CSV file, creating it on demand."""

    # Header written by add_row; every row is aligned to this column order.
    COLUMNS = (
        "item_id", "publisher", "title", "pub_date", "page_num", "chunk",
        "success", "anarch", "terror", "kommunis", "sozial", "revolut",
    )

    def __init__(self, csv_file):
        """Remember the path of the CSV file to append to."""
        self.csv_file = csv_file

    def _needs_header(self):
        """Return True when the CSV file is missing or still empty."""
        return not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0

    def add_row(self, row_dict):
        """Append one result row, writing the header first if the file is new.

        Values are written under the header columns whatever order the keys
        of ``row_dict`` arrive in. Header columns missing from ``row_dict``
        are written as empty cells. Keys not present in the header are
        appended after the known columns, so no data is dropped.

        Args:
            row_dict: Mapping of column name to value.
        """
        needs_header = self._needs_header()
        extras = [key for key in row_dict if key not in self.COLUMNS]
        fieldnames = [*self.COLUMNS, *extras]

        with open(self.csv_file, mode='a', encoding='utf-8', newline='') as f:
            if needs_header:
                csv.writer(f).writerow(self.COLUMNS)
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
            writer.writerow(row_dict)

    def add_ids_only(self, id_list):
        """Append IDs to the CSV file, one per row in a single column.

        A header row ``id`` is written first when the file is new or empty.
        Errors while writing are reported on stdout and not re-raised.

        Args:
            id_list: Sequence of IDs to append.
        """
        needs_header = self._needs_header()

        try:
            with open(self.csv_file, mode="a", newline="", encoding="utf-8") as csvfile:
                writer = csv.writer(csvfile)

                # Write header if file is new
                if needs_header:
                    writer.writerow(["id"])

                # Write each ID
                for item_id in id_list:
                    writer.writerow([item_id])

            print(f"✔ Added {len(id_list)} IDs to {self.csv_file}")
        except Exception as e:
            print(f"❌ Error writing IDs to {self.csv_file}: {e}")

In [None]:
import shutil

# Function to clean the folder by deleting all files and subfolders
def clean_the_folder(base_dir="ddb"):
    """
    Remove base_dir together with everything beneath it.

    The outcome is reported on stdout; a missing directory is not an error.
    """
    # Guard clause: nothing to remove.
    if not os.path.exists(base_dir):
        print(f"Directory '{base_dir}' does not exist.")
        return

    shutil.rmtree(base_dir)
    print(f"Deleted all contents in '{base_dir}'")

# Function to chunk text into smaller parts for LLM processing
def chunk_text_by_words(text, max_words):
    """Lazily yield text in pieces of at most max_words whitespace-separated words.

    Words inside a piece are re-joined with single spaces; the final piece
    may be shorter than max_words.
    """
    tokens = text.split()
    starts = range(0, len(tokens), max_words)
    yield from (' '.join(tokens[start:start + max_words]) for start in starts)

In [None]:
# Search-term stems. They are handed to RegexSearchModel in a later cell and
# match the last five column names of the CSV header written by
# DatasetCollector.add_row.
terms = ["anarch", "terror", "kommunis", "sozial", "revolut"]
# NOTE(review): presumably the ZDB identifier of the newspaper being processed.
# It is not referenced anywhere in the visible cells — confirm before relying on it.
zdb = "2764651-8"

In [None]:
import pandas as pd

# Tag of the person/worker whose ID splits are processed by this notebook.
my_name = "4_Bakhadir"

# The IDs are stored in four split files (suffix _1 ... _4). This run reads
# split 4; to process another split, change the numeric suffix below.
ids_csv_path = f"./BerTagHanZeitung/ids_split_{my_name}_4.csv"
df = pd.read_csv(ids_csv_path)

# Item IDs come from the "id" column; report how many there are.
id_list = df['id'].tolist()
total_ids = len(id_list)
print(total_ids)

In [None]:
from regex_search_model import RegexSearchModel
model = RegexSearchModel(terms)

os.makedirs("./BerTagHanZeitung/outdata", exist_ok=True)

# One output file per ID split (suffix _1 ... _4). This run writes split 4;
# change the numeric suffix to match the split loaded into id_list.
csv_file = f"./BerTagHanZeitung/outdata/output_{my_name}_4.csv"
collector = DatasetCollector(csv_file)
ddb = DDBAPI_GET_XMLS()

# Number of words per chunk handed to the model (loop-invariant).
max_chunk_length = 100

# Keeps `counter` defined even when id_list is empty.
counter = 0
for counter, item_id in enumerate(id_list, 1):
    print(f"\nProcessing item {counter}/{total_ids}: {item_id}")
    folder, numpages, issued, publisher, title = ddb.get_xmls_only(item_id)

    if folder is None or numpages == 0:
        print(f"Skipping item {item_id} due to a download error.")
        continue
    print(f"Processing item {item_id} with {numpages} pages.")

    for page in range(1, numpages + 1):
        xml_path = os.path.join(folder, f"page_{page}.xml")

        # numpages counts the links found, not the files saved, so an
        # individual page may be missing after a failed download.
        if not os.path.exists(xml_path):
            print(f"Skipping missing files for item {item_id}, page {page}")
            continue

        try:
            full_text = xml2text(xml_path)
        except (ET.ParseError, FileNotFoundError) as e:
            print(f"⚠️ Could not parse XML {xml_path}: {e}")
            continue   # skip this page if XML is malformed

        # Run the model on fixed-size word chunks and write one CSV row per chunk.
        for idx, chunk in enumerate(chunk_text_by_words(full_text, max_chunk_length), 1):
            print(f"Processing item {item_id}, {counter}, chunk {idx}")
            model_response = model.generate_response(chunk)
            # NOTE(review): model_response must be a mapping, because `**`
            # raises TypeError otherwise. Its keys are presumably the columns
            # after "chunk" in the CSV header — confirm against RegexSearchModel.
            collector.add_row({
                "item_id": item_id,
                "publisher": publisher,
                "title": title,
                "pub_date": issued,
                "page_num": page,
                "chunk": chunk,
                **model_response,
            })

    # Remove the downloaded XML files before moving on to the next item.
    clean_the_folder()