In [5]:
# Newspaper titles still to be collected, each mapped to a count.
# NOTE(review): the unit of the counts is not stated here — presumably issues
# per title; confirm against the data source.
wish_list = {
    "Berliner Tageblatt und Handelszeitung": 30_913,
    "Norddeutsche allgemeine Zeitung": 17_924,
    "National-Zeitung": 5_142,
    # TODO: clarify the "Allgemeine Zeitung" entry before adding it
    "Vorwärts(Berlin)": 17_600,
    "Vorwärts(Leipzig)": 317,
    "General-Anzeiger für Dortmund und die Provinz Westfalen, größte und verbreitetste Tageszeitung Westdeutschlands": 10_789,
    # TODO: clarify the "Neueste Mitteilungen" entry before adding it
}
# Sum of the counts above: 82,685

In [5]:
# **** |id|date|page number|chunk|response| ****
import csv
import os

class DatasetCollector:
    """Append-only CSV sink bound to a single file path.

    ``add_row`` appends one metadata record under the ``ROW_HEADER`` columns;
    ``add_ids_only`` appends bare item IDs under a single ``id`` column.
    The file is created on the first write, and a header row is emitted
    whenever the file is missing or still empty.
    """

    # Column order of the metadata CSV written by ``add_row``.
    ROW_HEADER = ("item_id", "publisher", "title", "pub_date")

    def __init__(self, csv_file):
        """Store the target CSV path; nothing is written until a row is added."""
        self.csv_file = csv_file

    def _needs_header(self):
        """Return True when the CSV is missing or empty, i.e. has no header yet."""
        return not os.path.isfile(self.csv_file) or os.path.getsize(self.csv_file) == 0

    def add_row(self, row_dict):
        """Append one metadata record, aligned to the ``ROW_HEADER`` columns.

        Args:
            row_dict: Mapping from column name to value. Keys may come in any
                order; columns missing from the mapping are written as empty
                strings.

        Raises:
            ValueError: If ``row_dict`` contains a key that is not a known
                column. Writing it positionally would silently put the value
                under the wrong header.
        """
        unknown = [key for key in row_dict if key not in self.ROW_HEADER]
        if unknown:
            raise ValueError(
                f"Unknown column(s) {unknown}; expected a subset of {list(self.ROW_HEADER)}"
            )

        # Decide before opening: opening in append mode creates the file.
        write_header = self._needs_header()
        with open(self.csv_file, mode='a', encoding='utf-8', newline='') as f:
            # Fixed fieldnames (not row_dict.keys()) keep every row in header order.
            writer = csv.DictWriter(f, fieldnames=self.ROW_HEADER, restval='')
            if write_header:
                writer.writeheader()
            writer.writerow(row_dict)

    def add_ids_only(self, id_list):
        """Append item IDs to the CSV, one per row in a single ``id`` column.

        A header row is written first when the file is missing or empty.
        Errors are reported on stdout rather than raised, so a failed write
        does not abort a long-running harvest loop.

        Args:
            id_list: Iterable of item IDs.
        """
        try:
            # Materialise once so one-shot iterables can be written and counted.
            ids = list(id_list)
            write_header = self._needs_header()

            with open(self.csv_file, mode="a", newline="", encoding="utf-8") as csvfile:
                writer = csv.writer(csvfile)

                if write_header:
                    writer.writerow(["id"])

                writer.writerows([item_id] for item_id in ids)

            print(f"✔ Added {len(ids)} IDs to {self.csv_file}")
        except Exception as e:
            print(f"❌ Error writing IDs to {self.csv_file}: {e}")

In [6]:
import requests
import time

class DDBAPI:
    """Small client for the Deutsche Digitale Bibliothek (DDB) newspaper API.

    It holds the auth headers and the search parameters for one newspaper
    (selected by its ZDB id) and offers helpers to page through issue IDs
    and to fetch per-item metadata.

    The API key is a credential and is never stored in source: pass it as
    ``api_key`` or export it in the ``DDB_API_KEY`` environment variable.
    """

    # Environment variable consulted when no api_key argument is given.
    API_KEY_ENV_VAR = "DDB_API_KEY"
    # Seconds to wait on the search endpoint / the item endpoint.
    SEARCH_TIMEOUT = 30
    ITEM_TIMEOUT = 10

    def __init__(self, zdb, rows, start, api_key=None):
        """Build headers and search parameters.

        Args:
            zdb: ZDB identifier used in the ``zdb_id`` filter query.
            rows: Number of results requested per page.
            start: Offset of the first result to request.
            api_key: DDB API key. When omitted, it is read from the
                ``DDB_API_KEY`` environment variable.

        Raises:
            ValueError: If no API key is supplied and the environment
                variable is unset or empty.
        """
        import os  # local import: the defining cell only imports requests/time

        if api_key is None:
            api_key = os.environ.get(self.API_KEY_ENV_VAR)
        if not api_key:
            raise ValueError(
                f"No DDB API key: pass api_key=... or set the "
                f"{self.API_KEY_ENV_VAR} environment variable."
            )

        self.api_key = api_key
        self.headers = {
            "Authorization": f'OAuth oauth_consumer_key="{self.api_key}"',
            "Accept": "application/json",
        }
        self.zdb = zdb
        self.params = {
            "q": "*:*",
            "fq": [
                # Restrict to issues published from 1870 through 1940.
                "publication_date:[1869-12-31T23:59:59.999Z TO 1940-12-31T23:59:59.999Z]",
                f"zdb_id:{self.zdb}"
            ],
            "rows": rows,
            "start": start
        }
        self.url = "https://api.deutsche-digitale-bibliothek.de/search/index/newspaper-issues/select"

    def set_paging(self, start, rows):
        """Update pagination parameters for the API request."""
        self.params["start"] = start
        self.params["rows"] = rows

    def get_ddb_data(self):
        """Run the newspaper-issue search with the current parameters.

        Returns:
            The raw ``requests.Response``. The caller is expected to check
            ``status_code`` and pass the response to ``get_ids``.

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when the server does not answer within ``SEARCH_TIMEOUT``.
        """
        # Without a timeout a stalled connection would block the harvest forever.
        return requests.get(
            self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.SEARCH_TIMEOUT,
        )

    def get_ids(self, response):
        """Extract the ``id`` of every document in a search response.

        Args:
            response: Object whose ``json()`` returns a mapping with the
                documents under ``['response']['docs']``.

        Returns:
            List of document IDs, in response order.
        """
        payload = response.json()
        return [doc['id'] for doc in payload['response']['docs']]

    def get_meta_data(self, item_id):
        """Fetch issue date, publisher and title for one item.

        This is best-effort: every failure is reported on stdout and mapped
        to empty strings so that a batch run can continue.

        Args:
            item_id: DDB item identifier appended to the items endpoint.

        Returns:
            Tuple ``(issued, publisher, title)``, or ``('', '', '')`` when the
            request fails or the expected fields are missing.
        """
        item_url = f"https://api.deutsche-digitale-bibliothek.de/items/{item_id}"
        try:
            response = requests.get(item_url, headers=self.headers, timeout=self.ITEM_TIMEOUT)
            response.raise_for_status()  # raises an HTTPError for bad responses (4xx, 5xx)
        except Exception as e:  # deliberately broad: includes every RequestException
            print(f"⚠️ Error fetching {item_id}: {e}")
            time.sleep(1)  # brief pause before the caller moves on to the next item
            return '', '', ''

        # raise_for_status() only rejects 4xx/5xx; other non-200 codes end up here.
        if response.status_code != 200:
            print(f"❌ Failed to fetch item {item_id}")
            return '', '', ''

        try:
            cho = response.json()['edm']['RDF']['ProvidedCHO']
            # 'issued' is read directly, while publisher and title are read from
            # the '$' key of a nested mapping.
            issued = cho['issued']
            publisher = cho['publisher']['$']
            title = cho['title']['$']
            return issued, publisher, title

        except Exception as e:
            print(f"❗ Error with item {item_id}: {e}")
            return '', '', ''

In [8]:
# Harvest settings for a single newspaper title.
zdb = "2764651-8"   # ZDB id of the Berliner Tageblatt und Handelszeitung
rows = 1_000        # page size: results requested per API call
start = 10_000      # resume offset (the first 10,000 IDs were already added)
total = 482_722     # total number of IDs to fetch

In [9]:

# Page through the search results for the configured newspaper and append
# each page of IDs to a CSV named after the ZDB id and the expected total.
csv_file = f"ids_only_{zdb}_{total}.csv"
collector = DatasetCollector(csv_file)
ddb = DDBAPI(zdb, rows, start)

while True:
    if start >= total:
        break  # the offset has reached the expected total

    print(f"\n➡️ Fetching IDs {start} to {start + rows} ...")

    response = ddb.get_ddb_data()
    if response.status_code == 200:
        item_ids = ddb.get_ids(response)
    else:
        print(f"❌ Request failed at start={start}, status={response.status_code}")
        break

    if not item_ids:
        print("⚠️ No more IDs returned, stopping.")
        break

    collector.add_ids_only(item_ids)

    # Advance the offset and keep the client's paging parameters in sync.
    start += rows
    ddb.set_paging(start, rows)
    time.sleep(1)  # be nice to the API


➡️ Fetching IDs 10000 to 11000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 11000 to 12000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 12000 to 13000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 13000 to 14000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 14000 to 15000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 15000 to 16000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 16000 to 17000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 17000 to 18000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 18000 to 19000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 19000 to 20000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 20000 to 21000 ...
✔ Added 1000 IDs to ids_only_2764651-8_482722.csv

➡️ Fetching IDs 21000 to 22000 ...
✔ Added 1000 IDs t

In [1]:
from regex_search_model import RegexSearchModel
# Smoke test of the regex model: word stems to look for, run against a short
# sample containing exact, misspelled, capitalised and punctuated variants.
terms = [
    "anarch",
    "terror",
    "kommun",
    "sozial",
    "revolut",
]
sample_text = "Anarhismus, anarch, Anarchismus anarcism, (Kommunist), sozialistisch, Terror!!! Revolutionär"

model = RegexSearchModel(terms)
model_response = model.generate_response(sample_text)
print(model_response)

{'success': True, 'word1': [' Anarchismus ', ' anarch', 'Anarhismus', 'anarcism'], 'word2': [' Terror!!!'], 'word3': [' (Kommunist)'], 'word4': [' sozialistisch'], 'word5': ['!! Revolutionär']}


In [1]:
import requests

import urllib.parse

import os
import warnings

# The DDB API key is a credential and must not live in the notebook:
# export it as DDB_API_KEY before starting the kernel.
api_key = os.environ.get("DDB_API_KEY", "")
if not api_key:
    warnings.warn(
        "DDB_API_KEY is not set; requests will be sent without a valid API key."
    )

headers = {
    "Authorization": f'OAuth oauth_consumer_key="{api_key}"',
    "Accept": "application/json",
}

zdb = "2764651-8"  # Berliner Tageblatt und Handelszeitung

# Match every issue of this newspaper published from 1870 through 1940;
# request the first page of 100 results.
params = {
    "q": "*:*",
    "fq": [
        "publication_date:[1869-12-31T23:59:59.999Z TO 1940-12-31T23:59:59.999Z]",
        f"zdb_id:{zdb}"
    ],
    "rows": 100,
    "start": 0
}
url = "https://api.deutsche-digitale-bibliothek.de/search/index/newspaper-issues/select"

In [None]:
# Run the search. The timeout makes a stalled connection fail with an error
# instead of blocking the kernel indefinitely.
response = requests.get(
    url,
    headers=headers,
    params=params,
    timeout=30,
)
print(response.status_code)

200


In [55]:
# The total hit count is reported under response -> numFound.
response_json = response.json()
search_summary = response_json['response']
num_results = search_summary['numFound']
print(f"Number of matching newspaper issues: {num_results}")

Number of matching newspaper issues: 482722


In [56]:
def get_ids(response):
    """Collect the ``id`` of each document in a search response.

    Args:
        response: Object whose ``json()`` returns a mapping holding the
            documents under ``['response']['docs']``.

    Returns:
        List of document IDs in the order they appear in the response.
    """
    collected = []
    for entry in response.json()['response']['docs']:
        collected.append(entry['id'])
    return collected

In [57]:
# Pull the IDs out of the fetched results page and report how many came back.
all_ids = get_ids(response)
id_count = len(all_ids)
print(id_count)

100


In [2]:
# Fetch the full record of one sample issue from the items endpoint.
item_id = "VH2WRQWLOFKEZ5QY7M4CCMZEA2ZSZNF3"
item_url = f"https://api.deutsche-digitale-bibliothek.de/items/{item_id}"
# Timeout so a stalled connection raises instead of hanging the kernel.
response = requests.get(item_url, headers=headers, timeout=10)

In [3]:
# Dump the decoded JSON body of the most recent `response` for inspection.
print(response.json())

{'properties': {'item-id': 'VH2WRQWLOFKEZ5QY7M4CCMZEA2ZSZNF3', 'dataset-id': '1880100145106489acMD', 'dataset-label': 'Gesamtlieferung (Zeitungsportal) - SBB Berlin (00008125) - METS/MODS', 'revision-id': '3', 'ingest-date': '2025-02-26T16:55:19+0100', 'cortex-type': 'Kultur', 'mapping-version': '5.4'}, 'edm': {'RDF': {'Aggregation': {'@about': 'http://www.zvdd.de/record/DE-1/PPN782642756_1879100302/dmd', 'aggregatedCHO': {'@resource': 'http://www.deutsche-digitale-bibliothek.de/item/VH2WRQWLOFKEZ5QY7M4CCMZEA2ZSZNF3'}, 'dataProvider': ['Staatsbibliothek zu Berlin - Preußischer Kulturbesitz', {'@resource': 'http://www.deutsche-digitale-bibliothek.de/organization/6GFV3I4ELFEEFQIN2WECOXMTI5FUWHCK'}], 'isShownAt': {'@resource': 'https://resolver.staatsbibliothek-berlin.de/SNP27646518-18791003-1-0-0-0'}, 'isShownBy': {'@resource': 'https://content.staatsbibliothek-berlin.de/zefys/SNP27646518-18791003-1-1-0-0/full/full/0/default.jpg'}, 'provider': 'Deutsche Digitale Bibliothek', 'rights': [{

In [None]:
# German keywords of interest.
# NOTE(review): presumably these are what gets sent to the model as
# word1..wordn — confirm against the code that builds the prompt.
terms = ["Anarchismus", "Terrorismus", "Kommunismus", "Sozialismus", "Revolution"]
# Define the model ID and system message
# Uncomment the model_id you want to use (exactly one should be active)
# model_id = "phi-1_5"
# model_id = "phi-3-mini-4k-instruct"
# model_id = "gpt-4o-mini"
model_id = "Llama-3.1-8b-instruct"
# model_id = "gpt-oss-20b"
# model_id = "regex_search_model"  # Placeholder for regex search model

# Set to True if you want to use Kraken OCR
use_kraken = False

# System prompt, assembled from adjacent string literals. It tells the model:
#   1. the input is OCR text from old newspapers and may contain misread characters,
#   2. word forms, derivatives and related terms of a keyword count as matches,
#   3. the reply must be strict JSON with a "success" flag and one list per keyword.
system_msg = (
    "SYSTEM MESSAGE:\n\n"
    # Context and OCR-error tolerance.
    "The texts provided in the prompt are taken from OCR outputs of old newspapers. "
    "Some characters in the OCR text may be misread (e.g., 'h' as 'n', 'd' as 'o', 's' as 'ſ', 'i' as 't', 'l' as 't'), "
    "so you should be able to recognize and process these kinds of mistakes correctly. "
    # Cognates and derivatives are matches too.
    "You should also consider different cognates of a keyword as matched. For example, several word forms, "
    "derivatives, and related terms of the German word 'Terrorismus' (terrorism): Terrorist, Terroranschlag, "
    "Terrororganisation, Antiterrorkampf, terrorisieren, etc.\n\n"
    # Required output schema.
    "Give your answer in a strict JSON format without any extra additional words or signs as follows:\n"
    "{\n"
    '  "success": boolean,\n'
    '  "word1": [],\n'
    '  "word2": [],\n'
    '  ...\n'
    '  "wordn": []\n'
    "}\n\n"
    # Worked example: one reply with matches, one without.
    "Example:\n"
    "If the query is {word1: \"Regelmäßig\", word2: \"Polizei\", word3: \"Terrorismus\"},\n"
    "the output JSON could be:\n"
    "{\n"
    '  "success": true,\n'
    '  "word1": ["Unregelmäßig"],\n'
    '  "word2": ["Polizisten", "Polizeiwagen"],\n'
    '  "word3": []\n'
    "}\n\n"
    "or\n\n"
    "{\n"
    '  "success": false,\n'
    '  "word1": [],\n'
    '  "word2": [],\n'
    '  "word3": []\n'
    "}\n\n"
    "depending on the matches found.\n\n"
    "SYSTEM MESSAGE END\n\n"
)

❌ Failed to fetch item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_1_DDB_FULLTEXT
Skipping item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_1_DDB_FULLTEXT due to a download error.
❌ Failed to fetch item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_2_DDB_FULLTEXT
Skipping item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_2_DDB_FULLTEXT due to a download error.
❌ Failed to fetch item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_3_DDB_FULLTEXT
Skipping item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_3_DDB_FULLTEXT due to a download error.
❌ Failed to fetch item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_4_DDB_FULLTEXT
Skipping item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS-ocr_4_DDB_FULLTEXT due to a download error.

📥 Downloading XMLs to: ddb\BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS
✔ Saved ddb\BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS\page_1.xml
✔ Saved ddb\BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS\page_2.xml
✔ Saved ddb\BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS\page_3.xml
✔ Saved ddb\BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS\page_4.xml
Processing item BXV7FDKRLGZBJOE5PVBXBFYUXNSL3UVS, c

Traceback (most recent call last):
  File "c:\Users\abduk\anaconda3\envs\dsai_src_code\lib\site-packages\huggingface_hub\utils\_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "c:\Users\abduk\anaconda3\envs\dsai_src_code\lib\site-packages\requests\models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/fireworks-ai/inference/v1/chat/completions

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\abduk\Desktop\UdS\DSAI_Project\src\main.py", line 172, in <module>
    main()
  File "c:\Users\abduk\Desktop\UdS\DSAI_Project\src\main.py", line 151, in main
    model_response = model.generate_response(chunk)
  File "c:\Users\abduk\Desktop\UdS\DSAI_Project\src\Llama_3p1_8b_instruct.py", line 32, in generate_response
    completion = self.client.chat.

In [2]:
# Wrap a raw (non-JSON) model reply in the standard result shape: a failed
# lookup with an empty match list per term, keeping the raw text for reference.
model_response = "Not found"
terms = ["Anarchismus", "Kommunismus", "Sozialismus", "Revolution"]
model_response = dict(
    [("success", False)]
    + [(term, []) for term in terms]  # a fresh list per term, never shared
    + [("llm_response", str(model_response))]
)

In [3]:
# Show the current value of `model_response`.
print(model_response)


{'success': False, 'Anarchismus': [], 'Kommunismus': [], 'Sozialismus': [], 'Revolution': [], 'llm_response': 'Not found'}
