In [3]:
import pandas as pd

In [4]:
# Load the Pataky name list (part 02) from CSV into a DataFrame.
data = pd.read_csv('data/pataky/cleaned-namelist-pataky-02.csv')

In [5]:
# Column with the person names; each value is used as one search query below.
names = data['Name Vorname']
# only for Brümmer
#dateOfBirth = data['Geburtsdatum']
# NOTE(review): request_res_list is not referenced anywhere else in the visible
# notebook — confirm it is unused before removing it.
request_res_list = []

#### The result is a list with one entry per name; each entry is either
#### - an empty list -> no entry found
#### - a single dictionary -> exactly one match, its GND id is returned
#### - a list of dicts -> more than one possible match

In [50]:
import requests

def autocomplete_search(query, timeout=30):
    """Query the lobid-gnd search endpoint and return the decoded JSON body.

    Parameters
    ----------
    query : str
        Search string sent as the ``q`` parameter.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get``. Without a timeout a
        single stalled connection would block the whole lookup loop forever.

    Returns
    -------
    The decoded JSON response, or ``None`` when the request failed, returned
    an HTTP error status, or the body could not be decoded as JSON.
    """
    url = "https://lobid.org/gnd/search"
    params = {
        'q': query,
        # Field list without embedded whitespace: the stray space in front of
        # "gender" was removed so the field name is sent exactly like the others.
        'format': 'json:preferredName,professionOrOccupation,gndSubjectCategory,dateOfBirth,periodOfActivity,gender'
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error making request: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for an invalid JSON body.
        print(f"Error decoding response: {e}")
        return None


In [55]:
# Query the API once per name and collect the responses in input order.
# autocomplete_search returns None for a failed request, so `res` may contain
# None entries. Appending one by one keeps the partial results available if the
# loop is interrupted.
res = []

for item in names:
    
    res.append(autocomplete_search(item))

#### save api results

In [56]:
import pickle

# One cache location for both reading and writing. The original tried to read
# from 'res' but wrote to this file, so the cache could never be hit.
API_CACHE_FILE = 'api_results_pataky02.pkl'

try:
    # NOTE: unpickling can execute arbitrary code — only load cache files that
    # were written by this notebook.
    with open(API_CACHE_FILE, 'rb') as file:
        api_results = pickle.load(file)
    print("Using cached API results.")
except FileNotFoundError:
    # No cache yet: persist the results of the API calls collected in `res`
    api_results = res
    with open(API_CACHE_FILE, 'wb') as file:
        pickle.dump(api_results, file)
    print("API results saved to file.")

API results saved to file.


#### load api results

In [7]:
# Load the previously pickled API results.
# NOTE(review): unpickling can execute arbitrary code — only read files you created yourself.
res = pd.read_pickle(r'data/pataky/api_results_pataky02.pkl')

#### extracts the gnd from url

In [8]:
def extract_id_from_url(url):
    """Return the trailing segment of ``url``, i.e. everything after the final '/'.

    A string without any '/' is returned unchanged.
    """
    _head, _sep, last_segment = url.rpartition('/')
    return last_segment

### filters out entries not connected with authors

In [11]:
results_temp = []

# Label fragments that mark an entry as belonging to an author. The same list
# is used for single and multiple matches (the original only checked
# "Autor"/"Autorin" when exactly one match was returned).
_AUTHOR_MARKERS = (
    "Personen zu Literaturgeschichte (Schriftsteller)",
    "Schriftsteller",
    "Schriftstellerin",
    "Autor",
    "Autorin",
)


def _is_author_entry(entry):
    """Return True if ``entry`` is an individualised person whose label contains an author marker."""
    return (
        "category" in entry
        and "label" in entry
        and entry["category"] == "Individualisierte Person"
        and any(marker in entry["label"] for marker in _AUTHOR_MARKERS)
    )


def check_matches(names, api_results):
    """Reduce the raw API results to author-related candidates, one entry per name.

    For every (name, result list) pair one entry is produced:

    * ``[]``            -- no hit at all (empty or ``None`` result)
    * ``str``           -- exactly one hit that is an author: its id, taken
                           from the last segment of the entry's "id" URL
    * ``list`` of dicts -- either the author candidates among several hits,
                           or the unchanged hit list when none qualifies
                           (this also covers a single non-author hit)

    The entries are stored in the module-level ``results_temp`` list, which is
    replaced on every call so that re-running the cell does not append the same
    results a second time. The list is also returned for convenience.
    """
    matched = []
    for _name, inner_list in zip(names, api_results):
        if not inner_list:
            # no hit found
            matched.append([])
        elif len(inner_list) == 1:
            candidate = inner_list[0]
            if _is_author_entry(candidate):
                matched.append(extract_id_from_url(candidate["id"]))
            else:
                # one person found, but not an author: keep the list unchanged
                matched.append(inner_list)
        else:
            # more than one hit: keep only the author candidates; fall back to
            # the original list if none of them qualifies
            authors = [entry for entry in inner_list if _is_author_entry(entry)]
            matched.append(authors if authors else inner_list)

    # Replace the content in place so existing references to results_temp stay valid.
    results_temp[:] = matched
    return results_temp

In [12]:
# Fills the module-level results_temp list; the call itself returns nothing that is used here.
check_matches(names, res)

#### Pataky only: Filtert Personen, die mit "Männlich" gelabelt sind, raus

In [13]:
def check_gender(results_temp):
    """Drop candidates whose label contains "Männlich" from every candidate list.

    Parameters
    ----------
    results_temp : list
        One entry per name: an empty value, a string, or a list of candidate dicts.

    Returns
    -------
    list
        A new list with exactly one entry per input entry, so positions stay
        aligned with the name list:

        * empty entries become ``[]``
        * candidate lists are filtered; candidates without a "label" are kept,
          because nothing marks them as male
        * everything else (strings, unexpected types) is passed through unchanged
    """
    gender_result = []
    for entry in results_temp:
        if not entry:
            gender_result.append([])
        elif isinstance(entry, list):
            # .get() instead of ["label"]: unfiltered hit lists may contain
            # dicts that carry no "label" key.
            kept = [
                person for person in entry
                if "Männlich" not in (person.get("label") or "")
            ]
            gender_result.append(kept)
        else:
            # Pass through instead of dropping the entry; a dropped entry would
            # shift all following results against the name list.
            gender_result.append(entry)

    return gender_result

In [14]:
# Apply the gender filter to the entries collected in results_temp.
gender_results = check_gender(results_temp)

#### Abgleich mit Geburtsjahr 

In [15]:
# Reads the year from a date string whose last four characters are the year
def extract_year_from_date(date_str):
    """Convert the final four characters of ``date_str`` to an int."""
    year_text = date_str[-4:]
    return int(year_text)

In [16]:
def extract_birth_year(label):
    """Return the first year found in a '|'-separated label, or ``None``.

    Each part of the label is stripped and checked against two formats:

    * ``"1800-05-12"`` -- ten characters with '-' at positions 4 and 7; the
      first four characters are the year
    * ``"1800"``       -- exactly four digits

    Parameters
    ----------
    label : str
        Label text whose parts are separated by '|'.

    Returns
    -------
    int or None
        The year of the first matching part; ``None`` if no part matches or
        if ``label`` is not a string (a message is printed in that case).
    """
    try:
        for part in label.split('|'):
            cleaned_part = part.strip()

            # Check for the format "1800-05-12"
            if len(cleaned_part) == 10 and cleaned_part[4] == '-' and cleaned_part[7] == '-':
                year_part = cleaned_part[:4]
                # isdecimal() rather than isdigit(): isdigit() also accepts
                # characters such as superscripts that int() cannot convert.
                if year_part.isdecimal():
                    return int(year_part)

            # Check for the format "1800"
            elif len(cleaned_part) == 4 and cleaned_part.isdecimal():
                return int(cleaned_part)

        # Return None if no valid format is found
        return None
    except (AttributeError, TypeError) as te:
        # A non-string label (e.g. None) has no .split and raises AttributeError,
        # which the original `except TypeError` did not catch.
        print(f"Error extracting birth year: {te}")
        return None

In [None]:
#### Brümmer only

def find_id_from_results(results, dob):
    """Return the id of the first candidate whose birth year equals the year of ``dob``.

    A candidate only counts if its year also lies strictly between 1770 and
    1870. A ``dob`` of "." never matches. Returns ``None`` if no candidate
    qualifies.
    """
    for candidate in results:
        year = extract_birth_year(candidate["label"])
        if year is None or dob == ".":
            continue
        wanted_year = int(extract_year_from_date(dob))
        if year == wanted_year and 1770 < year < 1870:
            return extract_id_from_url(candidate["id"])
    return None

In [17]:
#### Pataky only, because there is no date of birth to compare against -> "birth-date" refers either to the year of birth or to the year of activity (from the API query)

def find_id_from_results(results):
    """Return the id of the first candidate whose label year lies strictly between 1770 and 1870.

    Returns ``None`` if no candidate qualifies.
    """
    for candidate in results:
        year = extract_birth_year(candidate["label"])
        if year is not None and 1770 < year < 1870:
            return extract_id_from_url(candidate["id"])
    return None

In [18]:
# NOTE(review): this repeats the definition of extract_year_from_date from an
# earlier cell with an identical body; the later definition silently replaces
# the earlier one. One of the two can be removed.
def extract_year_from_date(date_str):
    """Convert the last four characters of ``date_str`` to an int."""
    return int(date_str[-4:])

In [19]:
def check_birthyear(names, temp_results):
    """Build one ``{"name": ..., "id": ...}`` record per name.

    The "id" value depends on the kind of entry paired with the name:

    * empty entry     -> "kein Eintrag gefunden"
    * string          -> the string itself
    * list            -> the id chosen by ``find_id_from_results``, or
                         "Person außerhalb des Zeitfensters" if none is chosen
    * anything else   -> "Person konnte nicht eindeutig bestimmt werden"

    A progress line is printed for every name.
    """
    def _resolve(entry):
        # Guard clauses in the same order as the checks described above.
        if not entry:
            return "kein Eintrag gefunden"
        if isinstance(entry, str):
            return entry
        if isinstance(entry, list):
            found = find_id_from_results(entry)
            return found if found else "Person außerhalb des Zeitfensters"
        return "Person konnte nicht eindeutig bestimmt werden"

    records = []
    for name, entry in zip(names, temp_results):
        print(f"Checking results for {name}:")
        records.append({"name": name, "id": _resolve(entry)})
    return records

In [20]:
# Resolve every name to an id or a status message; prints one progress line per name.
final_result = check_birthyear(names, gender_results)

Checking results for Maass Anna:
Checking results for Maass-Suhr Luise:
Checking results for Machczynśca Antonie:
Checking results for Machold A. S:
Checking results for Madelung Franziska:
Checking results for Maderny Helene:
Checking results for Mädler Minna:
Checking results for Madsack Luise:
Checking results for Magdalene:
Checking results for Maggy:
Checking results for Magosch Hermine:
Checking results for Maguire Helena:
Checking results for Mahn-Wothe Anny:
Checking results for Mai Clara:
Checking results for Mai Luise:
Checking results for Mainau Franziska:
Checking results for Maître le Elisabeth:
Checking results for Maier Anna:
Checking results for Maier Elisa:
Checking results for Maier-Streib Sophie:
Checking results for Mainhardt J:
Checking results for Majdanska Helena:
Checking results for Makowiczka A:
Checking results for Maksa-Segalla Ida:
Checking results for Malapert Konstanze Marie:
Checking results for Malcomes Gizella:
Checking results for Malecka Marie:
Check

In [None]:
import csv

def save_to_csv(data, filename):
    """Write a list of dictionaries to ``filename`` as UTF-8 CSV.

    The keys of the first dictionary define the columns; for empty ``data``
    the column list is empty.
    """
    columns = list(data[0]) if data else []
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)

# Write the name/id records to a CSV file in the current working directory.
save_to_csv(final_result, 'pataky-gnd-02-03-3-24.csv')


In [None]:
import os
import glob

# merging the files
joined_files = os.path.join("temp", "*.csv")

# glob returns the matches in filesystem-dependent order; sorting makes the row
# order of the merged frame reproducible (a later keep="first" de-duplication
# depends on it).
joined_list = sorted(glob.glob(joined_files))

# Finally, the files are joined
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
print(df.head())
# index=False: do not write the row index as an extra unnamed column
df.to_csv("pataky-gnd-03-03-24.csv", index=False)

              name                                 id
0      Aabel Marie  Person außerhalb des Zeitfensters
1      Aarau Agnes              kein Eintrag gefunden
2    Abegg Eugenie              kein Eintrag gefunden
3  Abel Clementine                          137537719
4      Abele Marie                         1192856465


In [None]:
import csv

def split_csv_by_id(csv_file_path, numeric_output_file, non_numeric_output_file):
    """Split a CSV file into two files based on the shape of the 'id' column.

    A row goes to ``numeric_output_file`` when its 'id' is non-empty and every
    character except the last one is a digit; all other rows go to
    ``non_numeric_output_file``. Both outputs get the input's header.
    """
    with open(csv_file_path, newline='', encoding='utf-8') as source:
        with open(numeric_output_file, 'w', newline='', encoding='utf-8') as numeric_target, \
             open(non_numeric_output_file, 'w', newline='', encoding='utf-8') as other_target:

            rows = csv.DictReader(source)
            columns = rows.fieldnames
            numeric_writer = csv.DictWriter(numeric_target, fieldnames=columns)
            other_writer = csv.DictWriter(other_target, fieldnames=columns)

            for header_writer in (numeric_writer, other_writer):
                header_writer.writeheader()

            for record in rows:
                identifier = record.get('id')
                # The last character is exempt from the digit check
                looks_numeric = bool(identifier) and identifier[:-1].isdigit()
                target = numeric_writer if looks_numeric else other_writer
                target.writerow(record)

In [None]:
# Path of the merged result CSV that is split below
csv_file_path = 'data/pataky/pataky-gnd-03-03-24.csv'

# Output files: rows whose id passes the digit check, and all remaining rows
numeric_output_file = 'pataky-gnd-gefunden-03-03-24.csv'
non_numeric_output_file = 'pataky-gnd-nicht-gefunden-03-03-24.csv'

# Call the function to split the CSV file
split_csv_by_id(csv_file_path, numeric_output_file, non_numeric_output_file)


In [None]:
# Re-read the merged result file and keep only the first row for every name.
data = pd.read_csv('data/pataky/pataky-gnd-03-03-24.csv')
# Reassign instead of inplace=True so the cell gives the same result when re-run
data = data.drop_duplicates(subset=["name"], keep="first")
# index=False: otherwise every read/write round adds another unnamed index column
data.to_csv("pataky-gnd-03-03-24.csv", index=False)