In [None]:
import pandas as pd
from lxml import etree
import unicodedata
import plotly.express as px

#### Laden des Datensatzes: 

In [None]:
# Parse the MARC-XML export and collect every <record> element in the MARC21 "slim" namespace.
ns = {'marc': 'http://www.loc.gov/MARC21/slim'}
tree = etree.parse('Mittelalter.xml')
root = tree.getroot()
# Search the whole tree (".//") so records are found at any nesting depth.
records = list(root.iterfind('.//marc:record', namespaces=ns))
print("Gefundene Records:", len(records))

In [None]:
# Function for extracting the bibliographic fields of a single record
def parse_record(record):
    """Extract selected bibliographic fields from one MARC21 record element.

    Parameters
    ----------
    record
        An element offering ``xpath(query, namespaces=...)`` whose results
        expose a ``text`` attribute.

    Returns
    -------
    dict
        Keys ``idn``, ``creator``, ``title``, ``subtitle``, ``publisher``,
        ``place`` and ``year``. Each value is a string: all non-empty matches
        joined by ``"; "``, or ``"unknown"`` when the record yields no usable
        text for that field.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}

    def extract_text(xpath_query):
        """Return the cleaned, "; "-joined text of all matches, or "unknown"."""
        texts = []
        for field in record.xpath(xpath_query, namespaces=ns):
            if not field.text:
                continue
            # Drop the control characters U+0098 / U+009C embedded in the data
            # (presumably non-sorting markers around leading articles - TODO confirm).
            cleaned = field.text.replace('\x98', '').replace('\x9c', '')
            if cleaned:
                texts.append(cleaned)
        # Matched-but-empty fields count as missing too, so downstream checks
        # against "unknown" catch them and no stray "; " separators appear.
        return "; ".join(texts) if texts else "unknown"

    idn = extract_text("marc:controlfield[@tag='001']")
    creator = extract_text("marc:datafield[@tag='100']/marc:subfield[@code='a']")
    title = extract_text("marc:datafield[@tag='245']/marc:subfield[@code='a']")
    subtitle = extract_text("marc:datafield[@tag='245']/marc:subfield[@code='b']")
    place = extract_text("marc:datafield[@tag='264']/marc:subfield[@code='a']")
    publisher = extract_text("marc:datafield[@tag='264']/marc:subfield[@code='b']")
    year = extract_text("marc:datafield[@tag='264']/marc:subfield[@code='c']")

    return {
        "idn": idn,
        "creator": creator,
        "title": title,
        "subtitle": subtitle,
        "publisher": publisher,
        "place": place,
        "year": year
    }

In [None]:
# Run every record through "parse_record" and collect the resulting rows in one DataFrame:
result = list(map(parse_record, records))
df = pd.DataFrame(result)
df

In [None]:
# Select the rows whose "year" column holds the placeholder "unknown"
# and report their share of the whole table:
no_year = df[df['year'].eq("unknown")]
print(
    "Fehlende Jahreszahlen: ",
    len(no_year),
    "/",
    len(df),
    " - ",
    len(no_year)/len(df)*100,
    "%",
)

In [None]:
# Show the rows whose "year" entry is longer than 4 characters:
longer4 = df.loc[df['year'].str.len().gt(4)]
longer4

### Neue Visualisierung:

In [None]:
# Count how often each "year" value occurs; the result has the columns 'year' and 'count'
year_counts = (
    df['year']
    .value_counts()
    .rename_axis('year')
    .reset_index(name='count')
)

# Draw the bar chart with the x-axis categories in sorted order
fig = px.bar(
    year_counts,
    x='year',
    y='count',
    title='Häufigkeiten der Publikationsjahre',
    category_orders={"year": sorted(year_counts['year'].unique())},
)
fig.show()

In [None]:
# Pie chart of the same counts; no category_orders is passed here.
fig2 = px.pie(
    year_counts,
    names='year',
    values='count',
    title='Publikationsjahre',
    height=800,
    labels={'year': 'Year'},
)
# Put the percentage and the label inside each slice.
fig2.update_traces(textposition='inside', textinfo='percent+label')
fig2.show()

In [None]:
# Look up specific entries, here every row whose "year" is exactly "1991":
query_df = df[df['year'].eq("1991")]
query_df