In [9]:
# packages
import pandas as pd
from bs4 import BeautifulSoup
import requests
from zipfile import ZipFile
import requests
import re
from datetime import date
import json
import xmltodict
import xml.etree.ElementTree as ET
import numpy as np
import os

In [33]:
# get links from website
# Scrape the overview page for the link to the current full data export.

URL = "https://ds.marktstammdatenregister.dev/Marktstammdatenregister"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(URL, timeout=60)
# Fail here on an HTTP error instead of parsing an error page below.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
# Dots are escaped so they only match literal dots in the host name.
export_anchor = soup.find(
    'a',
    href=re.compile(r'^https://download\.marktstammdatenregister\.de/Gesamtdatenexport_'),
)
if export_anchor is None:
    # The original `findAll(...)[0]` raised a bare IndexError in this case.
    raise RuntimeError(f"No Gesamtdatenexport download link found on {URL}")
# First matching anchor on the page, as before.
DataLink = export_anchor['href']

**Note:** The download in the next cell takes about 15 minutes.

In [None]:
# download data
# Fetch the complete export archive into memory; a later cell writes
# `req.content` to disk.
# timeout = (connect seconds, seconds without receiving any data), not a limit
# on the total download time.
req = requests.get(DataLink, timeout=(30, 300))
# Stop on an HTTP error so an error page is never saved as the zip archive.
req.raise_for_status()

In [6]:
# set filename
# The archive name carries today's date in ISO form (YYYY-MM-DD), which is
# what str() of a date yields.
datum = date.today()
filename = "MaStR_Gesamtdatenexport_" + datum.isoformat() + ".zip"


In [None]:
# writing file to system
# Persist the downloaded bytes held by `req` under `filename`
# (relative to the current working directory).
print('Download started')
with open(filename, mode='wb') as zip_out:
    payload = req.content
    zip_out.write(payload)
print('Download completed')

In [None]:
mastr_directory = "/home/jan/Uni/DS-Project/data/MaStR/"
zip_path = "/home/jan/Uni/DS-Project/modules/data/MaStR_Gesamtdatenexport_2022-11-06"
# Archive members to unpack: the solar (PVA) and wind (WKA) unit files.
unit_patterns = (r'^EinheitenSolar.*xml$', r'^EinheitenWind.*xml$')

# unzip folder and extract required data
with ZipFile(zip_path, 'r') as ZipObject:
    # Member names in archive order that match either pattern.
    unit_files = [
        member
        for member in ZipObject.namelist()
        if any(re.search(pattern, member) for pattern in unit_patterns)
    ]
    # Extract every match. The previous `range(len(Matches)-1)` loop silently
    # skipped the last matching member.
    for member in unit_files:
        ZipObject.extract(member, path=mastr_directory)

# The Solar parsing cell expects `Matches` to hold stringified one-element
# lists ("['name.xml']") and strips the brackets with [2:-2], so that format
# is kept here.
Matches = [str([member]) for member in unit_files]

# delete zip folder
os.remove(zip_path)

In [2]:
def parse_XML(xml_file, df_cols):
    """
    Parse a flat XML file into a DataFrame with one row per child of the root.

    Adapted from:
    https://medium.com/@robertopreste/from-xml-to-pandas-dataframes-9292980b1c1c

    Parameters
    ----------
    xml_file : str or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.
    df_cols : sequence of str
        Column names. The first name is looked up as an XML attribute of each
        record and, if that attribute is absent, as a child element of the
        same name. All remaining names are looked up as child elements.

    Returns
    -------
    pandas.DataFrame
        Columns are ``df_cols``; a value is None where neither the attribute
        nor the child element exists.
    """

    def _child_text(node, tag):
        # Text of the first child element called `tag`, or None if absent.
        found = node.find(tag)
        return None if found is None else found.text

    xroot = ET.parse(xml_file).getroot()
    rows = []

    for node in xroot:
        key_value = node.attrib.get(df_cols[0])
        if key_value is None:
            # Fall back to a child element: previously a key stored as an
            # element (not an attribute) always came back as None.
            key_value = _child_text(node, df_cols[0])
        values = [key_value] + [_child_text(node, el) for el in df_cols[1:]]
        # As before, a name listed twice in df_cols keeps the later value.
        rows.append(dict(zip(df_cols, values)))

    return pd.DataFrame(rows, columns=df_cols)

In [3]:
# Fields requested for both unit types, in output order.
_shared_unit_cols = [
    "EinheitMastrNummer",
    "AnlagenbetreiberMastrNummer",
    "Land",
    "Bundesland",
    "Landkreis",
    "Gemeinde",
    "Gemeindeschluessel",
    "Postleitzahl",
    "Inbetriebnahmedatum",
    "EinheitSystemstatus",
    "Nettonennleistung",
]

# Wind units. "EinheitMastrNummer" is deliberately listed twice: parse_XML
# looks the first column up as an XML attribute and the rest as child
# elements (presumably the second entry is what captures the element value --
# confirm against the XML).
WKA_cols = (
    ["EinheitMastrNummer"]
    + _shared_unit_cols
    + [
        # WKA specific
        "Laengengrad",
        "Breitengrad",
        "Einspeisungsart",
        "Hersteller",
        "Nabenhoehe",
        "Rotordurchmesser",
        "AuflagenAbschaltungSchallimmissionsschutzNachts",
        "AuflagenAbschaltungSchallimmissionsschutzTagsueber",
        "AuflagenAbschaltungSchattenwurf",
        "AuflagenAbschaltungTierschutz",
        "AuflagenAbschaltungEiswurf",
    ]
)

# Solar units: shared fields followed by the PVA-specific ones.
PVA_cols = _shared_unit_cols + [
    "Einspeisungsart",
    "ZugeordneteWirkleistungWechselrichter",
    "AnzahlModule",
    "Lage",
    "Leistungsbegrenzung",
    "EinheitlicheAusrichtungUndNeigungswinkel",
    "Hauptausrichtung",
    "HauptausrichtungNeigungswinkel",
    "Nutzungsbereich",
]

In [11]:
# parse Wind file
wind_xml = "/home/jan/Uni/DS-Project/data/MaStR/EinheitenWind.xml"
WKA_df = parse_XML(wind_xml, WKA_cols)
# The extracted XML is deleted once it has been loaded into WKA_df.
os.remove(wind_xml)

**Note:** This step takes about 6 minutes to run.

In [5]:
# Rebuild the list of solar/wind unit files contained in the export archive.
zip_path = "/home/jan/Uni/DS-Project/modules/data/MaStR_Gesamtdatenexport_2022-11-06"
if os.path.exists(zip_path):
    Matches = []
    with ZipFile(zip_path, 'r') as ZipObject:
        for names in ZipObject.namelist():
            for pattern in (r'^EinheitenSolar.*xml$', r'^EinheitenWind.*xml$'):
                found = re.findall(pattern, names)
                if found:
                    # Stored as the string form of the match list
                    # ("['name.xml']"); the Solar parsing cell strips the
                    # brackets and quotes with [2:-2].
                    Matches.append(str(found))
else:
    # The extraction cell deletes the archive after unpacking it, so on a
    # top-to-bottom run the file no longer exists here. That cell leaves the
    # same list in `Matches`, so reuse it instead of failing.
    try:
        Matches = list(Matches)
    except NameError:
        raise FileNotFoundError(
            f"{zip_path} does not exist and no file list from the extraction cell is available"
        ) from None

In [None]:
# parse Solar files
# get PVA filenames
mastr_directory = "/home/jan/Uni/DS-Project/data/MaStR/"
PVA_parts = []
for Match in Matches:
    # Entries are stringified one-element lists ("['name.xml']");
    # drop the surrounding brackets and quotes to get the file name.
    Source_Name = Match[2:-2]
    # Select solar files by name rather than assuming the wind file is the
    # last entry (the previous loop used `range(len(Matches)-1)`).
    if not re.search(r'^EinheitenSolar.*xml$', Source_Name):
        continue
    Unit = mastr_directory + Source_Name
    print(Source_Name)
    # fill PVA_df
    PVA_parts.append(parse_XML(Unit, PVA_cols))
    # The extracted XML is deleted once it has been parsed.
    os.remove(Unit)

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; concatenate all parts once instead. Each part keeps its
# own index, exactly as append did.
PVA_df = pd.concat(PVA_parts) if PVA_parts else pd.DataFrame()

In [8]:
# write csv files to disc
# One CSV per unit type, written into `mastr_directory`.

print('Writing started')
wka_target = mastr_directory + "WKA.csv"
WKA_df.to_csv(wka_target)
pva_target = mastr_directory + "PVA.csv"
PVA_df.to_csv(pva_target)
print('Writing completed')

Writing started
Writing completed
