### Packages

In [1]:
import folium
import geopandas as gpd
import ipywidgets as widgets
from ipywidgets import interact
from IPython.display import display, clear_output
import numpy as np
import os
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.io as pio
from folium.plugins import MarkerCluster
from pathlib import Path
from shapely.geometry import Point
import sys

In [2]:
# Dokumenteneinstellung
# DataFrame Options
np.set_printoptions(threshold=sys.maxsize)
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 1000)


# Einlesen der Daten

In [3]:
class WeatherDataProcessor:
    def __init__(self, main_folder_path):
        self.main_folder_path = Path(main_folder_path)
        self.weather_data_list = []
        self.metadata_list = []
        self.missing_data_list = []
        self.incorrect_data_list = []
        self.geography_metadata_list = []
        self.parameter_metadata_list = []
        self.station_history_list = []  # Definiere die Liste für Stationen
        self.operator_history_list = []

    def load_data_and_metadata(self):
        for station_folder in self.main_folder_path.iterdir():
            if station_folder.is_dir():
                for file in station_folder.iterdir():
                    if file.suffix == '.txt':
                        with open(file, 'r', encoding='latin-1') as f:
                            lines = f.readlines()

                            if file.name.startswith('produkt_klima_tag'):
                                weather_data = pd.read_csv(file, sep=';', encoding='latin-1')
                                if 'eor' in weather_data.columns:
                                    weather_data = weather_data.drop('eor', axis=1)
                                self.weather_data_list.append(weather_data)

                            elif file.name.startswith('Metadaten_Geraete'):
                                metadata = pd.read_csv(file, sep=';', encoding='latin-1')
                                if 'eor' in metadata.columns:
                                    metadata = metadata.drop('eor', axis=1)
                                metadata = metadata.iloc[:-1] if 'generiert' in lines[-1] else metadata
                                self.metadata_list.append(metadata)

                            elif file.name.startswith('Metadaten_Fehldaten'):
                                missing_data = pd.read_csv(file, sep=';', encoding='latin-1')
                                missing_data = missing_data.drop('                                                                                                              ', axis=1)
                                if 'eor' in missing_data.columns:
                                    missing_data = missing_data.drop('eor', axis=1)
                                missing_data = missing_data.iloc[:-1] if 'generiert' in lines[-1] else missing_data
                                missing_data.columns = missing_data.columns.str.strip()
                                self.missing_data_list.append(missing_data)

                            elif file.name.startswith('Metadaten_Fehlwerte'):
                                incorrect_data = pd.read_csv(file, sep=';', encoding='latin-1')
                                incorrect_data = incorrect_data.drop('                                                                                                              ', axis=1)
                                if 'eor' in incorrect_data.columns:
                                    incorrect_data = incorrect_data.drop('eor', axis=1)
                                incorrect_data = incorrect_data.iloc[:-1] if 'generiert' in lines[-1] else incorrect_data
                                incorrect_data.columns = incorrect_data.columns.str.strip()
                                self.incorrect_data_list.append(incorrect_data)

                            elif file.name.startswith('Metadaten_Geographie'):
                                geography_metadata = pd.read_csv(file, sep=';', encoding='latin-1')
                                if 'eor' in geography_metadata.columns:
                                    geography_metadata = geography_metadata.drop('eor', axis=1)
                                self.geography_metadata_list.append(geography_metadata)

                            elif file.name.startswith('Metadaten_Parameter'):
                                parameter_metadata = pd.read_csv(file, sep=';', encoding='latin-1')
                                if 'eor' in parameter_metadata.columns:
                                    parameter_metadata = parameter_metadata.drop('eor', axis=1)
                                parameter_metadata = parameter_metadata.iloc[:-1] if 'generiert' in lines[-1] else parameter_metadata
                                self.parameter_metadata_list.append(parameter_metadata)

                            elif file.name.startswith('Metadaten_Station'):
                                station_history = pd.read_csv(file, sep=';', encoding='latin-1')
                                if 'eor' in station_history.columns:
                                    station_history = station_history.drop('eor', axis=1)
                                station_history = station_history.iloc[:-1] if 'generiert:' in lines[-1] else station_history
                                self.station_history_list.append(station_history)




    def combine_weather_data(self):
        combined_weather_data = pd.concat(self.weather_data_list, ignore_index=True)
        combined_weather_data['MESS_DATUM'] = pd.to_datetime(combined_weather_data['MESS_DATUM'],  format='%Y%m%d') # , errors="coerce")
        combined_weather_data['MESS_DATUM'] = combined_weather_data['MESS_DATUM'].dt.normalize()
        combined_weather_data = combined_weather_data.rename(columns=lambda x: x.strip())
        combined_weather_data = combined_weather_data.replace(-999, pd.NA)
        return combined_weather_data

    def combine_metadata(self):
        combined_metadata = pd.concat(self.metadata_list, ignore_index=True)
        combined_metadata['Von_Datum'] = pd.to_datetime(combined_metadata['Von_Datum'], errors='coerce', format='%Y%m%M')
        combined_metadata['Bis_Datum'] = pd.to_datetime(combined_metadata['Bis_Datum'], errors='coerce', format='%Y%m%M')
        combined_metadata['Von_Datum'] = combined_metadata['Von_Datum'].dt.normalize()
        combined_metadata['Bis_Datum'] = combined_metadata['Bis_Datum'].dt.normalize()
        return combined_metadata

    def combine_missing_data(self):
        combined_missing_data = pd.concat(self.missing_data_list, ignore_index=True)
        return combined_missing_data
    
    def combine_incorrect_data(self):
        combined_incorrect_data = pd.concat(self.incorrect_data_list, ignore_index=True)
        return combined_incorrect_data
    
    def combine_geography_metadata(self):
        combined_geography_metadata = pd.concat(self.geography_metadata_list, ignore_index=True)
        combined_geography_metadata['von_datum'] = pd.to_datetime(combined_geography_metadata['von_datum'], errors='coerce', format='%Y%m%M')
        combined_geography_metadata['bis_datum'] = pd.to_datetime(combined_geography_metadata['bis_datum'], errors='coerce', format='%Y%m%M')
        combined_geography_metadata['von_datum'] = combined_geography_metadata['von_datum'].dt.normalize()
        combined_geography_metadata['bis_datum'] = combined_geography_metadata['bis_datum'].dt.normalize()
        return combined_geography_metadata
    
    def separate_station_history(self):
        station_data = []
        operator_data = []

        for station_folder in self.main_folder_path.iterdir():
            if station_folder.is_dir():
                for file in station_folder.iterdir():
                    if file.name.startswith('Metadaten_Station'):
                        with open(file, 'r', encoding='latin-1') as f:
                            current_table = None
                            for line in f:
                                line = line.strip()
                                if not line:
                                    continue
                                if line.startswith('Stations_ID'):
                                    current_table = 'station' if 'Stationsname' in line else 'operator'
                                    continue
                                if current_table == 'station':
                                    station_data.append(line.split(';'))
                                elif current_table == 'operator':
                                    operator_data.append(line.split(';'))
    

        station_columns = ['Stations_ID', 'Stationsname', 'Von_Datum', 'Bis_Datum']
        operator_columns = ['Stations_ID', 'Betreibername', 'Von_Datum', 'Bis_Datum']

        station_df = pd.DataFrame(station_data, columns=station_columns)
        operator_df = pd.DataFrame(operator_data, columns=operator_columns)

        self.station_history_list.append(station_df)
        self.operator_history_list.append(operator_df)
        
        station_df['Von_Datum'] = pd.to_datetime(station_df['Von_Datum'], errors='coerce', format='%Y%m%M')
        station_df['Bis_Datum'] = pd.to_datetime(station_df['Bis_Datum'], errors='coerce', format='%Y%m%M')
        station_df['Von_Datum'] = station_df['Von_Datum'].dt.date
        station_df['Bis_Datum'] = station_df['Bis_Datum'].dt.date
        
        operator_df['Von_Datum'] = pd.to_datetime(operator_df['Von_Datum'], errors='coerce', format='%Y%m%M')
        operator_df['Bis_Datum'] = pd.to_datetime(operator_df['Bis_Datum'], errors='coerce', format='%Y%m%M')
        operator_df['Von_Datum'] = operator_df['Von_Datum'].dt.normalize()
        operator_df['Bis_Datum'] = operator_df['Bis_Datum'].dt.normalize()
        

        return station_df, operator_df


if __name__ == "__main__":
    processor = WeatherDataProcessor('../../Data/Wetterdaten')
    processor.load_data_and_metadata()

    weather_data = processor.combine_weather_data()

    metadata = processor.combine_metadata()

    missing_data = processor.combine_missing_data()

    incorrect_data = processor.combine_incorrect_data()

    geography_metadata = processor.combine_geography_metadata()

    parameter_metadata = processor.combine_metadata()
    
    station_df, operator_df = processor.separate_station_history()

# Eingelesene Daten verarbeiten

In [4]:
weather_data.head(3)

Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_3,FX,FM,QN_4,RSK,RSKF,SDK,SHK_TAG,NM,VPM,PM,TMK,UPM,TXK,TNK,TGK
0,377,1947-01-01,,,,5,0.0,0,2.0,6,6.3,5.9,,0.2,93.0,3.7,-3.9,
1,377,1947-01-02,,,,5,0.1,7,0.0,3,7.7,5.6,,0.2,90.0,1.8,-3.5,
2,377,1947-01-03,,,,5,0.0,0,0.0,3,6.0,6.0,,0.9,90.0,2.7,-1.0,


In [5]:
metadata.head(3)

Unnamed: 0,Stations_ID,Stationsname,Geo. Laenge [Grad],Geo. Breite [Grad],Stationshoehe [m],Geberhoehe ueber Grund [m],Von_Datum,Bis_Datum,Geraetetyp Name,Messverfahren,Unnamed: 11,Barometerhoehe ueber NN [m]
0,377,Bergzabern,7.99,49.1,181.0,2.0,1947-01-01,1965-08-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,
1,377,Bergzabern,8.0,49.1,185.0,2.0,1965-09-01,1968-12-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,
2,377,"Bergzabern, Bad",8.0,49.1,185.0,2.0,1969-01-01,1977-06-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,


In [6]:
missing_data.head(3)

Unnamed: 0,Stations_ID,Stations_Name,Parameter,Von_Datum,Bis_Datum,Gesamt_Fehlwerte,Beschreibung
0,377,"Bergzabern, Bad",TMK,01.01.1947,01.04.2024,641,Gesamt_Messzeitraum
1,377,"Bergzabern, Bad",TXK,01.01.1947,01.04.2024,639,Gesamt_Messzeitraum
2,377,"Bergzabern, Bad",TNK,01.01.1947,01.04.2024,639,Gesamt_Messzeitraum


In [7]:
geography_metadata.head(3)

Unnamed: 0,Stations_id,Stationshoehe,Geogr.Breite,Geogr.Laenge,von_datum,bis_datum,Stationsname
0,377,181.0,49.1017,7.992,1926-11-01,1939-08-01,"Bergzabern, Bad"
1,377,181.0,49.1017,7.992,1941-01-01,1943-12-01,"Bergzabern, Bad"
2,377,181.0,49.1017,7.992,1947-01-01,1965-08-01,"Bergzabern, Bad"


In [8]:
parameter_metadata.head(3)

Unnamed: 0,Stations_ID,Stationsname,Geo. Laenge [Grad],Geo. Breite [Grad],Stationshoehe [m],Geberhoehe ueber Grund [m],Von_Datum,Bis_Datum,Geraetetyp Name,Messverfahren,Unnamed: 11,Barometerhoehe ueber NN [m]
0,377,Bergzabern,7.99,49.1,181.0,2.0,1947-01-01,1965-08-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,
1,377,Bergzabern,8.0,49.1,185.0,2.0,1965-09-01,1968-12-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,
2,377,"Bergzabern, Bad",8.0,49.1,185.0,2.0,1969-01-01,1977-06-01,Psychrometerthermometer (trocken),"Temperatur/Feuchtemessung, konv.",,


In [9]:
incorrect_data.head(3)

Unnamed: 0,Stations_ID,Stations_Name,Parameter,Von_Datum,Bis_Datum,Anzahl_Fehlwerte,Beschreibung
0,377,"Bergzabern, Bad",TMK,01.01.2003,30.09.2004,639.0,Station aufgeloest
1,377,"Bergzabern, Bad",TMK,16.03.2008,17.03.2008,2.0,
2,377,"Bergzabern, Bad",TXK,01.01.2003,30.09.2004,639.0,Station aufgeloest


In [10]:
station_df.head(3)

Unnamed: 0,Stations_ID,Stationsname,Von_Datum,Bis_Datum
0,377,Bergzabern,1926-11-01,1939-08-01
1,377,Bergzabern,1941-01-01,1943-12-01
2,377,Bergzabern,1947-01-01,1968-12-01


In [11]:
operator_df.head(8)

Unnamed: 0,Stations_ID,Betreibername,Von_Datum,Bis_Datum
0,377,Wetterdienst,1926-11-01,1939-08-01
1,377,Wetterdienst,1941-01-01,1943-12-01
2,377,Wetterdienst,1947-01-01,1952-11-01
3,377,DWD,1952-11-01,2002-12-01
4,377,DWD,2003-03-01,NaT
5,generiert: 03.04.2024 -- Deutscher Wetterdien...,,NaT,NaT
6,433,Wetterdienst,1928-01-01,1952-11-01
7,433,DWD,1952-11-01,NaT


# Mergen von anderen DF

In [12]:
wea_geo = pd.merge(weather_data, geography_metadata, how='left', left_on=['STATIONS_ID'], right_on=['Stations_id'])

# wea_geo = wea_geo.drop(columns=["QN_3", "FX", "FM"])
wea_geo.fillna(0.0)

wea_geo = wea_geo[(wea_geo['MESS_DATUM'] >= wea_geo['von_datum']) & (wea_geo['MESS_DATUM'] <= wea_geo['bis_datum'])]
wea_geo = wea_geo.fillna(np.NaN)
# Add the geo-location
wea_geo['geometry'] = wea_geo.apply(lambda x: Point(x['Geogr.Laenge'], x['Geogr.Breite']), axis=1)
wea_geo.head(3)

  wea_geo.fillna(0.0)
  wea_geo = wea_geo.fillna(np.NaN)


Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_3,FX,FM,QN_4,RSK,RSKF,SDK,SHK_TAG,NM,VPM,PM,TMK,UPM,TXK,TNK,TGK,Stations_id,Stationshoehe,Geogr.Breite,Geogr.Laenge,von_datum,bis_datum,Stationsname,geometry
2,377,1947-01-01,,,,5.0,0.0,0.0,2.0,6.0,6.3,5.9,,0.2,93.0,3.7,-3.9,,377,181.0,49.1017,7.992,1947-01-01,1965-08-01,"Bergzabern, Bad",POINT (7.992 49.1017)
8,377,1947-01-02,,,,5.0,0.1,7.0,0.0,3.0,7.7,5.6,,0.2,90.0,1.8,-3.5,,377,181.0,49.1017,7.992,1947-01-01,1965-08-01,"Bergzabern, Bad",POINT (7.992 49.1017)
14,377,1947-01-03,,,,5.0,0.0,0.0,0.0,3.0,6.0,6.0,,0.9,90.0,2.7,-1.0,,377,181.0,49.1017,7.992,1947-01-01,1965-08-01,"Bergzabern, Bad",POINT (7.992 49.1017)


# Datenkenntnisse

In [13]:
wea_geo.dtypes

STATIONS_ID               int64
MESS_DATUM       datetime64[ns]
QN_3                    float64
FX                      float64
FM                      float64
QN_4                    float64
RSK                     float64
RSKF                    float64
SDK                     float64
SHK_TAG                 float64
NM                      float64
VPM                     float64
PM                      float64
TMK                     float64
UPM                     float64
TXK                     float64
TNK                     float64
TGK                     float64
Stations_id               int64
Stationshoehe           float64
Geogr.Breite            float64
Geogr.Laenge            float64
von_datum        datetime64[ns]
bis_datum        datetime64[ns]
Stationsname             object
geometry                 object
dtype: object

In [14]:
wea_geo_after_1994 = wea_geo[wea_geo['MESS_DATUM'] >= '1994-01-01']
stations_after_1994 = wea_geo_after_1994['Stations_id'].unique()

wea_geo_after_1994 = wea_geo_after_1994.sort_values(by=['STATIONS_ID', 'MESS_DATUM'])
wea_geo_after_1994.head(3)

Unnamed: 0,STATIONS_ID,MESS_DATUM,QN_3,FX,FM,QN_4,RSK,RSKF,SDK,SHK_TAG,NM,VPM,PM,TMK,UPM,TXK,TNK,TGK,Stations_id,Stationshoehe,Geogr.Breite,Geogr.Laenge,von_datum,bis_datum,Stationsname,geometry
102964,377,1994-01-01,,,,10.0,3.4,7.0,2.5,0.0,3.7,6.6,,2.3,87.0,6.4,-0.2,0.7,377,180.0,49.1014,8.0034,1977-06-01,2002-12-01,"Bergzabern, Bad",POINT (8.0034 49.1014)
102970,377,1994-01-02,,,,10.0,3.5,6.0,1.7,1.0,4.7,8.0,,5.5,90.0,8.5,-0.8,-4.5,377,180.0,49.1014,8.0034,1977-06-01,2002-12-01,"Bergzabern, Bad",POINT (8.0034 49.1014)
102976,377,1994-01-03,,,,10.0,8.4,6.0,0.7,0.0,4.7,9.2,,7.2,94.0,8.9,4.9,1.2,377,180.0,49.1014,8.0034,1977-06-01,2002-12-01,"Bergzabern, Bad",POINT (8.0034 49.1014)


In [15]:
station_date_ranges = wea_geo_after_1994.groupby('STATIONS_ID')['MESS_DATUM'].agg(['min', 'max'])
station_date_ranges = station_date_ranges.sort_values(by='max').reset_index()

print(f"Unique Stations: {geography_metadata.Stationsname.nunique()}")
station_date_ranges.head(3)


Unique Stations: 15


Unnamed: 0,STATIONS_ID,min,max
0,4763,1994-01-01,1998-06-30
1,3927,1994-01-01,2002-03-01
2,377,1994-01-01,2002-12-01


# Visualisierung

In [16]:
wea_geo_2010_now = wea_geo[wea_geo['MESS_DATUM'] >= '1994-01-01']

# https://stackoverflow.com/questions/65411519/typeerror-object-of-type-natype-is-not-json-serializable
# Use this fix so plotly can display the graph
wea_geo_2010_now = wea_geo_2010_now.fillna(np.nan).replace([np.nan], [None])
px.line(wea_geo_2010_now, x='MESS_DATUM', y='TMK', title='Lufttemperatur über die Zeit', color='STATIONS_ID')

  sf: grouped.get_group(s if len(s) > 1 else s[0])
  v = v.dt.to_pydatetime()


In [17]:
year_2018_data = wea_geo[wea_geo['MESS_DATUM'].dt.year == 2018]
fig = px.line(year_2018_data, y='TMK',x= 'MESS_DATUM', title='Histogramm der maximalen Temperatur im Jahr 2018',color ='STATIONS_ID')
fig.show()





The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [18]:
output = widgets.Output()

def plot_temperature_histogram(year):
    with output:
        clear_output(wait=True)  # Clear previous output
        year_data = wea_geo[wea_geo['MESS_DATUM'].dt.year == year]

        fig = px.histogram(year_data, x='TMK', color='STATIONS_ID', marginal='rug',
                           title=f'Temperaturverteilung im Jahr {year}',
                           labels={'TMK': 'Temperatur (°C)', 'count': 'Anzahl der Tage'},
                           opacity=0.7, nbins=20)
        
        fig.show()

years = sorted(wea_geo['MESS_DATUM'].dt.year.unique())

year_slider = widgets.IntSlider(min=min(years), max=max(years), step=1, value=min(years))

# Display the slider and the output widget
display(year_slider, output)

# Use interactive function to link the slider with the plotting function
widgets.interactive(plot_temperature_histogram, year=year_slider)

IntSlider(value=1858, max=2023, min=1858)

Output()

interactive(children=(IntSlider(value=1858, description='year', max=2023, min=1858), Output()), _dom_classes=(…

In [19]:
station_data = wea_geo[wea_geo['MESS_DATUM'].dt.year == 1994]

fig = px.line(station_data, x='MESS_DATUM', y='RSK',color='STATIONS_ID', title='Niederschlagsmenge über den Zeitverlauf')
fig.show()




The behavior of DatetimeProperties.to_pydatetime is deprecated, in a future version this will return a Series containing python datetime objects instead of an ndarray. To retain the old behavior, call `np.array` on the result



In [20]:
def plot_station_data(station_name):
    station_data = wea_geo_after_1994[wea_geo_after_1994['Stationsname'] == station_name]
    fig = px.line(station_data, x='MESS_DATUM', y=['TMK'], title=f'Wetterdaten für Station {station_name}')
    fig.add_scatter(x=station_data['MESS_DATUM'], y=station_data['RSK'], mode='lines', name='Niederschlag', yaxis='y2')
    fig.update_layout(yaxis=dict(title='Temperatur (°C)'), yaxis2=dict(title='Niederschlag (mm)', overlaying='y', side='right'))
    fig.show()

available_stations = wea_geo_after_1994['Stationsname'].unique()

interact(plot_station_data, station_name=widgets.Dropdown(options=available_stations, description='Station:'))

interactive(children=(Dropdown(description='Station:', options=('Bergzabern, Bad', 'Berlin-Tempelhof', 'Bochum…

<function __main__.plot_station_data(station_name)>

In [21]:
data = wea_geo.groupby('STATIONS_ID')

# Center the map around germany
map = folium.Map(location=[51.1657, 10.4515], zoom_start=6)

marker_cluster = MarkerCluster().add_to(map)

# Add a marker for every station
for index, row in data:
    lat = row['Geogr.Breite'].iloc[0]
    lon = row['Geogr.Laenge'].iloc[0]
    station_name = row['Stationsname'].iloc[0]
    folium.Marker(location=[lat, lon],popup=station_name).add_to(marker_cluster)

map