From 9e7d9bd4445a0a1a7dc72c8b6b5971bf662074ae Mon Sep 17 00:00:00 2001 From: Benjamin Gutzmann Date: Wed, 15 May 2024 22:53:41 +0200 Subject: [PATCH] Fix DWD Obs station list parsing again --- CHANGELOG.rst | 2 + .../{test_meta_index.py => test_metaindex.py} | 24 ++++++++++ .../provider/dwd/observation/metaindex.py | 48 +++++++++++-------- 3 files changed, 53 insertions(+), 21 deletions(-) rename tests/provider/dwd/observation/{test_meta_index.py => test_metaindex.py} (62%) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f7f9f4712..1ca08d617 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,6 +4,8 @@ Changelog Development *********** +- Fix DWD Obs station list parsing again + 0.83.0 (26.04.2024) ******************* diff --git a/tests/provider/dwd/observation/test_meta_index.py b/tests/provider/dwd/observation/test_metaindex.py similarity index 62% rename from tests/provider/dwd/observation/test_meta_index.py rename to tests/provider/dwd/observation/test_metaindex.py index 080fd77b8..2b46b5462 100644 --- a/tests/provider/dwd/observation/test_meta_index.py +++ b/tests/provider/dwd/observation/test_metaindex.py @@ -12,6 +12,7 @@ from wetterdienst.metadata.resolution import Resolution from wetterdienst.provider.dwd.observation import DwdObservationDataset from wetterdienst.provider.dwd.observation.metaindex import ( + _create_csv_line, create_meta_index_for_climate_observations, ) @@ -59,3 +60,26 @@ def test_meta_index_1mph_creation(default_settings): "Nordrhein-Westfalen", ) ) + + +def test_create_csv_line(): + assert ( + _create_csv_line("00001 19370101 19860630 478 47.8413 8.8493 Aach Baden-Württemberg".split()) + == "00001,19370101,19860630,478,47.8413,8.8493,Aach,Baden-Württemberg" + ) + assert ( + _create_csv_line("00126 19791101 20101130 330 49.5447 10.2213 Uffenheim (Schulstr.) 
Bayern".split()) + == "00126,19791101,20101130,330,49.5447,10.2213,Uffenheim (Schulstr.),Bayern" + ) + assert ( + _create_csv_line("00102 19980101 20240514 0 53.8633 8.1275 Leuchtturm Alte Weser Niedersachsen".split()) + == "00102,19980101,20240514,0,53.8633,8.1275,Leuchtturm Alte Weser,Niedersachsen" + ) + assert ( + _create_csv_line("00197 19900801 20240514 365 51.3219 9.0558 Arolsen-Volkhardinghausen, Bad Hessen".split()) + == """00197,19900801,20240514,365,51.3219,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen""" + ) + assert ( + _create_csv_line("01332 19660701 20240514 471 48.4832 12.7241 Falkenberg,Kr.Rottal-Inn Bayern".split()) + == """01332,19660701,20240514,471,48.4832,12.7241,"Falkenberg,Kr.Rottal-Inn",Bayern""" + ) diff --git a/wetterdienst/provider/dwd/observation/metaindex.py b/wetterdienst/provider/dwd/observation/metaindex.py index c8a2af30d..581fd2a6c 100644 --- a/wetterdienst/provider/dwd/observation/metaindex.py +++ b/wetterdienst/provider/dwd/observation/metaindex.py @@ -23,7 +23,6 @@ ) from wetterdienst.util.cache import CacheExpiry from wetterdienst.util.network import download_file, list_remote_files_fsspec -from wetterdienst.util.polars_util import read_fwf_from_df if TYPE_CHECKING: from wetterdienst.settings import Settings @@ -31,14 +30,14 @@ log = logging.getLogger(__name__) DWD_COLUMN_NAMES_MAPPING = { - "column_0": Columns.STATION_ID.value, - "column_1": Columns.START_DATE.value, - "column_2": Columns.END_DATE.value, - "column_3": Columns.HEIGHT.value, - "column_4": Columns.LATITUDE.value, - "column_5": Columns.LONGITUDE.value, - "column_6": Columns.NAME.value, - "column_7": Columns.STATE.value, + "column_1": Columns.STATION_ID.value, + "column_2": Columns.START_DATE.value, + "column_3": Columns.END_DATE.value, + "column_4": Columns.HEIGHT.value, + "column_5": Columns.LATITUDE.value, + "column_6": Columns.LONGITUDE.value, + "column_7": Columns.NAME.value, + "column_8": Columns.STATE.value, } METADATA_COLUMNS = [ @@ -190,21 +189,28 @@ 
def _read_meta_df(file: BytesIO) -> pl.LazyFrame: # Skip first line if it contains a header lines = lines[1:] lines = [line.decode("latin-1") for line in lines] - df = pl.DataFrame(lines) - column_specs = ( - (0, 4), - (21, 29), - (30, 38), - (40, 53), - (55, 65), - (67, 75), - (76, 156), - (157, 200), - ) - df = read_fwf_from_df(df, column_specs) + lines = [_create_csv_line(line.split()) for line in lines] + text = "\n".join(lines) + df = pl.read_csv(StringIO(text), has_header=False, infer_schema_length=0) return df.rename(mapping=lambda col: DWD_COLUMN_NAMES_MAPPING.get(col, col)).lazy() +def _create_csv_line(columns: list[str]) -> str: + """Build one CSV line from the whitespace-split columns of a station list row. + A row is expected to have 8 columns; if it has more, the excess tokens belong to the station name and are joined + back into it so that exactly 8 columns remain. A station name containing a comma is wrapped in double quotes. + """ + num_columns = len(columns) + if num_columns > 8: + excess_columns = num_columns - 8 + station_name = " ".join(columns[-excess_columns - 2 : -1]) + columns = columns[: -excess_columns - 2] + [station_name] + columns[-1:] + station_name = columns[-2] + if "," in station_name: + columns[-2] = f'"{station_name}"' + return ",".join(columns) + + def _create_meta_index_for_subdaily_extreme_wind(period: Period, settings: Settings) -> pl.LazyFrame: """Create metadata DataFrame for subdaily wind extreme