Fix DWD Obs station list parsing again
gutzbenj committed May 15, 2024
1 parent 9bdecee · commit 9e7d9bd
Showing 3 changed files with 53 additions and 21 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -4,6 +4,8 @@ Changelog
Development
***********

- Fix DWD Obs station list parsing again

0.83.0 (26.04.2024)
*******************

24 changes: 24 additions & 0 deletions
@@ -12,6 +12,7 @@
from wetterdienst.metadata.resolution import Resolution
from wetterdienst.provider.dwd.observation import DwdObservationDataset
from wetterdienst.provider.dwd.observation.metaindex import (
    _create_csv_line,
    create_meta_index_for_climate_observations,
)

@@ -59,3 +60,26 @@ def test_meta_index_1mph_creation(default_settings):
"Nordrhein-Westfalen",
)
)


def test_create_csv_line():
    assert (
        _create_csv_line("00001 19370101 19860630 478 47.8413 8.8493 Aach Baden-Württemberg".split())
        == "00001,19370101,19860630,478,47.8413,8.8493,Aach,Baden-Württemberg"
    )
    assert (
        _create_csv_line("00126 19791101 20101130 330 49.5447 10.2213 Uffenheim (Schulstr.) Bayern".split())
        == "00126,19791101,20101130,330,49.5447,10.2213,Uffenheim (Schulstr.),Bayern"
    )
    assert (
        _create_csv_line("00102 19980101 20240514 0 53.8633 8.1275 Leuchtturm Alte Weser Niedersachsen".split())
        == "00102,19980101,20240514,0,53.8633,8.1275,Leuchtturm Alte Weser,Niedersachsen"
    )
    assert (
        _create_csv_line("00197 19900801 20240514 365 51.3219 9.0558 Arolsen-Volkhardinghausen, Bad Hessen".split())
        == """00197,19900801,20240514,365,51.3219,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen"""
    )
    assert (
        _create_csv_line("01332 19660701 20240514 471 48.4832 12.7241 Falkenberg,Kr.Rottal-Inn Bayern".split())
        == """01332,19660701,20240514,471,48.4832,12.7241,"Falkenberg,Kr.Rottal-Inn",Bayern"""
    )
48 changes: 27 additions & 21 deletions wetterdienst/provider/dwd/observation/metaindex.py
@@ -23,22 +23,21 @@
)
from wetterdienst.util.cache import CacheExpiry
from wetterdienst.util.network import download_file, list_remote_files_fsspec
from wetterdienst.util.polars_util import read_fwf_from_df

if TYPE_CHECKING:
from wetterdienst.settings import Settings

log = logging.getLogger(__name__)

DWD_COLUMN_NAMES_MAPPING = {
"column_0": Columns.STATION_ID.value,
"column_1": Columns.START_DATE.value,
"column_2": Columns.END_DATE.value,
"column_3": Columns.HEIGHT.value,
"column_4": Columns.LATITUDE.value,
"column_5": Columns.LONGITUDE.value,
"column_6": Columns.NAME.value,
"column_7": Columns.STATE.value,
"column_1": Columns.STATION_ID.value,
"column_2": Columns.START_DATE.value,
"column_3": Columns.END_DATE.value,
"column_4": Columns.HEIGHT.value,
"column_5": Columns.LATITUDE.value,
"column_6": Columns.LONGITUDE.value,
"column_7": Columns.NAME.value,
"column_8": Columns.STATE.value,
}

METADATA_COLUMNS = [
@@ -190,21 +189,28 @@ def _read_meta_df(file: BytesIO) -> pl.LazyFrame:
    # Skip first line if it contains a header
    lines = lines[1:]
    lines = [line.decode("latin-1") for line in lines]
    df = pl.DataFrame(lines)
    column_specs = (
        (0, 4),
        (21, 29),
        (30, 38),
        (40, 53),
        (55, 65),
        (67, 75),
        (76, 156),
        (157, 200),
    )
    df = read_fwf_from_df(df, column_specs)
    lines = [_create_csv_line(line.split()) for line in lines]
    text = "\n".join(lines)
    df = pl.read_csv(StringIO(text), has_header=False, infer_schema_length=0)
    return df.rename(mapping=lambda col: DWD_COLUMN_NAMES_MAPPING.get(col, col)).lazy()


def _create_csv_line(columns: list[str]) -> str:
"""Each column is typically separated by a whitespace and has 7 columns.
If it has more than 7 columns, the columns from second last column and previous columns are joined together so that
there's only 7 columns in the line.
"""
    num_columns = len(columns)
    if num_columns > 8:
        excess_columns = num_columns - 8
        station_name = " ".join(columns[-excess_columns - 2 : -1])
        columns = columns[: -excess_columns - 2] + [station_name] + columns[-1:]
    station_name = columns[-2]
    if "," in station_name:
        columns[-2] = f'"{station_name}"'
    return ",".join(columns)


def _create_meta_index_for_subdaily_extreme_wind(period: Period, settings: Settings) -> pl.LazyFrame:
"""Create metadata DataFrame for subdaily wind extreme
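For illustration, a minimal sketch (not part of the commit) of how the new parsing path behaves, using one of the station rows from the test cases above; the polars column naming shown for has_header=False (column_1 .. column_8) is what the updated DWD_COLUMN_NAMES_MAPPING keys refer to:

from io import StringIO

import polars as pl

from wetterdienst.provider.dwd.observation.metaindex import _create_csv_line

# A raw DWD station list row: whitespace separated, with a station name that
# contains both spaces and a comma (taken from the test cases above).
raw = "00197 19900801 20240514 365 51.3219 9.0558 Arolsen-Volkhardinghausen, Bad Hessen"

# Splitting yields 9 tokens; the surplus token is merged back into the name
# column, which is then quoted because it contains a comma.
line = _create_csv_line(raw.split())
print(line)
# 00197,19900801,20240514,365,51.3219,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen

# Read back as strings, as _read_meta_df does for the full station list.
# With has_header=False polars names the columns column_1 .. column_8,
# matching the keys of the updated DWD_COLUMN_NAMES_MAPPING.
df = pl.read_csv(StringIO(line), has_header=False, infer_schema_length=0)
print(df.columns)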
