Fix DWD Obs station list parsing again
gutzbenj committed May 15, 2024
1 parent 9bdecee · commit 9e7d9bd
Showing 3 changed files with 53 additions and 21 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -4,6 +4,8 @@ Changelog
Development
***********

- Fix DWD Obs station list parsing again

0.83.0 (26.04.2024)
*******************

24 changes: 24 additions & 0 deletions
@@ -12,6 +12,7 @@
from wetterdienst.metadata.resolution import Resolution
from wetterdienst.provider.dwd.observation import DwdObservationDataset
from wetterdienst.provider.dwd.observation.metaindex import (
    _create_csv_line,
    create_meta_index_for_climate_observations,
)

@@ -59,3 +60,26 @@ def test_meta_index_1mph_creation(default_settings):
"Nordrhein-Westfalen",
)
)


def test_create_csv_line():
    assert (
        _create_csv_line("00001 19370101 19860630 478 47.8413 8.8493 Aach Baden-Württemberg".split())
        == "00001,19370101,19860630,478,47.8413,8.8493,Aach,Baden-Württemberg"
    )
    assert (
        _create_csv_line("00126 19791101 20101130 330 49.5447 10.2213 Uffenheim (Schulstr.) Bayern".split())
        == "00126,19791101,20101130,330,49.5447,10.2213,Uffenheim (Schulstr.),Bayern"
    )
    assert (
        _create_csv_line("00102 19980101 20240514 0 53.8633 8.1275 Leuchtturm Alte Weser Niedersachsen".split())
        == "00102,19980101,20240514,0,53.8633,8.1275,Leuchtturm Alte Weser,Niedersachsen"
    )
    assert (
        _create_csv_line("00197 19900801 20240514 365 51.3219 9.0558 Arolsen-Volkhardinghausen, Bad Hessen".split())
        == """00197,19900801,20240514,365,51.3219,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen"""
    )
    assert (
        _create_csv_line("01332 19660701 20240514 471 48.4832 12.7241 Falkenberg,Kr.Rottal-Inn Bayern".split())
        == """01332,19660701,20240514,471,48.4832,12.7241,"Falkenberg,Kr.Rottal-Inn",Bayern"""
    )
48 changes: 27 additions & 21 deletions wetterdienst/provider/dwd/observation/metaindex.py
@@ -23,22 +23,21 @@
)
from wetterdienst.util.cache import CacheExpiry
from wetterdienst.util.network import download_file, list_remote_files_fsspec
from wetterdienst.util.polars_util import read_fwf_from_df

if TYPE_CHECKING:
from wetterdienst.settings import Settings

log = logging.getLogger(__name__)

DWD_COLUMN_NAMES_MAPPING = {
"column_0": Columns.STATION_ID.value,
"column_1": Columns.START_DATE.value,
"column_2": Columns.END_DATE.value,
"column_3": Columns.HEIGHT.value,
"column_4": Columns.LATITUDE.value,
"column_5": Columns.LONGITUDE.value,
"column_6": Columns.NAME.value,
"column_7": Columns.STATE.value,
"column_1": Columns.STATION_ID.value,
"column_2": Columns.START_DATE.value,
"column_3": Columns.END_DATE.value,
"column_4": Columns.HEIGHT.value,
"column_5": Columns.LATITUDE.value,
"column_6": Columns.LONGITUDE.value,
"column_7": Columns.NAME.value,
"column_8": Columns.STATE.value,
}

METADATA_COLUMNS = [
@@ -190,21 +189,28 @@ def _read_meta_df(file: BytesIO) -> pl.LazyFrame:
    # Skip first line if it contains a header
    lines = lines[1:]
    lines = [line.decode("latin-1") for line in lines]
    df = pl.DataFrame(lines)
    column_specs = (
        (0, 4),
        (21, 29),
        (30, 38),
        (40, 53),
        (55, 65),
        (67, 75),
        (76, 156),
        (157, 200),
    )
    df = read_fwf_from_df(df, column_specs)
    lines = [_create_csv_line(line.split()) for line in lines]
    text = "\n".join(lines)
    df = pl.read_csv(StringIO(text), has_header=False, infer_schema_length=0)
    return df.rename(mapping=lambda col: DWD_COLUMN_NAMES_MAPPING.get(col, col)).lazy()


def _create_csv_line(columns: list[str]) -> str:
"""Each column is typically separated by a whitespace and has 7 columns.
If it has more than 7 columns, the columns from second last column and previous columns are joined together so that
there's only 7 columns in the line.
"""
    num_columns = len(columns)
    if num_columns > 8:
        excess_columns = num_columns - 8
        station_name = " ".join(columns[-excess_columns - 2 : -1])
        columns = columns[: -excess_columns - 2] + [station_name] + columns[-1:]
    station_name = columns[-2]
    if "," in station_name:
        columns[-2] = f'"{station_name}"'
    return ",".join(columns)


def _create_meta_index_for_subdaily_extreme_wind(period: Period, settings: Settings) -> pl.LazyFrame:
"""Create metadata DataFrame for subdaily wind extreme
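For illustration, a minimal sketch (not part of the commit) of how the new parsing path behaves, using one of the station rows from the test cases above; the polars column naming shown for has_header=False (column_1 .. column_8) is what the updated DWD_COLUMN_NAMES_MAPPING keys refer to:

from io import StringIO

import polars as pl

from wetterdienst.provider.dwd.observation.metaindex import _create_csv_line

# A raw DWD station list row: whitespace separated, with a station name that
# contains both spaces and a comma (taken from the test cases above).
raw = "00197 19900801 20240514 365 51.3219 9.0558 Arolsen-Volkhardinghausen, Bad Hessen"

# Splitting yields 9 tokens; the surplus token is merged back into the name
# column, which is then quoted because it contains a comma.
line = _create_csv_line(raw.split())
print(line)
# 00197,19900801,20240514,365,51.3219,9.0558,"Arolsen-Volkhardinghausen, Bad",Hessen

# Read back as strings, as _read_meta_df does for the full station list.
# With has_header=False polars names the columns column_1 .. column_8,
# matching the keys of the updated DWD_COLUMN_NAMES_MAPPING.
df = pl.read_csv(StringIO(line), has_header=False, infer_schema_length=0)
print(df.columns)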
