In [1]:
import logging
import sys
from pathlib import Path

import geopandas as gpd
import pandas as pd

# Add parent directory to Python path to access src modules
sys.path.append(str(Path("..").resolve()))
from src.interpolation import (
    create_interpolated_datasets,
    load_all_meteo_data,
)
from src.utils.logger import setup_logger

# Configure logging
# Install a root handler at INFO level so records emitted by imported modules are displayed.
logging.basicConfig(level=logging.INFO)
# Notebook-level logger; note that the name `logger` is re-bound by a later setup_logger(...) call.
logger = logging.getLogger(__name__)

In [2]:
# Read the station layer from the GeoPackage and key the rows by gauge identifier
meteo_stations = gpd.read_file("../data/Geometry/meteo_stations.gpkg").set_index("gauge_id")

### Parse the raw station `.txt` files into per-station CSV files

In [3]:
# Convert every raw station export (semicolon-separated, no header row) into a
# date-indexed CSV containing only the four measurement columns.
raw_dir = Path("../data/MeteoData/meteo_ru_2007_2022/")
parsed_dir = Path("../data/MeteoData/meteo_ru")
# Create the output directory up front so `to_csv` cannot fail on a fresh checkout.
parsed_dir.mkdir(parents=True, exist_ok=True)

# Files in the raw directory that are skipped rather than parsed as station data.
non_station_files = {"statlist332041", "fld332041"}
station_ids = [
    raw_file.stem for raw_file in raw_dir.glob("*.txt") if raw_file.stem not in non_station_files
]

# Names for the first 11 positional columns of a raw file.
# col5 / col7 / col9 are not used here (presumably quality flags — TODO confirm).
column_names = [
    "station_id",
    "year",
    "month",
    "day",
    "t_min",
    "col5",
    "t_mean",
    "col7",
    "t_max",
    "col9",
    "prcp",
]
value_columns = ["t_min", "t_mean", "t_max", "prcp"]
# Everything that is not a measurement; 11 and 12 are trailing columns left unnamed.
unused_columns = ["station_id", "year", "month", "day", "col5", "col7", "col9", 11, 12]

for station_id in station_ids:
    # Only the read itself can raise EmptyDataError, so keep the guarded region minimal.
    try:
        meteo_raw = pd.read_csv(raw_dir / f"{station_id}.txt", sep=";", header=None)
    except pd.errors.EmptyDataError:
        print(f"Empty data for station {station_id}, skipping.")
        continue

    meteo_raw = meteo_raw.rename(columns=dict(enumerate(column_names)))
    meteo_parsed = (
        # Assemble the timestamp directly from the year/month/day columns (vectorised),
        # instead of joining strings row by row and re-parsing them.
        meteo_raw.assign(date=pd.to_datetime(meteo_raw[["year", "month", "day"]]))
        .set_index("date")
        # errors="ignore": a file lacking the trailing columns 11/12 must not abort the loop.
        .drop(columns=unused_columns, errors="ignore")
    )
    # Non-numeric entries (e.g. blank fields) become NaN rather than raising.
    meteo_parsed[value_columns] = meteo_parsed[value_columns].apply(pd.to_numeric, errors="coerce")
    meteo_parsed.to_csv(parsed_dir / f"{station_id}.csv")

### Create `.nc` grids from the station geometry and station data

In [4]:
# Set up the file-backed logger, then read every station's CSV through the project loader
logger = setup_logger(log_file="../logs/meteo_grider.log", name="MeteoGridder")
data_dir = Path("..") / "data" / "MeteoData" / "meteo_ru"
meteo_data_dict = load_all_meteo_data(meteo_stations, data_dir)



In [5]:
# Interpolation grid: bounding box as (West, East, South, North) and cell size in degrees
extent = (20.0, 50.0, 40.0, 70.0)
resolution = 0.2

# Union of every timestamp present in at least one station's index
# (despite the variable names below, this is a union, not an intersection).
all_dates = set().union(*(station_frame.index for station_frame in meteo_data_dict.values()))
common_dates = sorted(all_dates)

if not common_dates:
    print("No common dates found in meteorological data")
    date_range = pd.DatetimeIndex([])
else:
    # Walk the full daily span and keep only the days for which some station has a record
    daily_span = pd.date_range(start=min(common_dates), end=max(common_dates), freq="D")
    date_range = pd.DatetimeIndex([day for day in daily_span if day in all_dates])

    print(f"Date range: {date_range[0]} to {date_range[-1]}")
    print(f"Total dates with data: {len(date_range)}")

Date range: 2007-01-01 00:00:00 to 2024-10-31 00:00:00
Total dates with data: 6514


In [6]:
def _without_time_units(dataset):
    """Return a copy of ``dataset`` whose ``time`` coordinate has no "units" attribute."""
    stripped = dataset.copy()
    stripped.time.attrs.pop("units", None)
    return stripped


# Grid the station data for every selected date and write both results to NetCDF
if len(date_range) == 0:
    print("No data available for interpolation")
else:
    print(f"Creating interpolated datasets for {len(date_range)} dates...")

    # Extrapolation switch: assigned here but not passed to the gridding call below
    use_extrapolation = True
    method = "cubic"

    stations, data_dict = meteo_stations, meteo_data_dict

    temp_dataset, prcp_dataset = create_interpolated_datasets(
        meteo_stations=stations,
        meteo_data_dict=data_dict,
        extent=extent,
        date_range=date_range,
        resolution=resolution,
        method=method,
    )

    print("Temperature dataset:")
    print(temp_dataset)
    print("\nPrecipitation dataset:")
    print(prcp_dataset)

    # Output location; file names carry the interpolation method
    output_dir = "../data/MeteoData/parsed_meteo/meteo_ru_single"
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    temp_output_path = f"{output_dir}/interpolated_temperature_{method}.nc"
    prcp_output_path = f"{output_dir}/interpolated_precipitation_{method}.nc"

    # Strip any "units" attribute from the time coordinate before writing
    # (presumably so it cannot clash with the time encoding on save — TODO confirm)
    temp_dataset_clean = _without_time_units(temp_dataset)
    prcp_dataset_clean = _without_time_units(prcp_dataset)

    temp_dataset_clean.to_netcdf(temp_output_path)
    prcp_dataset_clean.to_netcdf(prcp_output_path)

    print("\nDatasets saved to:")
    print(f"Temperature: {temp_output_path}")
    print(f"Precipitation: {prcp_output_path}")

INFO:src.interpolation.gridding:Starting interpolation for 6514 dates using cubic method
INFO:src.interpolation.gridding:Processing date 1/6514: 2007-01-01 00:00:00


Creating interpolated datasets for 6514 dates...


INFO:src.interpolation.gridding:Processing date 101/6514: 2007-04-11 00:00:00
INFO:src.interpolation.gridding:Processing date 201/6514: 2007-07-20 00:00:00
INFO:src.interpolation.gridding:Processing date 301/6514: 2007-10-28 00:00:00
INFO:src.interpolation.gridding:Processing date 401/6514: 2008-02-05 00:00:00
INFO:src.interpolation.gridding:Processing date 501/6514: 2008-05-15 00:00:00
INFO:src.interpolation.gridding:Processing date 601/6514: 2008-08-23 00:00:00
INFO:src.interpolation.gridding:Processing date 701/6514: 2008-12-01 00:00:00
INFO:src.interpolation.gridding:Processing date 801/6514: 2009-03-11 00:00:00
INFO:src.interpolation.gridding:Processing date 901/6514: 2009-06-19 00:00:00
INFO:src.interpolation.gridding:Processing date 1001/6514: 2009-09-27 00:00:00
INFO:src.interpolation.gridding:Processing date 1101/6514: 2010-01-05 00:00:00
INFO:src.interpolation.gridding:Processing date 1201/6514: 2010-04-15 00:00:00
INFO:src.interpolation.gridding:Processing date 1301/6514: 20

Temperature dataset:
<xarray.Dataset> Size: 4GB
Dimensions:  (time: 6514, lat: 151, lon: 151)
Coordinates:
  * time     (time) datetime64[ns] 52kB 2007-01-01 2007-01-02 ... 2024-10-31
  * lat      (lat) float64 1kB 40.0 40.2 40.4 40.6 40.8 ... 69.4 69.6 69.8 70.0
  * lon      (lon) float64 1kB 20.0 20.2 20.4 20.6 20.8 ... 49.4 49.6 49.8 50.0
Data variables:
    t_min    (time, lat, lon) float64 1GB nan nan nan ... -1.076 -1.008 -0.9418
    t_mean   (time, lat, lon) float64 1GB nan nan nan ... 0.09253 0.1231 0.1545
    t_max    (time, lat, lon) float64 1GB nan nan nan ... 0.8735 0.9231 0.9773
Attributes:
    title:                 Interpolated Temperature Data
    extent:                West=20.0, East=50.0, South=40.0, North=70.0
    resolution:            0.2 degrees
    source:                Meteorological station data
    interpolation_method:  cubic

Precipitation dataset:
<xarray.Dataset> Size: 1GB
Dimensions:  (time: 6514, lat: 151, lon: 151)
Coordinates:
  * time     (time) dat

In [7]:
import xarray as xr

In [12]:
# Split the full-period interpolated grids into one NetCDF file per calendar year.
# Inputs are read from the same relative directory the interpolation cell writes to,
# rather than from a machine-specific absolute path.
single_dir = Path("../data/MeteoData/parsed_meteo/meteo_ru_single")

meteo_nc = Path("../data/MeteoData/parsed_meteo/meteo_ru_nc_02")
temp_path = meteo_nc / "2m_temperature"
prcp_path = meteo_nc / "total_precipitation"
temp_path.mkdir(parents=True, exist_ok=True)
prcp_path.mkdir(parents=True, exist_ok=True)

# Keep both source files open for the whole loop: the datasets must not be used
# after their `with` block has closed them.
with xr.open_dataset(single_dir / "interpolated_precipitation_cubic.nc") as prcp_file:
    with xr.open_dataset(single_dir / "interpolated_temperature_cubic.nc") as temp_file:
        # NOTE(review): the grids printed above extend to 2024-10-31, so 2023 and 2024
        # are not exported by this range — confirm that this is intended.
        for year in range(2007, 2023):
            year_window = slice(f"{year}-01-01", f"{year}-12-31")
            # `.sel` returns a new dataset; write that subset, not the full source dataset.
            temp_file.sel(time=year_window).to_netcdf(temp_path / f"{year}.nc")
            prcp_file.sel(time=year_window).to_netcdf(prcp_path / f"{year}.nc")
