<a href="https://colab.research.google.com/github/esb-index/Barka-AV/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import xarray as xr
# Open one extracted NetCDF file for a quick look. The result is not assigned,
# so it only serves as the cell's displayed value (presumably the dataset
# summary in the notebook) and the handle is never closed explicitly.
xr.open_dataset("/content/era5_extracted/dania4/data_stream-oper_stepType-instant.nc")


In [1]:
# ==========================
# ERA5 "light-RAM" processing pipeline
# ==========================
# every .zip has already been extracted under /content/era5_extracted

!pip install xarray netCDF4 pandas numpy tqdm

import xarray as xr
import pandas as pd
import numpy as np
import glob, os
from tqdm import tqdm

# Root folder that is searched recursively for the extracted .nc files.
EXTRACT_DIR = "/content/era5_extracted"
# Folder that receives the per-region NetCDF and CSV outputs.
OUT_DIR = "/content/era5_processed_light"
# Create the output folder up front; an already existing folder is not an error.
os.makedirs(OUT_DIR, exist_ok=True)

def region_from_path(p):
    """Return the region label for a file path.

    The lowercased path is searched for each known region keyword as a plain
    substring, in a fixed priority order, and the first keyword found is
    returned. A path that contains none of the keywords is labelled "other".

    Args:
        p: File path (any string).

    Returns:
        One of "dania", "nemet", "uk", "tajvan", "usa" or "other".
    """
    lowered = p.lower()
    # Order matters: an earlier keyword wins when several occur in one path.
    for keyword in ("dania", "nemet", "uk", "tajvan", "usa"):
        if keyword in lowered:
            return keyword
    return "other"

# Recursively collect every .nc file below the extraction root, in sorted order.
nc_pattern = os.path.join(EXTRACT_DIR, "**", "*.nc")
all_files = sorted(glob.glob(nc_pattern, recursive=True))

# Bucket the paths by region label; each bucket keeps the sorted order.
region_files = {}
for nc_path in all_files:
    label = region_from_path(nc_path)
    if label not in region_files:
        region_files[label] = []
    region_files[label].append(nc_path)

# Report how many files each region received.
file_counts = {label: len(paths) for label, paths in region_files.items()}
print(file_counts)

# ---- small helper functions ----
def safe_open_dataset(path):
    """Open a NetCDF file as a dataset chunked along ``time``.

    When the file has a ``valid_time`` dimension and no ``time`` dimension,
    ``valid_time`` is renamed to ``time``. The result is then chunked in
    blocks of 200 steps along ``time``.

    Args:
        path: Path of the NetCDF file to open.

    Returns:
        The renamed (if needed) and chunked ``xarray.Dataset``. Calling
        ``close()`` on it releases the underlying file.

    Raises:
        Exception: Any error raised by xarray while opening, renaming or
            chunking is re-raised; an already opened file is closed first.
    """
    opened = xr.open_dataset(path)
    try:
        ds = opened
        if "valid_time" in ds.dims and "time" not in ds.dims:
            ds = ds.rename({"valid_time": "time"})
        ds = ds.chunk({"time": 200})
    except Exception:
        # The file was already opened: release it before propagating the
        # error so a skipped file does not leave its handle behind.
        opened.close()
        raise
    # rename()/chunk() return new Dataset objects that do not carry the file
    # closer of the dataset they were derived from, so close() on the result
    # would otherwise leave the file open. Forward it to the opened dataset.
    if ds is not opened:
        ds.set_close(opened.close)
    return ds

def winsorize(arr, low=1, high=99):
    """Clip *arr* to its own lower and upper percentile bounds.

    The two percentiles are computed over all values of *arr* at once (no
    axis is given), ignoring NaNs.

    Args:
        arr: Array-like of numbers.
        low: Percentile used as the clipping floor.
        high: Percentile used as the clipping ceiling.

    Returns:
        The result of ``np.clip`` applied to *arr* with those two bounds.
    """
    floor, ceiling = (np.nanpercentile(arr, q) for q in (low, high))
    return np.clip(arr, floor, ceiling)

# ---- main processing loop ----
# For every region: aggregate each file to daily values, winsorize them,
# concatenate the files along time, then write a NetCDF file and a CSV of
# spatially averaged time series.
for region, files in region_files.items():
    # Unclassified paths and empty buckets are not processed.
    if region == "other" or not files:
        continue
    print(f"\n=== Feldolgoz√°s: {region} ===")

    region_dailies = []
    for f in tqdm(files, desc=f"{region} f√°jlok"):
        try:
            ds = safe_open_dataset(f)
        except Exception as e:
            # Best effort: report the unreadable file and carry on.
            print(f"Hiba {f}: {e}")
            continue

        # try/finally guarantees the source dataset is closed even when the
        # aggregation or winsorizing of this file raises.
        try:
            # Daily aggregation chosen by variable-name prefix: sum for
            # tp/precip, max for u10/v10/wind/gust/10fg, mean for the rest.
            daily = xr.Dataset()
            for v in ds.data_vars:
                data = ds[v]
                # Variables without a time dimension cannot be resampled.
                if "time" not in data.dims:
                    continue
                if v.lower().startswith(("tp","precip")):
                    daily[v+"_sum"] = data.resample(time="1D").sum()
                elif v.lower().startswith(("u10","v10","wind","gust","10fg")):
                    daily[v+"_max"] = data.resample(time="1D").max()
                else:
                    daily[v+"_mean"] = data.resample(time="1D").mean()

            # Winsorize every daily variable to its 1st-99th percentile range.
            # The percentiles are taken over the whole per-file array, and
            # reading .values pulls the aggregated data into memory.
            for v in daily.data_vars:
                vals = daily[v].values
                daily[v].values = winsorize(vals)

            # Load whatever is still lazy (e.g. coordinates) while the source
            # is open, so the daily dataset does not depend on it afterwards.
            daily.load()
            region_dailies.append(daily)
        finally:
            ds.close()

    if not region_dailies:
        print(f"Nincs feldolgozhat√≥ adat: {region}")
        continue

    # Concatenate the per-file daily datasets along time, then sort by time.
    # NOTE(review): if the files of a region carry different variables or
    # overlapping dates, this presumably yields NaN-padded variables and
    # duplicate time steps - confirm the inputs are homogeneous per region.
    merged = xr.concat(region_dailies, dim="time")
    merged = merged.sortby("time")

    # Average over every non-time dimension to get one time series per variable.
    data_dict = {}
    for v in merged.data_vars:
        spatial_dims = [d for d in merged[v].dims if d != "time"]
        data_dict[v+"_mean"] = merged[v].mean(dim=spatial_dims).to_pandas()

    df = pd.DataFrame(data_dict)
    df.index.name = "date"

    # Save the gridded daily data as NetCDF and the averaged series as CSV.
    nc_out = os.path.join(OUT_DIR, f"{region}_daily_light.nc")
    csv_out = os.path.join(OUT_DIR, f"{region}_daily_light.csv")
    merged.to_netcdf(nc_out)
    df.to_csv(csv_out)
    print(f"{region}: k√©sz ‚Üí {csv_out}")

print("\nPipeline lefutott. Ellen≈ërizd az era5_processed_light mapp√°t.")


{'dania': 8, 'nemet': 8, 'tajvan': 12, 'uk': 8, 'usa': 13}

=== Feldolgoz√°s: dania ===


dania f√°jlok: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [04:55<00:00, 36.88s/it]


dania: k√©sz ‚Üí /content/era5_processed_light/dania_daily_light.csv

=== Feldolgoz√°s: nemet ===


nemet f√°jlok: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [05:12<00:00, 39.04s/it]


nemet: k√©sz ‚Üí /content/era5_processed_light/nemet_daily_light.csv

=== Feldolgoz√°s: tajvan ===


tajvan f√°jlok: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [04:54<00:00, 24.50s/it]


tajvan: k√©sz ‚Üí /content/era5_processed_light/tajvan_daily_light.csv

=== Feldolgoz√°s: uk ===


uk f√°jlok: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8/8 [05:05<00:00, 38.17s/it]


uk: k√©sz ‚Üí /content/era5_processed_light/uk_daily_light.csv

=== Feldolgoz√°s: usa ===


usa f√°jlok: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 13/13 [05:25<00:00, 25.00s/it]
  merged = xr.concat(region_dailies, dim="time")
  merged = xr.concat(region_dailies, dim="time")


usa: k√©sz ‚Üí /content/era5_processed_light/usa_daily_light.csv

Pipeline lefutott. Ellen≈ërizd az era5_processed_light mapp√°t.
