In [None]:
import pandas as pd
import numpy as np
import xarray as xr
import os
import glob

def extract_era5_point_to_csv(region_name, lat, lon, creepmeter_id=None,
                               data_dir="./era5_data", out_dir="./era5_csv", overwrite=False):
    os.makedirs(out_dir, exist_ok=True)
    file_pattern = os.path.join(data_dir, f"era5_{region_name}_*.grib")
    file_list = sorted(glob.glob(file_pattern))

    if not file_list:
        print(f"Skipping {region_name}: no GRIB files found yet in {data_dir}")
        return None

    # Prepare output filename
    filename_base = creepmeter_id if creepmeter_id else f"{region_name}_lat{lat}_lon{lon}"
    csv_filename = f"{filename_base}_timeseries.csv"
    csv_path = os.path.join(out_dir, csv_filename)

    # Skip if already exists and overwrite is False
    if os.path.exists(csv_path) and not overwrite:
        print(f"Skipping {csv_filename}: file already exists.")
        return None

    all_data = []

    for f in file_list:
        print(f"Processing {os.path.basename(f)} for {filename_base}...")

        try:
            # Load total_precipitation with decode_timedelta set
            ds_tp = xr.open_dataset(
                f,
                engine="cfgrib",
                backend_kwargs={"filter_by_keys": {"shortName": "tp"}},
                decode_timedelta=True
            )
            ts_tp = ds_tp["tp"].sel(latitude=lat, longitude=lon, method="nearest").to_dataframe().reset_index()[["time", "tp"]]

            # Load surface_pressure with decode_timedelta set
            ds_sp = xr.open_dataset(
                f,
                engine="cfgrib",
                backend_kwargs={"filter_by_keys": {"shortName": "sp"}},
                decode_timedelta=True
            )
            ts_sp = ds_sp["sp"].sel(latitude=lat, longitude=lon, method="nearest").to_dataframe().reset_index()[["time", "sp"]]

            # Merge on time
            ts_merged = pd.merge(ts_tp, ts_sp, on="time")
            all_data.append(ts_merged)
        except Exception as e:
            print(f"Error reading {os.path.basename(f)}: {e}")
            continue

    if not all_data:
        print(f"No data could be extracted for {filename_base}")
        return None

    df_full = pd.concat(all_data).sort_values("time").reset_index(drop=True)
    df_full = df_full.rename(columns={"tp": "precipitation_m", "sp": "surface_pressure_pa"})
    df_full.to_csv(csv_path, index=False)
    print(f"Saved time series to {csv_path}")

    return df_full


# --- Main execution ---
Creepmeter_dataframe = pd.read_csv('../../Data/DATA_tidied/creepmeter_metadata_post_standardisation_sac_codes_updated.csv', index_col=0)
Creepmeter_dataframe.drop_duplicates('Creepmeter_abbrv', inplace=True)
Creepmeter_dataframe = Creepmeter_dataframe[~Creepmeter_dataframe['Creepmeter_abbrv'].isin(['XMBC', 'TABC'])].reset_index(drop=True)

regions = {
    'CAL': 'Hollister', 'CHAF': 'Pakistan', 'DSF': 'Israel', 'EAF': 'EAF',
    'HAY': 'Hayward', 'HOL': 'Hollister', 'NAF': 'NAF', 'PARK': 'Parkfield',
    'RID': 'Ridgecrest', 'SOCAL': 'SoCal', 'UTA': 'UTA'
}

Creepmeter_dataframe['Rain_region_file'] = Creepmeter_dataframe['Network'].map(regions)
rain_networks = np.unique(Creepmeter_dataframe['Rain_region_file'])

# Set this flag to control overwrite behaviour
OVERWRITE_EXISTING = False

for region in rain_networks:
    region_creepmeters = Creepmeter_dataframe[Creepmeter_dataframe['Rain_region_file'] == region]

    for _, row in region_creepmeters.iterrows():
        result = extract_era5_point_to_csv(
            region_name=region,
            lat=row['Latitude'],
            lon=row['Longitude'],
            creepmeter_id=row['Creepmeter_abbrv'],
            data_dir="./era5_data",
            out_dir="./era5_csv",
            overwrite=False
        )

        if result is None:
            print(f"Skipping creepmeter {row['Creepmeter_abbrv']} — no data or file already exists.")



Processing era5_EAF_2017_2020.grib for BAL1...
Processing era5_EAF_2021_2024.grib for BAL1...
Saved time series to ./era5_csv/BAL1_timeseries.csv
Processing era5_EAF_2017_2020.grib for GOK1...
Processing era5_EAF_2021_2024.grib for GOK1...
Saved time series to ./era5_csv/GOK1_timeseries.csv
Processing era5_EAF_2017_2020.grib for GOZ1...
Processing era5_EAF_2021_2024.grib for GOZ1...
Saved time series to ./era5_csv/GOZ1_timeseries.csv
Processing era5_EAF_2017_2020.grib for HAS1...
Processing era5_EAF_2021_2024.grib for HAS1...
Saved time series to ./era5_csv/HAS1_timeseries.csv
Processing era5_EAF_2017_2020.grib for HAT1...
Processing era5_EAF_2021_2024.grib for HAT1...
Saved time series to ./era5_csv/HAT1_timeseries.csv
Processing era5_EAF_2017_2020.grib for KAR1...
Processing era5_EAF_2021_2024.grib for KAR1...
Saved time series to ./era5_csv/KAR1_timeseries.csv
Processing era5_EAF_2017_2020.grib for KIR1...
Processing era5_EAF_2021_2024.grib for KIR1...
Saved time series to ./era5_cs

In [6]:
def extract_era5_point_to_csv(region_name, lat, lon, data_dir="./era5_data", out_dir="./era5_csv"):
    os.makedirs(out_dir, exist_ok=True)
    file_pattern = os.path.join(data_dir, f"era5_{region_name}_*.grib")
    file_list = sorted(glob.glob(file_pattern))

    if not file_list:
        raise FileNotFoundError(f"No GRIB files found for region '{region_name}' in {data_dir}")

    all_data = []

    for f in file_list:
        print(f"Processing {os.path.basename(f)}...")

        # Load total_precipitation
        ds_tp = xr.open_dataset(f, engine="cfgrib", backend_kwargs={"filter_by_keys": {"shortName": "tp"}})
        ts_tp = ds_tp["tp"].sel(latitude=lat, longitude=lon, method="nearest").to_dataframe().reset_index()[["time", "tp"]]

        # Load surface_pressure
        ds_sp = xr.open_dataset(f, engine="cfgrib", backend_kwargs={"filter_by_keys": {"shortName": "sp"}})
        ts_sp = ds_sp["sp"].sel(latitude=lat, longitude=lon, method="nearest").to_dataframe().reset_index()[["time", "sp"]]

        # Merge on time
        ts_merged = pd.merge(ts_tp, ts_sp, on="time")
        all_data.append(ts_merged)

    # Combine all chunks
    df_full = pd.concat(all_data).sort_values("time").reset_index(drop=True)
    df_full = df_full.rename(columns={"tp": "precipitation_m", "sp": "surface_pressure_pa"})

    # Save to CSV
    csv_filename = f"{region_name}_lat{lat}_lon{lon}_timeseries.csv".replace(".", "p")
    csv_path = os.path.join(out_dir, csv_filename)
    #df_full.to_csv(csv_path, index=False)
    print(f"Saved time series to {csv_path}")

    return df_full


In [7]:
Creepmeter_dataframe = pd.read_csv('../../Data/DATA_tidied/creepmeter_metadata_post_standardisation_sac_codes_updated.csv',index_col=0)
Creepmeter_dataframe.drop_duplicates('Creepmeter_abbrv',inplace=True)
Creepmeter_dataframe.drop(Creepmeter_dataframe[Creepmeter_dataframe['Creepmeter_abbrv']=='XMBC'].index,inplace=True)
Creepmeter_dataframe.drop(Creepmeter_dataframe[Creepmeter_dataframe['Creepmeter_abbrv']=='TABC'].index,inplace=True)
Creepmeter_dataframe.reset_index(inplace=True,drop=True)

regions = {'CAL':'Hollister', 'CHAF':'Pakistan', 'DSF':'Israel','EAF':'EAF', 'HAY':'Hayward', 'HOL':'Hollister', 'NAF':'NAF',
            'PARK':'Parkfield', 'RID':'Ridgecrest','SOCAL':'SoCal', 'UTA':'UTA'}

regions_rain = []
for i in range(len(Creepmeter_dataframe)):
    rain_file = regions[Creepmeter_dataframe['Network'].iloc[i]]
    regions_rain.append(rain_file)
Creepmeter_dataframe['Rain_region_file'] = regions_rain

In [8]:
rain_networks = np.unique(Creepmeter_dataframe['Rain_region_file'])

In [9]:
for i in range(len(rain_networks)):
    creepmeters_in_region = Creepmeter_dataframe.drop(Creepmeter_dataframe[Creepmeter_dataframe['Rain_region_file']!=rain_networks[i]].index)
    for j in range(len(creepmeters_in_region)):
        df = extract_era5_point_to_csv(region_name=rain_networks[i],lat=creepmeters_in_region['Latitude'].iloc[j],
                                       lon=creepmeters_in_region['Longitude'].iloc[j],data_dir="./era5_data",out_dir="./era5_csv", overwrite=True)

TypeError: extract_era5_point_to_csv() got an unexpected keyword argument 'overwrite'

In [3]:
df = extract_era5_point_to_csv(
    region_name="Hollister",
    lat=36.75,
    lon=-121.25,
    data_dir="./era5_data",
    out_dir="./era5_csv"
)

print(df.head())

Processing era5_Hollister_1980_1983.grib...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(
  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Processing era5_Hollister_1984_1987.grib...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(
  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Processing era5_Hollister_1992_1995.grib...


  vars, attrs, coord_names = xr.conventions.decode_cf_variables(
  vars, attrs, coord_names = xr.conventions.decode_cf_variables(


Saved time series to ./era5_csv/Hollister_lat36p75_lon-121p25_timeseriespcsv
                 time         step_tp  number_tp  surface_tp  latitude_tp  \
0 1980-01-01 06:00:00 0 days 01:00:00          0         0.0        36.75   
1 1980-01-01 06:00:00 0 days 12:00:00          0         0.0        36.75   
2 1980-01-01 06:00:00 0 days 11:00:00          0         0.0        36.75   
3 1980-01-01 06:00:00 0 days 10:00:00          0         0.0        36.75   
4 1980-01-01 06:00:00 0 days 08:00:00          0         0.0        36.75   

   longitude_tp       valid_time_tp        tp  number_sp step_sp  surface_sp  \
0       -121.25 1980-01-01 07:00:00  0.000003          0  0 days         0.0   
1       -121.25 1980-01-01 18:00:00  0.000004          0  0 days         0.0   
2       -121.25 1980-01-01 17:00:00  0.000002          0  0 days         0.0   
3       -121.25 1980-01-01 16:00:00  0.000005          0  0 days         0.0   
4       -121.25 1980-01-01 14:00:00  0.000000          0  0 

In [None]:

print(Creepmeter_dataframe)

   Network Creepmeter_abbrv File_code                   Start Time  \
0      EAF             BAL1   BAL1_1T  2023-03-12T10:47:55.000000Z   
1     PARK             C461  C461_10T  2004-09-29T02:20:00.000000Z   
2     PARK             C462   C462_1T  2021-08-19T19:10:11.000000Z   
3      UTA             CAN1  CAN1_30T  2012-06-12T15:33:00.000000Z   
4      NAF             CER1  CER1_30S  2019-10-09T06:46:00.000000Z   
..     ...              ...       ...                          ...   
73     EAF             YZW1   YZW1_1T  2023-03-22T10:59:37.000000Z   
74   SOCAL             COML  COML_30T          2010-08-15T00:30:00   
75   SOCAL             COLE  COLE_30T          2010-08-10T07:01:56   
76   SOCAL             COLW   COLW_5T          2010-04-10T00:10:00   
77   SOCAL             EASH   EASH_5T          2016-12-30T19:47:39   

                       End Time  Sampling rate, Hz Sampling rate, mins  \
0   2023-09-05T13:14:55.000000Z           0.016667                  1T   
1   2022-06

array(['CAL', 'CHAF', 'DSF', 'EAF', 'HAY', 'HOL', 'NAF', 'PARK', 'RID',
       'SOCAL', 'UTA'], dtype=object)

In [6]:
regions['HAY']

'Hayward'

EAF
Parkfield
Parkfield
UTA
NAF
Hayward
Pakistan
NAF
Hayward
Hayward
Hayward
Parkfield
Hayward
Hollister
Hollister
Israel
SoCal
NAF
Hollister
SoCal
Ridgecrest
Ridgecrest
EAF
EAF
Ridgecrest
NAF
EAF
EAF
NAF
SoCal
NAF
EAF
EAF
SoCal
EAF
EAF
EAF
NAF
SoCal
SoCal
Ridgecrest
Ridgecrest
Ridgecrest
SoCal
SoCal
NAF
SoCal
EAF
Hollister
SoCal
EAF
NAF
Parkfield
NAF
Parkfield
Parkfield
Hollister
Hollister
Hollister
Parkfield
Parkfield
Parkfield
Hollister
Parkfield
Parkfield
Parkfield
Parkfield
Hollister
Hollister
Hollister
Parkfield
Parkfield
EAF
EAF
SoCal
SoCal
SoCal
SoCal
