In [2]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
def get_url(
    date: str,
    source: {'IFS', 'IFS_old', 'IFS_fc', 'IFS_old_fc', 'GFS', 'GDAS', 'FNL', 'SYNOP'} = 'IFS',
    store: {'dodsC', 'ncss/grid', 'fileServer'} = 'dodsC',
    df_var_codes: pd.DataFrame = pd.DataFrame(),
    var_code: int = None,
    var_short: str = None,
    engine: {'sfc', 'pl'} = 'sfc',
    hour: {'00', '06', '12', '18'} = '00'
):
    """Build the THREDDS URL of one data file on the NCAR RDA server.

    Args:
        date: Requested date; anything ``pd.to_datetime`` accepts (e.g. YYYY-MM-DD).
        source: Data source ('IFS', 'IFS_old', 'IFS_fc', 'IFS_old_fc', 'GFS',
            'GDAS', 'FNL', 'SYNOP').
        store: THREDDS service path ('dodsC', 'ncss/grid', 'fileServer').
        df_var_codes: Lookup table with an 'id' column (variable code) and a
            'shortname' column. Only consulted for 'IFS' / 'IFS_fc' when just
            one of ``var_code`` / ``var_short`` is given.
        var_code: ECMWF variable code (used by 'IFS' and 'IFS_fc').
        var_short: Variable short name (used by 'IFS' and 'IFS_fc').
        engine: Domain ('sfc' for surface, 'pl' for pressure levels).
        hour: Two-digit hour appended to the file name for the sources whose
            files are split by hour ('00', '06', '12', '18').

    Returns:
        URL (str) of the requested file.

    Raises:
        ValueError: If ``source`` is unknown, if ``engine`` is not supported by
            the chosen source, or if the variable code / short name needed for
            'IFS' / 'IFS_fc' is missing or cannot be resolved.
    """
    root = f'https://thredds.rda.ucar.edu/thredds/{store}/files/g'
    old_dir = 'plev' if engine == 'pl' else 'surface'
    base_by_source = {
        'IFS': f'{root}/d113001/ec.oper.an.{engine}',
        'IFS_old': f'{root}/d113000/{old_dir}/od_oper_an_{engine}',
        'IFS_fc': f'{root}/d113001/ec.oper.fc.{engine}',
        'IFS_old_fc': f'{root}/d113000/supl/od_oper_fc_{engine}',
        'GFS': f'{root}/d084001',
        'GDAS': f'{root}/d083003',
        'FNL': f'{root}/d083002',
        'SYNOP': f'{root}/d336000',
    }
    if source not in base_by_source:
        raise ValueError(
            'Source must be one of IFS, IFS_old, IFS_fc, IFS_old_fc, GFS, GDAS, FNL, or SYNOP'
        )
    NCAR = base_by_source[source]

    # These sources branch on `engine` below; reject unsupported values up
    # front instead of falling through with no file name built.
    engine_choices = {'IFS': ('sfc', 'pl'), 'IFS_old': ('sfc', 'pl'), 'SYNOP': ('sfc',)}
    if source in engine_choices and engine not in engine_choices[source]:
        raise ValueError(
            f'engine must be one of {engine_choices[source]} for source {source!r}, got {engine!r}.'
        )

    # Both IFS products embed "128_<code>_<shortname>" in the file name, so
    # both need the code and the short name to be known.
    if source in ('IFS', 'IFS_fc'):
        has_code = var_code is not None
        has_short = bool(var_short)
        if not has_code and not has_short:
            raise ValueError(f'Must provide var_code or var_short for {source} source.')
        if not (has_code and has_short):
            # Only one half was given: resolve the other from the lookup table.
            if len(df_var_codes) == 0:
                raise ValueError(
                    'df_var_codes is required when only one of var_code / var_short is given.'
                )
            if has_code:
                found = df_var_codes.loc[df_var_codes['id'] == var_code, 'shortname']
            else:
                found = df_var_codes.loc[df_var_codes['shortname'] == var_short, 'id']
            if found.empty:
                raise ValueError('Variable not found in df_var_codes.')
            if has_code:
                var_short = found.iloc[0]
            else:
                var_code = found.iloc[0]

    req_date = pd.to_datetime(date)
    year = req_date.year
    yearmon = req_date.strftime('%Y%m')
    yeardotmon = req_date.strftime('%Y.%m')
    short_date = req_date.strftime('%Y%m%d')
    # Codes are left-padded with zeros to three characters (e.g. 34 -> "034").
    # For sources that do not use a variable this string is never interpolated.
    var_code = str(var_code).rjust(3, '0')
    # Wind components use the "uv" file-name infix, everything else "sc".
    prefix = 'uv' if var_short in ('u', 'v') else 'sc'

    if source == 'IFS':
        # Pressure-level analysis files carry the hour; surface files do not.
        stamp = short_date if engine == 'sfc' else f'{short_date}{hour}'
        file_link = f'{NCAR}/{yearmon}/ec.oper.an.{engine}.128_{var_code}_{var_short}.regn1280{prefix}.{stamp}.nc'
    elif source == 'IFS_old':
        stamp = short_date if engine == 'sfc' else f'{short_date}{hour}'
        file_link = f'{NCAR}_{stamp}.regn640.grb'
    elif source == 'IFS_fc':
        file_link = f'{NCAR}/{yearmon}/ec.oper.fc.{engine}.128_{var_code}_{var_short}.regn1280sc.{short_date}.nc'
    elif source == 'IFS_old_fc':
        file_link = f'{NCAR}_{short_date}{hour}.regn640.grb'
    elif source == 'GFS':
        file_link = f'{NCAR}/{year}/{short_date}/gfs.0p25.{short_date}{hour}.f000.grib2'
    elif source == 'GDAS':
        file_link = f'{NCAR}/{year}/{yearmon}/gdas1.fnl0p25.{short_date}{hour}.f00.grib2'
    elif source == 'FNL':
        # Files switch from the grib1 to the grib2 directory at 2007-12-06 12Z.
        switch_day = pd.to_datetime('2007-12-06')
        is_grib2 = req_date > switch_day or (req_date == switch_day and hour in ('12', '18'))
        if is_grib2:
            file_link = f'{NCAR}/grib2/{year}/{yeardotmon}/fnl_{short_date}_{hour}_00.grib2'
        else:
            file_link = f'{NCAR}/grib1/{year}/{yeardotmon}/fnl_{short_date}_{hour}_00.grib1'
    else:  # 'SYNOP' (engine already validated as 'sfc')
        file_link = f'{NCAR}/surface/{yearmon}/{short_date}/Surface_Synoptic_{short_date}_0000.nc'

    return file_link

In [4]:
# check variables in the dataset
#ds = xr.open_dataset(get_url('2025-01-01', source='SYNOP'))
#ds.variables


In [7]:
def process_synop_data(year, lat_min, lat_max, lon_min, lon_max, variable='Tmax'):
    """Download one year of daily SYNOP files and tabulate one variable per station.

    For every day of ``year`` the SYNOP file is opened through ``get_url``,
    stations outside the latitude/longitude box are discarded, and one value
    per station (first occurrence of each ``wmoId``) is kept. Days that fail
    for any reason are reported on stdout and skipped.

    Two CSV files are written under ``../data/processed/``: the date x station
    table and the station coordinates.

    Args:
        year: Year to process (int or str; it is interpolated into date strings).
        lat_min, lat_max: Latitude bounds of the box (inclusive).
        lon_min, lon_max: Longitude bounds of the box (inclusive).
        variable: Name of the dataset variable to extract.

    Returns:
        Tuple ``(df_pivot, coords_df)``: ``df_pivot`` is indexed by date with
        one column per ``wmoId``; ``coords_df`` is indexed by ``wmoId`` with
        'lat' and 'lon' columns (coordinates from the first day each station
        was seen).

    Raises:
        ValueError: If no day of the year could be read.
    """
    from pathlib import Path

    results = []
    station_coords = {}
    dates = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31', freq='D')

    for date in dates:
        day = date.strftime('%Y-%m-%d')
        try:
            print(day)
            # The context manager closes the remote dataset once the arrays
            # have been copied out, so handles do not pile up over the year.
            with xr.open_dataset(get_url(day, source='SYNOP')) as ds:
                df = pd.DataFrame({
                    'wmoId': ds['wmoId'].values,
                    'lat': ds['Lat'].values,
                    'lon': ds['Lon'].values,
                    variable: ds[variable].values
                })

            in_box = (
                (df['lat'] >= lat_min) & (df['lat'] <= lat_max) &
                (df['lon'] >= lon_min) & (df['lon'] <= lon_max)
            )
            # .copy() so the 'date' column below is added to an independent
            # frame rather than to a slice of the unfiltered one.
            df = df.loc[in_box].drop_duplicates(subset='wmoId').copy()

            # Column-wise iteration keeps each column's own dtype; iterating
            # rows would coerce the station id to the row's common dtype.
            for wmo, lat, lon in zip(df['wmoId'], df['lat'], df['lon']):
                station_coords.setdefault(wmo, {'lat': lat, 'lon': lon})

            df['date'] = date
            results.append(df[['date', 'wmoId', variable]])

        except Exception as e:
            # Best effort: a missing or unreadable day must not abort the year.
            print(f"Error en {day}: {e}")
            continue

    if not results:
        raise ValueError(f'No SYNOP data could be read for year {year}.')

    df_all = pd.concat(results)
    df_pivot = df_all.pivot(index='date', columns='wmoId', values=variable)

    # Make sure the output directory exists before writing the results.
    Path("../data/processed").mkdir(parents=True, exist_ok=True)

    df_pivot.to_csv(f"../data/processed/{variable.lower()}_diaria_por_estacion_{lat_min}_{lat_max}_{lon_min}_{lon_max}_{year}.csv")

    coords_df = pd.DataFrame.from_dict(station_coords, orient='index')
    coords_df.index.name = 'wmoId'
    coords_df.to_csv(f"../data/processed/{variable.lower()}_coordenadas_estaciones_{lat_min}_{lat_max}_{lon_min}_{lon_max}_{year}.csv")

    return df_pivot, coords_df


In [None]:
# Years to download, as strings ('2015' ... '2025').
list_years = [str(y) for y in range(2015, 2026)]

# Bounding box of the study region. Defined once at module level because the
# map cell further down reads these same names for its extent and ticks.
lat_min, lat_max = 5, 45
lon_min, lon_max = -95, -65

# Each call writes that year's CSV files; after the loop `df_pivot` and
# `coords_df` hold only the last year processed.
for year in list_years:
    print(f"Processing year: {year}")
    df_pivot, coords_df = process_synop_data(
        year,
        lat_min=lat_min,
        lat_max=lat_max,
        lon_min=lon_min,
        lon_max=lon_max,
        variable='PRECIP_amt24',
    )

Processing year: 2015
2015-01-01
2015-01-02
2015-01-03
2015-01-04
2015-01-05
2015-01-06
2015-01-07
2015-01-08
2015-01-09
2015-01-10
2015-01-11
2015-01-12
2015-01-13
2015-01-14
2015-01-15
2015-01-16
2015-01-17
2015-01-18
2015-01-19
2015-01-20
2015-01-21
2015-01-22
2015-01-23
2015-01-24
2015-01-25
2015-01-26
2015-01-27
2015-01-28
2015-01-29
2015-01-30
2015-01-31
2015-02-01
2015-02-02
2015-02-03
2015-02-04
2015-02-05
2015-02-06
2015-02-07
2015-02-08
2015-02-09
2015-02-10
2015-02-11
2015-02-12
2015-02-13
2015-02-14
2015-02-15
2015-02-16
2015-02-17
2015-02-18
2015-02-19
2015-02-20
2015-02-21
2015-02-22
2015-02-23
2015-02-24
2015-02-25
2015-02-26
2015-02-27
2015-02-28
2015-03-01
2015-03-02
2015-03-03
2015-03-04
2015-03-05
2015-03-06
2015-03-07
2015-03-08
2015-03-09
2015-03-10
2015-03-11
2015-03-12
2015-03-13
2015-03-14
2015-03-15
2015-03-16
2015-03-17
2015-03-18
2015-03-19
2015-03-20
2015-03-21
2015-03-22
2015-03-23
2015-03-24
2015-03-25
2015-03-26
2015-03-27
2015-03-28
2015-03-29
2015-03-30

In [None]:
import cartopy.crs as ccrs
import cartopy.feature as cfeature


# Map bounds: the same box that was passed to process_synop_data when the
# data were downloaded. Defined here so the cell does not rely on names that
# only exist as function arguments.
lat_min, lat_max = 5, 45
lon_min, lon_max = -95, -65

fig = plt.figure(figsize=(12, 8))
ax = fig.add_axes([0.05, 0.05, 0.9, 0.9], projection=ccrs.PlateCarree())

# Number of non-missing values per station (one column of df_pivot each),
# aligned onto the coordinates table through the shared wmoId index.
obs_count = df_pivot.count()
coords_df['obs_count'] = obs_count
coords_plot = coords_df.dropna(subset=['lat', 'lon', 'obs_count'])

sc = ax.scatter(
    coords_plot['lon'], coords_plot['lat'],
    s=coords_plot['obs_count'] / 2,   # marker size proportional to the count
    c=coords_plot['obs_count'],       # colour encodes the count as well
    cmap='viridis',
    alpha=0.7,
    marker='.',
    edgecolors='black',
    transform=ccrs.PlateCarree()
)

ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
ax.add_feature(cfeature.BORDERS, linewidth=0.5)
ax.set_extent([lon_min, lon_max, lat_min, lat_max], crs=ccrs.PlateCarree())

# Tick every 5 degrees, including the upper bound.
ax.set_xticks(np.arange(lon_min, lon_max + 1, 5), crs=ccrs.PlateCarree())
ax.set_yticks(np.arange(lat_min, lat_max + 1, 5), crs=ccrs.PlateCarree())
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")

fig.colorbar(sc, ax=ax, label='Cantidad de observaciones')
ax.set_title('Cantidad de observaciones', loc='left')
# bx = plt.axes([0.1, 0.2, 0.15, 0.4])
# df_unique.plot(x='lon', y='lat', kind='scatter', marker='.', color='blue', alpha=0.5, ax=bx)
# bx.set_xlim(area[1], area[3])
# bx.set_ylim(area[2], area[0])