# Script 1 - Access MUR and ERA5 data

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt 
import matplotlib.patches as patches
import datetime as dt
import cartopy.feature as cfeature
import cartopy.crs as ccrs
import math
import warnings 
warnings.simplefilter('ignore') 
import datetime as dt
import datetime
import geopandas as gpd
import rioxarray
from shapely.geometry import mapping
import glob

from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
from calendar import month_abbr
from geopy import distance

from harmony import BBox, Client, Collection, Request, Environment, LinkType 

## Read in and make a map of the masks

In [None]:
# Load the four EBUS region masks. Each dataset handle is closed immediately
# after opening, exactly as the cell always did.
def _read_mask(nc_path):
    """Open the mask NetCDF at nc_path, close its handle and return the dataset."""
    mask_ds = xr.open_dataset(nc_path)
    mask_ds.close()
    return mask_ds


cali = _read_mask('../data/masks/mask_California_MURSST.nc')      # California
beng = _read_mask('../data/masks/mask_Benguela_MURSST.nc')        # Benguela
hum = _read_mask('../data/masks/mask_Humboldt_MURSST.nc')         # Humboldt
iber = _read_mask('../data/masks/mask_Iberian-Canary_MURSST.nc')  # Iberian-Canary

In [None]:
# Build the extended Iberian-Canary mask: the original mask plus six copies
# whose longitudes are shifted west by 1..6 degrees, averaged wherever the
# copies land on the same (lat, lon) cell.
# NOTE: the `cali_df` / `new_cali` names are left over from the California
# version of this cell; the data here is the Iberian-Canary mask (`iber`).
cali_df = iber.to_dataframe().reset_index().dropna()

shifted_frames = []
for shift_deg in range(1, 7):
    shifted = iber.copy()
    shifted['lon'] = shifted['lon'] - shift_deg
    shifted_frames.append(shifted.to_dataframe().reset_index().dropna())

# Stack the original and shifted tables, then collapse duplicate cells.
new_cali = pd.concat([cali_df, *shifted_frames]).set_index(['lat', 'lon'])
new_cali = new_cali.groupby(level=new_cali.index.names).mean().reset_index()

# Back to a gridded dataset: save it, then plot it as a visual check.
extended_mask = new_cali.set_index(['lat', 'lon']).to_xarray()
extended_mask.to_netcdf('../data/masks/mask_Iberian_MURSST_extendedSubregion.nc')
extended_mask.analysed_sst.plot()

## Download ZARR Data for Extended Subregion

In [None]:
###Download mur data
# Open the public MUR SST Zarr store on AWS.
mur = xr.open_zarr('https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1')
# Average to monthly means, with each bin labelled by its month start ('1MS').
mur = mur.resample(time = '1MS').mean()
# Drop every time step that falls in 2020 -- presumably because the Zarr store
# stops part-way through January 2020 and 2020 onward is fetched through
# Harmony further down. TODO confirm.
mur = mur.sel(time = ~mur.time.dt.year.isin([2020]))

# Read the Iberian EBUS shapefile whose geometry is used for clipping.
shp = gpd.read_file('../data/masks/EBUS_Shapefiles/Iberian/Iberian_Mask_EBUS_shp.shp')

# Tell rioxarray which dimensions are x/y and which CRS the grid uses (EPSG:4326).
mur.rio.set_spatial_dims(x_dim = "lon", y_dim = "lat", inplace = True)
mur.rio.write_crs("epsg:4326", inplace = True)

# Clip the data to the shapefile geometry (drop = True is passed to rio.clip).
mur_clipped = mur.rio.clip(shp.geometry.apply(mapping), shp.crs, drop = True)

# Save only the analysed_sst variable of the clipped data.
# mur_clipped.to_netcdf('/Volumes/GoogleDrive/Shared drives/Commons/STAFF_folders/Marisol/Gammon-Sol/MURR/Humboldt/Extended_Subregion/Hum_ExtendedSubregions_ZARR.nc')
mur_clipped.analysed_sst.to_netcdf('../data/Iber_ExtendedSubregions_ZARR.nc')


In [None]:
# Re-open the clipped Zarr extract; the handle is closed right away.
mur = xr.open_dataset('../data/Iber_ExtendedSubregions_ZARR.nc')
mur.close()

# Keep only positions where analysed_sst is not NaN.
has_sst = ~np.isnan(mur.analysed_sst)
mur = mur.where(has_sst, drop = True)

# Remove the spatial_ref entry, then convert Kelvin to Celsius.
mur = mur.drop('spatial_ref')
mur['analysed_sst'] = mur['analysed_sst'] - 273.15

# Same on-disk encoding for every data variable.
encoding = {
    variable: {'dtype': 'float32', 'scale_factor': 0.1, '_FillValue': -9999}
    for variable in mur.data_vars
}

# Write the processed dataset with that encoding.
mur.to_netcdf('../data/Iber_ExtendedSubregions_ZARR_processed.nc', encoding = encoding)


## Create Harmony Job for MUR 

In [None]:
# Latitude / longitude bounds for the three subregions of each EBUS.
# Each row is a [lower, upper] pair; presumably row i of *_lats goes with
# row i of *_lons (confirm against the code that consumes these arrays).

# California
cali_lats = np.array([
    [28.5, 34.5],
    [34.5, 40.4],
    [40.4, 46],
])
cali_lons = np.array([
    [-130, -114],
    [-134.5, -120],
    [-134.5, -123],
])

# Humboldt
hum_lats = np.array([
    [-42, -28],
    [-28, -17],
    [-17, -10],
])
hum_lons = np.array([
    [-79, -69],
    [-76, -69],
    [-81, -72],
])

# Iberian-Canary
iber_lats = np.array([
    [15, 21.33],
    [21.33, 30],
    [37, 43.39],
])
iber_lons = np.array([
    [-21, -16],
    [-21, -9],
    [-14, -7],
])

# Benguela
beng_lats = np.array([
    [-34.8, -28.63],
    [-28.63, -22],
    [-22, -15],
])
beng_lons = np.array([
    [13, 20],
    [10, 17],
    [8, 15],
])


In [None]:
# Read the Iberian EBUS shapefile into a GeoDataFrame.
shp = gpd.read_file('../data/masks/EBUS_Shapefiles/Iberian/Iberian_Mask_EBUS_shp.shp')
# Display the overall bounding box of the geometries; the Harmony request
# below passes these four values to BBox in the same order.
shp.total_bounds

In [None]:
# Harmony client pointed at the production environment.
harmony_client = Client(env=Environment.PROD)

# Collection to subset.
collection = Collection(id='MUR-JPL-L4-GLOB-v4.1')

# Temporal window of the request (midnight on both days).
start_day = datetime.datetime(2020, 1, 1)
end_day = datetime.datetime(2022, 5, 31)

# Spatial window: the shapefile's total bounds, passed to BBox in the order
# geopandas returns them (minx, miny, maxx, maxy).
min_lon, min_lat, max_lon, max_lat = shp.total_bounds

request = Request(
    collection=collection,
    temporal={'start': start_day, 'stop': end_day},
    spatial=BBox(min_lon, min_lat, max_lon, max_lat),
    variables=['analysed_sst'],
)

# Displayed as the cell output: whether the request passes validation.
request.is_valid()

Job IDs:
* California: a05723f8-92d7-49d3-ac67-4256fa70b2fa
* Benguela: fd3df6f8-890c-4f4c-bc12-b32fcdac396f
* Humboldt: 2a5ceec4-dd89-4759-ae9d-b372e696507e
* Iberian-Canary: b2172a9b-8820-418a-be38-fb609308c334

In [None]:
# Print the request as an equivalent curl command.
print(harmony_client.request_as_curl(request))
# Submit the request and keep the returned job ID for the cells below.
job_id = harmony_client.submit(request)
print(f'Job ID: {job_id}') # The job ID can be shared; the following cells only need this ID to resume and download.
# Fetch the job's result JSON with the progress display enabled.
response = harmony_client.result_json(job_id, show_progress=True)

In [None]:
# Ask Harmony to resume the job (presumably needed when the job was paused -- TODO confirm).
harmony_client.resume(job_id)
#harmony_client.status(job_id) 
# Wait for processing of the job, showing progress.
harmony_client.wait_for_processing(job_id, show_progress=True)

In [None]:
# Directory that receives the downloaded files.
down_dir = '../data/data_download/'

# Start downloading every output of the job; overwrite=False is passed so
# existing files are presumably left in place (confirm in the harmony-py docs).
futures = harmony_client.download_all(job_id, directory=down_dir, overwrite=False)
# Resolve each future to obtain its result (the names collected in file_names).
file_names = [f.result() for f in futures]
# Display the names in sorted order.
sorted(file_names)

In [None]:
##Combine the ZARR and Cloud Data
# Region identifiers used to build the shapefile and output paths.
long_name = 'Iberian'
short_name = 'Iber'

# Open every Harmony download as one dataset, in sorted file order.
files = sorted(glob.glob('../data/data_download/*.nc4'))

mur_cloud = xr.open_mfdataset(files, engine = 'netcdf4')
mur_cloud.close()
print('opened all files')

# Monthly means, labelled by month start.
mur_cloud = mur_cloud.resample(time = '1MS').mean()

# Shapefile whose geometry is used for clipping.
shp = gpd.read_file('../data/masks/EBUS_Shapefiles/'+long_name+'/'+long_name+'_Mask_EBUS_shp.shp')

# Tell rioxarray the spatial dims and CRS before clipping.
mur_cloud.rio.set_spatial_dims(x_dim = "lon", y_dim = "lat", inplace = True)
mur_cloud.rio.write_crs("epsg:4326", inplace = True)

# Clip the cloud data to the shapefile geometry.
mur_cloud = mur_cloud.rio.clip(shp.geometry.apply(mapping), shp.crs, drop = True)
print('clipped data')

# Remove the spatial reference entry.
mur_cloud = mur_cloud.drop('spatial_ref')

# Convert K to C.
mur_cloud['analysed_sst'] = mur_cloud['analysed_sst'] - 273.15
print('converted temperature')

# Same on-disk encoding for every data variable.
encoding = {
    variable: {'dtype': 'float32', 'scale_factor': 0.1, '_FillValue': -9999}
    for variable in mur_cloud.data_vars
}

# Save the clipped cloud data.
mur_cloud.to_netcdf('../data/'+short_name+'_ExtendedSubregions_Cloud.nc', encoding = encoding)
print('saved cloud file')

# Open the *processed* Zarr extract. The raw '_ExtendedSubregions_ZARR.nc'
# file is still in Kelvin and still carries 'spatial_ref', so merging it with
# the Celsius cloud data above would mix units in one variable. The processed
# file has already been converted to Celsius and had 'spatial_ref' removed.
mur_zarr = xr.open_dataset('../data/'+short_name+'_ExtendedSubregions_ZARR_processed.nc')
mur_zarr.close()

# Join the two records into one dataset.
mur = xr.merge([mur_zarr, mur_cloud])
print('merged zarr and cloud files')

# Save the merged record and display it.
mur.to_netcdf('../data/'+short_name+'_ExtendedSubregions_All.nc')
print('saved merged file')
mur

## Read in MUR data and filter it

In [None]:
# Build the store URL from the S3 bucket [mur-sst], the region [us-west-2]
# and the folder within the bucket [zarr-v1].
file_location = 'https://mur-sst.s3.us-west-2.amazonaws.com/zarr-v1'

ds_sst = xr.open_zarr(file_location,consolidated=True) # open the Zarr store with xarray, using its consolidated metadata
# Similar to open_dataset, but only the metadata is read at this point.

# Display the dataset summary.
ds_sst

In [None]:
# Open the MUR aggregation on the JPL THREDDS server (OPeNDAP URL).
# NOTE: this rebinds ds_sst, replacing the Zarr-backed dataset opened in the previous cell.
aggr_url = "https://thredds.jpl.nasa.gov/thredds/dodsC/OceanTemperature/MUR-JPL-L4-GLOB-v4.1.nc"
ds_sst = xr.open_dataset(aggr_url)
# Display the dataset summary.
ds_sst

#### Humboldt Data

##### THREDDS data

In [None]:
# Date window to extract ('YYYY-MM-DD' strings) and the lat/lon extent of the
# Humboldt mask.
dater = ['2022-06-01', '2022-06-30']
latr = [np.nanmin(hum.lat), np.nanmax(hum.lat)]
lonr = [np.nanmin(hum.lon), np.nanmax(hum.lon)]

# Cut analysed_sst down to that window and extent.
window = dict(time=slice(*dater), lat=slice(*latr), lon=slice(*lonr))
sst = ds_sst.analysed_sst.sel(**window)

# Kelvin -> Celsius, and record the new unit on the variable.
sst = sst - 273.15
sst.attrs['units'] = 'deg C'

sst.to_netcdf('../data/data_download/Humboldt/20220601-20220630-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc')

In [None]:
# Open every 2020 file saved for the Humboldt region as one dataset.
# (The old `.format(dir)` did nothing: the pattern has no `{}` placeholder.)
sst_20 = xr.open_mfdataset('../data/data_download/Humboldt/2020*.nc')
sst_20.close()

#remove all values that are for lakes
#sst_filtered = sst_20.where(sst_20.mask != 5, np.nan)

# Apply the Humboldt mask by multiplication.
sst_hum = sst_20*hum.analysed_sst

# Monthly means over time, skipping NaNs and keeping attributes.
sst = sst_hum.resample(time = 'M').mean(dim={'time'}, skipna=True, keep_attrs=True)

# Write next to the inputs. The previous path used the misspelt directory
# 'Humbolt', which does not match the 'Humboldt' directory read from above.
sst.to_netcdf('../data/data_download/Humboldt/hum_MURR_2020.nc')

##### ZARR data

In [None]:
%%time
year = input("Input year (03-21):")

# Calendar-year window as 'YYYY-MM-DD' strings; `year` is the two-digit
# suffix typed at the prompt.
dater = [f'20{year}-01-01', f'20{year}-12-31']

# Blank out every position whose `mask` value equals 5 (lakes, according to
# the original author's comment).
not_lake = ds_sst.mask != 5
sst_filtered = ds_sst.where(not_lake, np.nan)

# Apply the Humboldt mask by multiplication.
sst_hum = sst_filtered * hum.analysed_sst

# Restrict analysed_sst to the chosen year, then take monthly means.
yearly_sst = sst_hum['analysed_sst'].sel(time = slice(*dater))
sst = yearly_sst.resample(time = 'M').mean(dim={'time'}, skipna=True, keep_attrs=True)

# Kelvin -> Celsius, and record the new unit on the variable.
sst = sst - 273.15
sst.attrs['units'] = 'deg C'
sst
#sst.to_netcdf('../data/data_download/Humbolt/hum_MURR_20'+year+'.nc')

#### California Data

In [None]:
# Date window to extract ('YYYY-MM-DD' strings) and the lat/lon extent of the
# California mask.
dater = ['2022-05-01', '2022-05-31']
latr = [np.nanmin(cali.lat), np.nanmax(cali.lat)]
lonr = [np.nanmin(cali.lon), np.nanmax(cali.lon)]

# Cut analysed_sst down to that window and extent.
window = dict(time=slice(*dater), lat=slice(*latr), lon=slice(*lonr))
sst = ds_sst.analysed_sst.sel(**window)

# Kelvin -> Celsius, and record the new unit on the variable.
sst = sst - 273.15
sst.attrs['units'] = 'deg C'

sst.to_netcdf('../data/data_download/California/20220501-20220531-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc')

In [None]:
# Open every 2022 file saved for the California region as one dataset.
# (The old `.format(dir)` did nothing: the pattern has no `{}` placeholder.)
sst_20 = xr.open_mfdataset('../data/data_download/California/2022*.nc')
sst_20.close()

#remove all values that are for lakes
#sst_filtered = sst_20.where(sst_20.mask != 5, np.nan)

# Apply the California mask by multiplication
# (the name `sst_hum` is kept from the Humboldt version of this cell).
sst_hum = sst_20*cali.analysed_sst

# Monthly means over time, skipping NaNs and keeping attributes.
sst = sst_hum.resample(time = 'M').mean(dim={'time'}, skipna=True, keep_attrs=True)

# The path previously began with '..data/' (missing slash), which points at a
# directory literally named '..data' rather than the parent's data folder.
sst.to_netcdf('../data/data_download/California/cali_MURR_2022.nc')

#### Iberian-Canary

In [None]:
# Download the Iberian-Canary box from ds_sst one calendar month at a time,
# convert to Celsius and save each month as its own NetCDF file.

# Lat/lon extent of the mask; it is the same for every month, so compute it once.
latr = [np.nanmin(iber.lat), np.nanmax(iber.lat)]
lonr = [np.nanmin(iber.lon), np.nanmax(iber.lon)]

for i in range(2002,2023):

    year = str(i)

    # 2002 starts in June and 2022 stops after May; every other year is complete.
    first_month = 6 if i == 2002 else 1
    last_month = 5 if i == 2022 else 12

    # One [start, end] pair of 'YYYY-MM-DD' strings per month. pandas supplies
    # the month length, so leap years need no hand-written table.
    months = pd.period_range(f'{year}-{first_month:02d}', f'{year}-{last_month:02d}', freq='M')
    dates = [[m.start_time.strftime('%Y-%m-%d'), m.end_time.strftime('%Y-%m-%d')] for m in months]

    # Iterate over the months that exist for this year. The previous
    # `for j in range(12)` raised IndexError for the partial years 2002 and 2022.
    for dater in dates:
        # Compact 'YYYYMMDD' stamps for the file name.
        start_date = dater[0].replace('-', '')
        end_date = dater[1].replace('-', '')

        # Cut analysed_sst down to this month and the mask's extent.
        sst = ds_sst.analysed_sst.sel(time = slice(dater[0],dater[1]), lat = slice(latr[0],latr[1]), lon = slice(lonr[0],lonr[1]))

        sst = sst-273.15 # transform units from Kelvin to Celsius
        sst.attrs['units']='deg C' # record the new unit on the variable
        sst.to_netcdf('../data/data_download/Iberian-Canary/'+start_date+'-'+end_date+'-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc')

In [None]:
# Open every 2022 file saved for the Iberian-Canary region as one dataset.
iber_daily_glob = '../data/data_download/Iberian-Canary/2022*.nc'
sst_20 = xr.open_mfdataset(iber_daily_glob)
sst_20.close()

# Apply the Iberian-Canary mask by multiplication
# (the name `sst_hum` is kept from the Humboldt version of this cell).
sst_hum = sst_20 * iber.analysed_sst

# Group by calendar month, then average over time (NaNs skipped, attributes kept).
monthly_bins = sst_hum.resample(time = 'M')
sst = monthly_bins.mean(dim={'time'}, skipna=True, keep_attrs=True)

sst.to_netcdf('../data/data_download/Iberian-Canary/iber_MURR_2022.nc')



#### Benguela

In [None]:
# Download the Benguela box from ds_sst one calendar month at a time,
# convert to Celsius and save each month as its own NetCDF file.

# Lat/lon extent of the mask; it is the same for every month, so compute it once.
latr = [np.nanmin(beng.lat), np.nanmax(beng.lat)]
lonr = [np.nanmin(beng.lon), np.nanmax(beng.lon)]

for i in [2022]:

    year = str(i)

    # 2002 starts in June and 2022 stops after May; every other year is complete.
    first_month = 6 if i == 2002 else 1
    last_month = 5 if i == 2022 else 12

    # One [start, end] pair of 'YYYY-MM-DD' strings per month. pandas supplies
    # the month length, so leap years need no hand-written table.
    months = pd.period_range(f'{year}-{first_month:02d}', f'{year}-{last_month:02d}', freq='M')
    dates = [[m.start_time.strftime('%Y-%m-%d'), m.end_time.strftime('%Y-%m-%d')] for m in months]

    # Iterate over the months that exist for this year. The previous
    # `for j in range(12)` raised IndexError because 2022 has only five entries.
    for dater in dates:
        # Compact 'YYYYMMDD' stamps for the file name.
        start_date = dater[0].replace('-', '')
        end_date = dater[1].replace('-', '')

        # Cut analysed_sst down to this month and the mask's extent.
        sst = ds_sst.analysed_sst.sel(time = slice(dater[0],dater[1]), lat = slice(latr[0],latr[1]), lon = slice(lonr[0],lonr[1]))

        sst = sst-273.15 # transform units from Kelvin to Celsius
        sst.attrs['units']='deg C' # record the new unit on the variable
        sst.to_netcdf('../data/data_download/Benguela/'+start_date+'-'+end_date+'-JPL-L4_GHRSST-SSTfnd-MUR-GLOB-v02.0-fv04.1.nc')

In [None]:
# Open every 2022 file saved for the Benguela region as one dataset.
# (The old `.format(dir)` did nothing: the pattern has no `{}` placeholder.)
sst_20 = xr.open_mfdataset('../data/data_download/Benguela/2022*.nc')
sst_20.close()

#remove all values that are for lakes
#sst_filtered = sst_20.where(sst_20.mask != 5, np.nan)

# Apply the Benguela mask by multiplication
# (the name `sst_hum` is kept from the Humboldt version of this cell).
sst_hum = sst_20*beng.analysed_sst

# Monthly means over time, skipping NaNs and keeping attributes.
sst = sst_hum.resample(time = 'M').mean(dim={'time'}, skipna=True, keep_attrs=True)

# Write next to the inputs. The previous path used the misspelt directory
# 'Bengula', which does not match the 'Benguela' directory read from above.
sst.to_netcdf('../data/data_download/Benguela/beng_MURR_2022.nc')



## ERA5 Download

In [None]:
%%time

# Open the Benguela mask. The path now uses the same '../data/masks/'
# directory as the mask-loading cell near the top of the notebook.
mask = xr.open_dataset('../data/masks/mask_Benguela_MURSST.nc')
mask.close()

# Region handled by this run. This assignment used to be commented out, which
# left `ebus` undefined and made the check below raise NameError.
ebus = 'benguela'

# For every region except Benguela the mask longitudes are shifted by +360
# (presumably to match a 0-360 longitude convention in the ERA5 store -- TODO confirm).
if ebus != 'benguela':
    mask['lon'] = mask.lon + 360


def _daily_masked_mean(year, month, variable):
    """Return masked daily means of one ERA5 variable for one month.

    Opens the public ERA5 Zarr store for `year`/`month`, cuts it to the
    lat/lon extent of the module-level `mask`, interpolates the mask onto the
    ERA5 grid, multiplies the data by it and averages by calendar date.
    The returned dataset uses 'time0' as its (daily) time coordinate.
    """
    file_location = 'https://era5-pds.s3.us-east-1.amazonaws.com/zarr/'+str(year)+'/'+str(month).zfill(2)+'/data/'+variable+'.zarr'
    ds = xr.open_zarr(file_location,consolidated=True)
    ds = ds.sortby('time0')
    # Latitude is sliced from max to min, longitude from min to max.
    ds = ds.sel(lat = slice(mask.analysed_sst.lat.max(), mask.analysed_sst.lat.min()), lon = slice(mask.analysed_sst.lon.min(), mask.analysed_sst.lon.max()))

    # Interpolate the mask onto the grid of the ERA5 field.
    mask_era = mask.analysed_sst.interp_like(ds[variable].isel(time0 = 0))

    # Mask the data, then average all time steps that share a calendar date.
    ds_subset = mask_era * ds
    daily = ds_subset.groupby('time0.date').mean().load()
    # groupby leaves plain date objects; turn them back into timestamps and
    # restore the original coordinate name.
    daily['date'] = pd.DatetimeIndex(daily['date'].values)
    return daily.rename({'date':'time0'})


# Accumulators for the merged output.
vds_merged = xr.Dataset()
sst_ds_merged = xr.Dataset()
iyr = 2002
fyr = 2022

# Loop through each year and month to access the ERA5 data.
for y in range(iyr, fyr+1):

    # 2002 is read from June onward and 2022 only through May
    # (the same window as the MUR downloads); other years are complete.
    first_month = 6 if y == 2002 else 1
    last_month = 5 if y == 2022 else 12

    for mon in range(first_month, last_month+1):

        # meridional wind v10m
        vds = _daily_masked_mean(y, mon, 'northward_wind_at_10_metres')
        vds_merged = xr.merge([vds_merged, vds])

        # sea surface temperature
        sst_ds = _daily_masked_mean(y, mon, 'sea_surface_temperature')
        sst_ds_merged = xr.merge([sst_ds_merged, sst_ds])

# Meridional wind stress from the 10 m wind, using an air density of 1.225 and
# a wind-dependent drag coefficient (scaled by 1e-3).
# NOTE(review): the arithmetic below is unchanged from the original. It
# resembles the Large & Pond (1981) drag law, but there Cd*1e3 is 1.2 below
# 11 m/s and 0.49+0.065*U above, i.e. the two branches look swapped here.
# The test also uses the signed wind rather than its magnitude, and the stress
# has no |v| factor. Confirm the intended formula before relying on
# 'northward_wind_stress'.
ra = 1.225
cd_v = xr.where(vds_merged['northward_wind_at_10_metres'] < 11, 0.49+0.065*abs(vds_merged['northward_wind_at_10_metres']), 1.2)
cd_v = cd_v/1000
vds_merged['northward_wind_stress']= cd_v * ra * vds_merged['northward_wind_at_10_metres']

# Merge wind and SST into one dataset.
era5 = xr.merge([vds_merged, sst_ds_merged])

# Take the monthly average.
era5 = era5.resample(time0 = 'M').mean()

# Save the file.
era5.to_netcdf('../data/data_download/ERA5_Chile/Benguela_ERA5.nc')
