In [23]:
import os
import requests
import datetime
from zipfile import ZipFile
import ftplib 
import cdsapi
import yaml
import xarray as xr
import glob

In [2]:
#This file contains configuration details like API keys and passwords
#The file is opened in a context manager so the handle is closed as soon as the YAML has been parsed
with open('../config.yml', 'r') as config_file:
    global_vars = yaml.safe_load(config_file)

In [14]:
#This loads the custom helper functions used throughout this notebook (such as download, download_to_folder, cdsapi_custom_download and output_xarray_with_date)
%run ./00_custom_functions.ipynb

In [4]:
#Data will be saved at this location in folders such as root/SST/originals/ and root/SSS/originals/
#Note that these folder locations must already be created as a manual validation step
download_folder_root = global_vars['download_folder']
#The cells below build every destination by plain string concatenation (e.g. download_folder_root+'SST/originals/'),
#so the root has to end with a path separator. A value that already ends with one is left untouched.
if not download_folder_root.endswith(('/', os.sep)):
    download_folder_root += '/'
print(download_folder_root)

/data/artemis/workspace/ds4114/online_data/


In [5]:
#The following two variables are used to acquire select data when it is uploaded by month or year.
    #Data that is not uploaded by year includes: SST (NOAA); MLD (deBoyer & Argo), fCO2 (SOCAT), xCO2, Coastal, SeaFlux
#These set the start and end years (inclusive) and do not need to be changed.
#Some years/months of data may not be available (because they are prior to when data was gathered or too recent for the source).
    #In those cases any available data is obtained in this range. Specifically, 
        #SST (NOAA) data only 1981-present
        #SST (ERA5) data only 1979-present
        #SST (JRA55) data only 1958-2023
        #SSS data only 1900-present
        #MLD (deBoyer and Argo) data only an averaged 12 months
        #CHL data only 1997-present
        #fCO2 data only 1970-2022
        #SLP data only 1979-2022
        #xCO2 data only 1979-present
        #Coastal data only an averaged 12 months 
        #SeaFlux data only 1982-2022
    #These limitations are hardcoded so other sources or links would be needed to download outside of this range 
#First and last year (both inclusive) of the acquisition window
acquisition_start_year, acquisition_end_year = 1979, 2023

#This variable sets the output filetype for SSS and CHL data and needs to be specified explicitly because of the unique way they are downloaded.
#By default, all data source downloads will default to the netcdf format unless using cloud storage.
#When using cloud storage, it is recommended to use ARCO (Analysis-Ready Cloud-Optimized) formats like Zarr over NetCDF
output_file_type = '.zarr' if download_folder_root.startswith('gs://') else '.nc'

#Year-month stamp (YYYYMM) used for naming files (SST & xCO2)
today_yearmonth = f'{datetime.datetime.now():%Y%m}'

In [7]:
#Some data (SLP, SST) requires an account on European Centre for Medium-Range Weather Forecasts and an API key
#More info can be found https://cds.climate.copernicus.eu/api-how-to
#Once the package is installed (conda install -c conda-forge cdsapi), we need to install your API key using the code below

cds_url= "url: https://cds.climate.copernicus.eu/api/v2"
cds_key= 'key: '+global_vars['cds_api_key']  #from the configuration file
file = os.path.join(os.path.expanduser('~'), '.cdsapirc')  #credentials file in the user's home directory
if not os.path.isfile(file):
    #An existing file is never overwritten; the context manager guarantees the handle is closed even if a write fails
    with open(file, "w") as cds_file:
        cds_file.write(cds_url+'\n')
        cds_file.write(cds_key)
else:
    print("API key already installed")

API key already installed


## Temperature (SST)
#### NOAA

In [None]:
#NOAA publishes its whole SST record (1981-09 to present only) under one fixed URL, so a single download is enough
#More info found on https://psl.noaa.gov/data/gridded/data.noaa.oisst.v2.highres.html
sst_destination_folder = f'{download_folder_root}SST/originals/'
sst_destination_filename = f'SST_NOAA_OI-V2-HighRes_198109-{today_yearmonth}.nc'  #the record only starts in 1981-09
sst_direct_url = 'https://downloads.psl.noaa.gov/Datasets/noaa.oisst.v2.highres/sst.mon.mean.nc'
download(sst_direct_url, sst_destination_folder, sst_destination_filename)

#### ECMWF

In [19]:
#This SST dataset comes from European Centre for Medium-Range Weather Forecasts (1979-01 to present only). It is the same as the one used for SLP below.
#More info can be found on https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=form
#We installed the API and have an account, so now we can download the ERA5 data via a loop (yearly)
#While this data can be downloaded together with SLP, we have chosen to break up the downloads for clarity and organization
sst_destination_folder = download_folder_root+r'SST/originals/'
current_date = datetime.datetime.now()  #read the clock once so the year and month used below always belong to the same instant
for year in range(acquisition_start_year, acquisition_end_year+1, 1):
    if year == current_date.year:   #if a partial year; this prevents errors trying to use the API to get future/non-existent data
        #NOTE(review): this stops two months before the current month, whereas the SLP cell stops one month before - confirm which cutoff is intended
        months = list(range(1, current_date.month - 1))
        sst_destination_filename = 'SST_ECMWF_ERA5-monthly-reanalysis-SST_'+str(year)+'-partial.nc'
        #Note, partial years will not automatically be overwritten so you must manually clean up unneeded files
        if not months:
            #Early in the year there is nothing to request yet, so do not send the API a request with an empty month list
            print(f'No ERA5 SST months to request yet for {year} - skipping')
            continue
    else:
        sst_destination_filename = 'SST_ECMWF_ERA5-monthly-reanalysis-SST_'+str(year)+'.nc'
        months = list(range(1, 12+1))

    cdsapi_custom_download(year, months, 'sea_surface_temperature', sst_destination_folder, sst_destination_filename, overwrite=False, create_dest=False)

print("ERA5 SST Complete")

File SST_ECMWF_ERA5-monthly-reanalysis-SST_1999.nc already exists - (skipping download from 1999 )
File SST_ECMWF_ERA5-monthly-reanalysis-SST_2000.nc already exists - (skipping download from 2000 )
Complete


#### JRA55

In [20]:
#This SST data source comes from Japan Meteorological Agency (JMA) (195801 - to 2023 only)
#JRA55-do (Tsujino et al., 2018)  corrects the atmospheric reanalysis product JRA-55 (Kobayashi et al., 2015) 
#More info can be found on https://climate.mri-jma.go.jp/pub/ocean/JRA55-do/

sst_destination_folder = download_folder_root+r'SST/originals/'

#Recent data (2020+) is hosted on a different site and needs one explicit direct link per year
jra55_recent_links = {
    2020: 'https://climate.mri-jma.go.jp/pub/ocean/JRA55-do/ocean/day/tos/gn/v20210315/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0-1_gn_20200101-20201231.nc',
    2021: 'https://climate.mri-jma.go.jp/pub/ocean/JRA55-do/ocean/day/tos/gn/latest/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0-1_gn_20210101-20211231.nc',
    2022: 'https://climate.mri-jma.go.jp/pub/ocean/JRA55-do/ocean/day/tos/gn/latest/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0-1_gn_20220101-20221231.nc',
    2023: 'https://climate.mri-jma.go.jp/pub/ocean/JRA55-do/ocean/day/tos/gn/latest/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0-1_gn_20230101-20230629.nc',
}

#Only 1958-2023 is covered by the links in this cell, so the requested range is clamped to it.
#Without the clamp, any year after 2023 would be turned into an older-archive URL that does not exist.
jra55_first_year = max(acquisition_start_year, 1958)
jra55_last_year = min(acquisition_end_year, 2023)
for year in range(jra55_first_year, jra55_last_year+1, 1):
    sst_jra55_destination_filename = 'SST_JMA_JRA55-do-daily-reanalysis-SST_'+str(year)+'.nc'

    if year in jra55_recent_links:
        direct_link = jra55_recent_links[year]
        if year == 2023:
            sst_jra55_destination_filename = 'SST_JMA_JRA55-do-daily-reanalysis-SST_'+str(year)+'-partial.nc' #because only part of the year available
    else:
        #Older years share one URL pattern that differs only in the year
        base = 'https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/input4MIPs/CMIP6/OMIP/MRI/MRI-JRA55-do-1-5-0/ocean/day/tos/gn/v20200916/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0_gn_'
        end = str(year)+'0101-'+str(year)+'1231.nc'
        direct_link = base + end  #ex: #https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/input4MIPs/CMIP6/OMIP/MRI/MRI-JRA55-do-1-5-0/ocean/day/tos/gn/v20200916/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0_gn_19580101-19581231.nc

    download(direct_link, sst_destination_folder, sst_jra55_destination_filename)
print("JRA55 SST Complete")

File SST_JMA_JRA55-do-daily-reanalysis-SST_1999.nc already exists - (skipping download from https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/input4MIPs/CMIP6/OMIP/MRI/MRI-JRA55-do-1-5-0/ocean/day/tos/gn/v20200916/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0_gn_19990101-19991231.nc )
File SST_JMA_JRA55-do-daily-reanalysis-SST_2000.nc already exists - (skipping download from https://esgf-data2.llnl.gov/thredds/fileServer/user_pub_work/input4MIPs/CMIP6/OMIP/MRI/MRI-JRA55-do-1-5-0/ocean/day/tos/gn/v20200916/tos_input4MIPs_atmosphericState_OMIP_MRI-JRA55-do-1-5-0_gn_20000101-20001231.nc )
Complete


## Salinity (SSS)

In [27]:
#SSS requires downloading yearly zip files and extracting each month from each  (1900-01 to present only)
#More info found on https://www.metoffice.gov.uk/hadobs/en4/download-en4-2-2.html

#2021 and onward the URL changes, though both URL zips exists. Not clear why.
sss_url_base_pre_2021 = 'https://www.metoffice.gov.uk/hadobs/en4/data/en4-2-1/EN.4.2.2/'
sss_url_base_2021_onward = 'https://www.metoffice.gov.uk/hadobs/en4/data/en4-2-1/'
sss_destination_filename_base = 'SSS_Met-Office-Hadley-Centre_EN422f-g10-analyses_' #_197901+
sss_destination_folder = download_folder_root+r'SSS/originals/'
tmp_folder = '../tmp/'  #download to tmp, extract a year, then transfer all to destination
for year in range(acquisition_start_year, acquisition_end_year+1, 1):
    #Pick the base per year instead of overwriting it once, so the choice does not depend on iteration order
    sss_direct_url_base = sss_url_base_2021_onward if year >= 2021 else sss_url_base_pre_2021
    url_file = 'EN.4.2.2.analyses.g10.'+str(year)+'.zip'
    sss_direct_url = sss_direct_url_base + url_file
    sss_destination_filename = sss_destination_filename_base+str(year)+'.zip'
    sss_zip_path = os.path.join(tmp_folder, sss_destination_filename)

    download_to_folder(sss_direct_url, tmp_folder, sss_destination_filename,overwrite=False, create_dest=True)
    with ZipFile(sss_zip_path) as sss_zip:  #close the archive before it is deleted below
        sss_zip.extractall(tmp_folder)
    files = glob.glob(tmp_folder+'EN.4.2.2.f.analysis.g10.*.nc')
    for f in files:  #for each file extracted (glob already returns the path including tmp_folder)
        basename = os.path.basename(f)
        with xr.open_dataset(f) as tmp_xr:  #release the file handle before the tmp file is removed
            output_xarray_with_date(tmp_xr, sss_destination_folder, basename, filetype=output_file_type, with_date=False)
            #Note the filename is kept as the original from the .zip here to be compatible with other work. To output the file with a different name, edit the dest_file string parameter.
        os.remove(f) #remove tmp file

    #remove zip file too
    os.remove(sss_zip_path)
print("SSS Complete")

File SSS_Met-Office-Hadley-Centre_EN422f-g10-analyses_2000.zip already exists - (skipping download from https://www.metoffice.gov.uk/hadobs/en4/data/en4-2-1/EN.4.2.2/EN.4.2.2.analyses.g10.2000.zip )
Cancelling output - EN.4.2.2.f.analysis.g10.200001.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - EN.4.2.2.f.analysis.g10.200002.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - EN.4.2.2.f.analysis.g10.200003.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - EN.4.2.2.f.analysis.g10.200004.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - EN.4.2.2.f.analysis.g10.200005.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - EN.4.2.2.f.analysis.g10.200006.nc already exists in /data/artemis/workspace/ds4114/online_data/SSS/originals/
Cancelling output - E

## Mixed Layer Depth (MLD)
#### deBoyer

In [21]:
#This MLD product is a single year of data that the author processed from several years of temperature profiles (data from 1941-2008, partially missing)
#More info found on https://cerweb.ifremer.fr/deboyer/mld/Surface_Mixed_Layer_Depth.php

mld_destination_folder = f'{download_folder_root}MLD/originals/'
mld_destination_filename = 'MLD_IFREMER-deBoyer_DT02-c1m_2008.nc'   #labelled 2008 even though the data is blended across years
mld_direct_url = 'https://cerweb.ifremer.fr/deboyer/data/mld_DT02_c1m_reg2.0.nc'
download(mld_direct_url, mld_destination_folder, mld_destination_filename)

File MLD_IFREMER-deBoyer_DT02-c1m_2008.nc already exists - (skipping download from https://cerweb.ifremer.fr/deboyer/data/mld_DT02_c1m_reg2.0.nc )


#### Argo

In [29]:
# This MLD climatology was built from Argo data by UC San Diego
# More info can be found on http://mixedlayer.ucsd.edu/ and https://www.seanoe.org/data/00311/42182/#56126
mld_destination_folder = f'{download_folder_root}MLD/originals/'
mld_destination_filename = 'MLD_UCSD-Argo_mixedlayers-monthlyclim_2022.nc'   #labelled 2022 even though the data is blended across years
mld_direct_url = 'http://mixedlayer.ucsd.edu/data/Argo_mixedlayers_monthlyclim_04142022.nc'
download(mld_direct_url, mld_destination_folder, mld_destination_filename)

Saving http://mixedlayer.ucsd.edu/data/Argo_mixedlayers_monthlyclim_04142022.nc to /data/artemis/workspace/ds4114/online_data/MLD/originals/MLD_UCSD-Argo_mixedlayers-monthlyclim_2022.nc...
Complete


## Chlorophyll (CHL)

In [20]:
#CHL data is fetched over FTP as monthly files (1997-09 to present only)
#According to https://www.globcolour.info/products_description_mermet.html, we want to use GSM data only
#The product changes over the years and file names can vary, but they share a common naming pattern that is used below.
#More info found on https://hermes.acri.fr/index.php?class=archive

chl_direct_ftp = 'ftp.hermes.acri.fr'
chl_destination_folder = f'{download_folder_root}CHL/originals/'
#The earliest data is 1997-09, so never start before 1997 to prevent errors getting nonexistent data
chl_acquisition_start_year = max(acquisition_start_year, 1997)

#FTP requires an account and login (credentials come from the configuration file):
usr = global_vars['chl_user']
psw = global_vars['chl_psw']

ftp_server = ftplib.FTP(chl_direct_ftp)
ftp_server.set_pasv(True)
ftp_server.login(usr, psw)

'230 User ftp_gc_VBennington logged in'

In [27]:
#Connectivity check: list the directory the FTP session is currently in (the download loop further down reads from the GLOB folder).
#Note the nlst command used below may throw a [WinError 10060] message after logging in. This may be due to how the FTP server is configured or how the local computer is configured.
#If the following command does not work, there is a broader issue with the connection.
ftp_server.dir()

drwxrwxr-x   3 10048    45005           0 Aug 20  2021 animation
drwxrwsr-x  11 10048    45005           0 Apr 13 13:57 ATLNW
drwxrwsr-x  12 10048    45005           0 Sep 30  2022 EURO
drwxrwsr-x  13 10048    45005           3 Jan 30  2023 GLOB
drwxrwsr-x   5 10048    45005           0 Mar 27  2021 GLOBCOAST
drwxrwxr-x   5 10048    45005           0 Dec  2  2014 OSS2015


In [21]:
#Download each monthly CHL file to a tmp folder first, then check and transfer it to the destination
tmp_folder = '../tmp/'
os.makedirs(tmp_folder, exist_ok=True)  #created once up front instead of being re-checked for every month
current_date = datetime.datetime.now()  #read the clock once so year and month belong to the same instant
for year in range(chl_acquisition_start_year, acquisition_end_year+1, 1):
    months = list(range(1, 12+1))
    #if a partial year, reduce available months
    if year == current_date.year: months = list(range(1, current_date.month))
    elif year == 1997: months = list(range(9, 12+1)) #data only available part of 1997
    for month in months:
        month_str = str(month).zfill(2)
        chl_destination_filename = 'CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_'+str(year)+month_str+'.nc'
        ftp_server.cwd(f'/GLOB/merged/month/{year}/{month_str}/01') #ex. '/GLOB/merged/month/2022/02/01' for Feb 2022
        matches = ftp_server.nlst('L3m*GLOB*100*GSM*CHL1*.nc')    #Global view, 100 km resolution, GSM product, CHL1 data
        if not matches:
            #An empty listing used to raise IndexError and abort the whole loop; report the gap and carry on
            print(f'No GSM CHL1 file found for {year}-{month_str} - skipping')
            continue
        f = matches[0]

        tmp_path = os.path.join(tmp_folder, chl_destination_filename)
        print(f'Downloading {f} as {chl_destination_filename}')
        with open(tmp_path, 'wb') as tmp_file:  #make sure the download is flushed and closed before it is read back
            ftp_server.retrbinary("RETR " + f, tmp_file.write)
        with xr.open_dataset(tmp_path) as tmp_xr:  #release the file handle before the tmp file is removed
            output_xarray_with_date(tmp_xr, chl_destination_folder, chl_destination_filename, filetype=output_file_type, with_date=False)
        os.remove(tmp_path) #remove tmp file

print("CHL Complete")

Downloading L3m_20000101-20000131__GLOB_100_GSM-SWF_CHL1_MO_00.nc as CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200001.nc
Cancelling output - CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200001.nc already exists in /data/artemis/workspace/ds4114/online_data/CHL/originals/
Downloading L3m_20000201-20000229__GLOB_100_GSM-SWF_CHL1_MO_00.nc as CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200002.nc
Cancelling output - CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200002.nc already exists in /data/artemis/workspace/ds4114/online_data/CHL/originals/
Downloading L3m_20000301-20000331__GLOB_100_GSM-SWF_CHL1_MO_00.nc as CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200003.nc
Cancelling output - CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200003.nc already exists in /data/artemis/workspace/ds4114/online_data/CHL/originals/
Downloading L3m_20000401-20000430__GLOB_100_GSM-SWF_CHL1_MO_00.nc as CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_200004.nc
Cancellin

In [22]:
#The CHL downloads are finished, so close the FTP session that was opened for them
ftp_server.quit()

'221 Goodbye.'

## pCO2 
### SOCAT fCO2

In [None]:
#pCO2 data is calculated from fCO2 and SST (SOCOVV) and sea level pressure (ECMWF) so we need to obtain a dataset for both

#fCO2 and SST comes from the Surface Ocean CO2 Variability and Vulnerability group (1970-01 to end of last year only)
#The SST points tied to fCO2 are taken from this same source (the separate NOAA SST is not used here).
#More info can be found at https://www.socat.info/index.php/data-access/
fco2_destination_folder = f'{download_folder_root}pCO2/originals/'
fco2_destination_filename = 'fCO2_SOCOVV_SOCAT-gridded-monthly_2022.nc'    #the year in the name denotes the data date, not the publish date used by the source
fco2_direct_url = 'https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0278913/SOCATv2023_Gridded_Dat/SOCATv2023_tracks_gridded_monthly.nc'  #thru 2022-12
      #previously 'https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0253659/SOCATv2022_Gridded_Dat/SOCATv2022_tracks_gridded_monthly.nc'  #thru 2021-12
download(fco2_direct_url, fco2_destination_folder, fco2_destination_filename)

### Coastal Filling (pCO2)

In [6]:
#Climatology used for enhancing coastal areas for flux calculation
#More info can be found https://www.ncei.noaa.gov/access/metadata/landing-page/bin/iso?id=gov.noaa.nodc:0209633 and https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0209633/
coastal_clim_destination_folder = f'{download_folder_root}pCO2/originals/'
coastal_clim_filename = 'pCO2_NOAA-NCEI_MPI-ULB-SOM-FFN_1988-2020.nc'
coastal_clim_direct = 'https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0209633/MPI-ULB-SOM_FFN_clim.nc'
download(coastal_clim_direct, coastal_clim_destination_folder, coastal_clim_filename)

Saving https://www.ncei.noaa.gov/data/oceans/ncei/ocads/data/0209633/MPI-ULB-SOM_FFN_clim.nc to /data/artemis/workspace/ds4114/online_data/pCO2/originals/pCO2_NOAA-NCEI_MPI-ULB-SOM-FFN_1988-2020.nc...
Complete


## Sea Level Pressure (SLP)

In [24]:
#Sea level pressure comes from European Centre for Medium-Range Weather Forecasts (1979-01 to present only). It is the same source as the one used for the ECMWF SST above.
#More info can be found on https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-single-levels-monthly-means?tab=form
#We installed the API and have an account, so now we can download the ERA5 data via a loop (yearly)
slp_destination_folder = download_folder_root+r'SLP/originals/'
current_date = datetime.datetime.now()  #read the clock once so the year and month used below always belong to the same instant
for year in range(acquisition_start_year, acquisition_end_year+1, 1):
    if year == current_date.year:   #if a partial year; this prevents errors trying to use the API to get future/non-existent data
        #NOTE(review): this stops one month before the current month, whereas the ECMWF SST cell stops two months before - confirm which cutoff is intended
        months = list(range(1, current_date.month))
        slp_destination_filename = 'SLP_ECMWF_ERA5-monthly-reanalysis-MSLP_'+str(year)+'-partial.nc'
        #To consider, do we want to automatically overwrite partial years?
        if not months:
            #In January there is nothing to request yet, so do not send the API a request with an empty month list
            print(f'No ERA5 SLP months to request yet for {year} - skipping')
            continue
    else:
        slp_destination_filename = 'SLP_ECMWF_ERA5-monthly-reanalysis-MSLP_'+str(year)+'.nc'
        months = list(range(1, 12+1))

    cdsapi_custom_download(year, months, 'mean_sea_level_pressure', slp_destination_folder, slp_destination_filename, overwrite=False, create_dest=False)

print("SLP Complete")

File SLP_ECMWF_ERA5-monthly-reanalysis-MSLP_1999.nc already exists - (skipping download from 1999 )
File SLP_ECMWF_ERA5-monthly-reanalysis-MSLP_2000.nc already exists - (skipping download from 2000 )
Complete


## Atmospheric CO2 (xCO2)

In [None]:
#xCO2 comes from NOAA. More information: https://gml.noaa.gov/ccgg/trends/gl_data.html

xco2_direct_url = "https://gml.noaa.gov/webdata/ccgg/trends/co2/co2_mm_gl.csv"
#Trailing slash added so this destination folder has the same form as every other one in this notebook
xco2_destination_folder = download_folder_root+r'xCO2/originals/'
today_yearmonth = datetime.datetime.now().strftime('%Y%m')  #recomputed here so the file name reflects the month this cell is run
xco2_destination_filename = 'xCO2_NOAA_xCO2-mm-gl-monthly_197901-'+today_yearmonth+'.csv'
download(xco2_direct_url, xco2_destination_folder, xco2_destination_filename) 

## SeaFlux
#### All (Wind, Ice, Ocean Area)

In [28]:
#This data is not required for pCO2 reconstruction but is used for calculating flux
#The LDEO product is updated through the end of the previous year
#More information can be found https://zenodo.org/record/8099928

seaflux_destination_folder = f'{download_folder_root}SeaFlux/originals/'
seaflux_destination_filename = 'SeaFlux_LDEO_SeaFlux-v202301-all_1982-2022.nc'
seaflux_all_direct = 'https://zenodo.org/record/8099928/files/SeaFlux.v2023.01_all_1982-2022.nc'
download(seaflux_all_direct, seaflux_destination_folder, seaflux_destination_filename)

File SeaFlux_LDEO_SeaFlux-v202301-all_1982-2022.nc already exists - (skipping download from https://zenodo.org/record/8099928/files/SeaFlux.v2023.01_all_1982-2022.nc )
