# xarray analysis of 2017 data
---

I wrote the original analysis notebooks using the pandas library, but I also want to repeat the analysis in xarray to get more comfortable with that library.

In [23]:
# Imports
import requests
import re
import os
import numpy as np
import xarray as xr

---
### Pull in data
Unfortunately, the `get_data` function from the OOI data lab converts the datasets into pandas DataFrames before returning them, so I have to alter that function to keep the data as an xarray `Dataset`. I can use the same links to grab the data, but it takes a few minutes to download it all.

In [None]:
def get_data(url, variables, deployments=None):
    """Load every NetCDF file listed on a THREDDS catalog page into one xarray Dataset.

    Parameters
    ----------
    url : str
        Address of the THREDDS ``catalog.html`` page for one data request.
    variables : list of str
        Names of the variables whose units should be reported.
    deployments : iterable of int, optional
        Deployment numbers to keep. ``None`` (the default) keeps all of them.

    Returns
    -------
    ds : xarray.Dataset
        Data from all selected files, re-indexed from ``obs`` to ``time``,
        concatenated along ``time`` and sorted in ascending time order.
    unit_dict : dict
        Maps each name in ``variables`` to the list of distinct ``units``
        attribute values found for it across the opened files.

    Raises
    ------
    requests.HTTPError
        If the catalog page request returns an error status.
    ValueError
        If no file from the catalog page could be selected and opened.
    """
    tds_url = 'https://opendap.oceanobservatories.org/thredds/dodsC'

    # Fail loudly on a bad catalog request instead of parsing an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', response.text)

    # Keep only links to '.nc' files whose name ends in a digit right before
    # the extension. A comprehension is used because calling list.remove()
    # while looping over the same list skips the entry after each removal.
    nc_links = [h for h in hrefs if h.endswith('.nc') and h[-4:-3].isdecimal()]

    # The dataset path is whatever follows the last '=' in the catalog link.
    datasets = [f"{tds_url}/{link.split('=')[-1].lstrip('/')}" for link in nc_links]

    # remove deployments not in deployment list, if given
    if deployments is not None:
        deploy = ['deployment{:04d}'.format(j) for j in deployments]
        datasets = [k for k in datasets if k.split('/')[-1].split('_')[0] in deploy]

    # remove collocated data files if necessary:
    # [20:] drops the timestamp prefix of the request directory name
    # (e.g. '20210422T030752259Z-'); [15:] drops the 'deploymentNNNN_' prefix of
    # the file name. Only files whose remaining name matches the request are kept.
    catalog_rms = url.split('/')[-2][20:]
    selected_datasets = [
        d for d in datasets
        if catalog_rms == d.split('/')[-1].split('_20')[0][15:]
    ]

    unit_dict = {v: [] for v in variables}
    print('Appending data from files')

    opened = []
    for sd in selected_datasets:
        try:
            # The '#fillmismatch' suffix was required for these files to open.
            ds = xr.open_dataset(f'{sd}#fillmismatch', mask_and_scale=False)
        except Exception as err:
            # Best effort: report the file that failed and carry on with the rest.
            print(f'Skipping {sd}: {err}')
            continue

        # Record units per variable; a missing variable or missing 'units'
        # attribute must not stop the remaining variables from being recorded.
        for var in variables:
            units = ds[var].attrs.get('units') if var in ds else None
            if units is not None and units not in unit_dict[var]:
                unit_dict[var].append(units)

        opened.append(ds.swap_dims({'obs': 'time'}))

    if not opened:
        raise ValueError(f'No datasets could be opened from {url}')

    # Combine every file rather than returning only the last one opened.
    ds = opened[0] if len(opened) == 1 else xr.concat(opened, dim='time')
    ds = ds.sortby('time')  # make sure the timestamps are in ascending order

    return ds, unit_dict

In [None]:
# Variables of interest for each instrument; these names are looked up in the
# downloaded datasets to report their units.
METBK_2017_var = [
    'sea_surface_temperature',
    'met_windavg_mag_corr_east',
    'met_windavg_mag_corr_north',
]
profiler_2017_var = [
    'seawater_pressure',
    'density',
    'practical_salinity',
    'seawater_temperature',
    'corrected_dissolved_oxygen',
]
platform_2017_var = [
    'seawater_pressure',
    'density',
    'practical_salinity',
    'seawater_temperature',
    'dissolved_oxygen',
]

In [None]:
# THREDDS catalog pages listing the NetCDF files of each 2017 data request.
# get_data() drops the first 20 characters of the last directory name (the
# timestamp prefix, e.g. '20210422T030752259Z-') and matches file names
# against the remainder, so that directory name must keep this layout.

# Surface mooring METBK (CE04OSSM-SBD11-06-METBKA000), recovered_host stream
METBK_2017_url = 'https://opendap.oceanobservatories.org/thredds/catalog/ooi/deryag@uw.edu/20210422T030752259Z-CE04OSSM-SBD11-06-METBKA000-recovered_host-metbk_a_dcl_instrument_recovered/catalog.html'
# Profiler CTD (CE04OSPS-SF01B-2A-CTDPFA107), streamed ctdpf_sbe43_sample
profiler_2017_url = 'https://opendap.oceanobservatories.org/thredds/catalog/ooi/deryag@uw.edu/20210422T030848056Z-CE04OSPS-SF01B-2A-CTDPFA107-streamed-ctdpf_sbe43_sample/catalog.html'
# Platform CTD (CE04OSPS-PC01B-4A-CTDPFA109), streamed ctdpf_optode_sample
platform_2017_url = 'https://opendap.oceanobservatories.org/thredds/catalog/ooi/deryag@uw.edu/20210428T021551666Z-CE04OSPS-PC01B-4A-CTDPFA109-streamed-ctdpf_optode_sample/catalog.html'

In [None]:
# Get the data! Each call returns the dataset plus a dict of variable units.
METBK_2017_data, METBK_2017_units = get_data(
    url=METBK_2017_url, variables=METBK_2017_var
)
profiler_2017_data, profiler_2017_units = get_data(
    url=profiler_2017_url, variables=profiler_2017_var
)
platform_2017_data, platform_2017_units = get_data(
    url=platform_2017_url, variables=platform_2017_var
)

# Check the variable units (one dict per line)
print(METBK_2017_units, profiler_2017_units, platform_2017_units, sep='\n')

Appending data from files
Appending data from files


In [None]:
# Display the dataset that get_data returned for the METBK request
METBK_2017_data

In [None]:
# Save each dataset as a NetCDF file so it does not have to be downloaded again.
output_dir = '../../coastal_upwelling_output'
# The NetCDF writer does not create missing folders, so make sure it exists first.
os.makedirs(output_dir, exist_ok=True)

METBK_2017_data.to_netcdf(f'{output_dir}/metbk_data_2017.nc')
profiler_2017_data.to_netcdf(f'{output_dir}/profiler_data_2017.nc')
platform_2017_data.to_netcdf(f'{output_dir}/platform_data_2017.nc')