In [2]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob
import datetime as dt
import copy

In [2]:
# Each entry pairs a variable name with the data product it is read from.
# Note that 'SST' is listed twice: once for SODA and once for OISSTv2.
variable_origin_pairs = [
    ('MLD', 'SODA'), ('OHC100', 'SODA'), ('OHC200', 'SODA'), ('OHC300', 'SODA'),
    ('OHC50', 'SODA'), ('OHC700', 'SODA'), ('SSH', 'SODA'), ('SST', 'SODA'),
    ('OLR', 'ERA5'), ('SD', 'ERA5'), ('STL_1m', 'ERA5'), ('STL_full', 'ERA5'),
    ('SWVL_1m', 'ERA5'), ('SWVL_full', 'ERA5'), ('U10', 'ERA5'), ('U200', 'ERA5'),
    ('Z500', 'ERA5'),
    ('IC', 'SODA'), ('IT', 'SODA'), ('SST', 'OISSTv2'),
]
# Parallel lists consumed (by index / in lock-step) by the processing loops.
variables = [name for name, _ in variable_origin_pairs]
origins = [source for _, source in variable_origin_pairs]

In [3]:
# Root directory of the input data; get_full_dataset appends the origin and
# variable sub-directories to it by string formatting, so keep the trailing '/'.
path_files = '/glade/work/jhayron/Data4Predictability/'

In [8]:
def get_full_dataset(path_files, var, origin):
    """Load all NetCDF files of one variable into a single in-memory dataset.

    Parameters
    ----------
    path_files : str
        Root data directory. Sub-directories are appended by plain string
        formatting, so it must end with a path separator.
    var : str
        Name of the variable sub-directory (e.g. ``'SST'``).
    origin : str
        Data product; one of ``'ERA5'``, ``'SODA'`` or ``'OISSTv2'``.

    Returns
    -------
    xarray.Dataset
        All ``*.nc`` files of the variable concatenated along ``time`` (in
        sorted file-name order), restricted to times strictly after
        1981-01-01, loaded into memory, and with the horizontal dimensions
        renamed to ``lat``/``lon`` where the known source names are present.

    Raises
    ------
    ValueError
        If ``origin`` is not one of the supported products.
    FileNotFoundError
        If the variable directory contains no ``*.nc`` file.
    """
    if origin == 'ERA5':
        path_var = f'{path_files}{origin}/Daily_05Deg/{var}/'
    elif origin == 'SODA':
        path_var = f'{path_files}{origin}_Daily/{var}/'
    elif origin == 'OISSTv2':
        path_var = f'{path_files}{origin}/{var}/'
    else:
        # Previously an unknown origin fell through and failed later with an
        # UnboundLocalError on path_var.
        raise ValueError(
            f"Unknown origin {origin!r}; expected 'ERA5', 'SODA' or 'OISSTv2'")

    list_files = np.sort(glob.glob(f'{path_var}*.nc'))
    if len(list_files) == 0:
        # FileNotFoundError is an OSError, like the error xarray itself raises
        # for an empty file list, but it names the directory that was searched.
        raise FileNotFoundError(f'No .nc files found in {path_var}')

    dataset = xr.open_mfdataset(list_files,combine='nested',concat_dim="time")
    if dataset.time.dtype==np.int64:
        # The time axis holds plain integers: rebuild it from the date that
        # ends each file name ('..._YYYY-MM-DD.nc'), in the same sorted order.
        dataset['time'] = np.array([dt.datetime.strptime(file_name.split('_')[-1],'%Y-%m-%d.nc')
                                    for file_name in list_files])
    # Strict '>' comparison: a timestamp equal to 1981-01-01T00:00 is dropped.
    dataset = dataset.where(dataset.time>np.datetime64('1981-01-01'),drop=True)
    dataset = dataset.load()

    if origin == 'SODA':
        # SODA files use one of two naming schemes for the horizontal dims.
        if 'xt_ocean' in dataset.dims:
            dataset = dataset.rename({'xt_ocean': 'lon','yt_ocean': 'lat'})
        elif 'xt' in dataset.dims:
            dataset = dataset.rename({'xt': 'lon','yt': 'lat'})
    elif origin in ('OISSTv2', 'ERA5'):
        # lat/lon are indexed as 2-D arrays here (presumably on the (y, x)
        # grid - TODO confirm); the first column / first row become the 1-D
        # axes, then the x/y dimensions take the lon/lat names.
        dataset = dataset.assign_coords(lat=dataset.lat.values[:,0],lon=dataset.lon.values[0,:])
        dataset = dataset.rename({'x': 'lon', 'y': 'lat'})
    return dataset

In [5]:
def get_climatology(dataset,var_name_xarray,path_save_climatology):
    """Compute and save the 1981-2010 day-of-year mean of one variable.

    Parameters
    ----------
    dataset : xarray.Dataset
        Dataset with a ``time`` axis and ``lat``/``lon`` coordinates.
    var_name_xarray : str
        Name of the data variable to average.
    path_save_climatology : str
        Path of the NetCDF file to write.

    Returns
    -------
    xarray.Dataset
        Variable ``'<var_name_xarray>_climatology'`` on
        ``(day_of_year, lat, lon)`` with ``day_of_year`` running 1..365.
        The dataset attributes are a copy of the source variable's attributes
        plus a ``'File Author'`` entry.

    Notes
    -----
    29 February samples are removed, but the calendar day-of-year is not
    renumbered afterwards: in leap years every date from 1 March onwards
    still carries a day-of-year one larger than in other years, and
    day-of-year 366 is not averaged at all.
    """
    # Restrict to the 1981-2010 base period.
    all_times = pd.to_datetime(dataset.time)
    in_base_period = (all_times.year>=1981)&(all_times.year<=2010)
    dataset_clima = dataset.isel(time = in_base_period)

    # Drop 29 February.
    base_times = pd.to_datetime(dataset_clima.time)
    is_feb29 = (base_times.day == 29)&(base_times.month == 2)
    dataset_clima = dataset_clima.isel(time = ~is_feb29)

    # One mean field per day of year 1..365.
    doy = pd.to_datetime(dataset_clima.time).day_of_year
    climatology = [dataset_clima.isel(time = doy == day)[var_name_xarray].mean('time')
                   for day in range(1,366)]

    # Copy the attributes: assigning into the original mapping would also add
    # 'File Author' to the caller's dataset as a side effect.
    attrs = dict(dataset[var_name_xarray].attrs)
    attrs['File Author'] = 'Jhayron S. Pérez-Carrasquilla'

    # np.array stacks the daily fields; they are assumed to be ordered
    # (lat, lon), matching the dimension names given here.
    climatology = xr.Dataset({
                 f'{var_name_xarray}_climatology': (['day_of_year','lat','lon'], np.array(climatology)),
                },
                 coords =
                {'day_of_year': (['day_of_year'], np.arange(1,366)),
                 'lat' : (['lat'], dataset.lat.values),
                 'lon' : (['lon'], dataset.lon.values)
                },
                attrs = attrs)

    climatology = climatology.transpose('day_of_year','lat','lon')
    climatology.to_netcdf(path_save_climatology)
    return climatology
    

In [6]:
def get_anomalies(dataset,var_name_xarray,climatology,path_save_anomalies):
    """Subtract the day-of-year climatology from a variable and save the result.

    Parameters
    ----------
    dataset : xarray.Dataset
        Dataset with a ``time`` axis holding ``var_name_xarray``; it is not
        modified (a deep copy is filled instead).
    var_name_xarray : str
        Name of the data variable to convert to anomalies.
    climatology : xarray.Dataset
        Must contain ``'<var_name_xarray>_climatology'`` indexed by
        ``day_of_year``.
    path_save_anomalies : str
        Path of the NetCDF file to write.

    Returns
    -------
    xarray.Dataset
        Copy of ``dataset`` in which the variable holds anomalies and is
        renamed to ``'<var_name_xarray>_anomalies'``.
    """
    anomalies = copy.deepcopy(dataset)
    # Loop-invariant pieces, computed once instead of on every iteration.
    day_of_year = pd.to_datetime(dataset.time).day_of_year
    clim_field = climatology[f'{var_name_xarray}_climatology']
    for day in range(1,367):
        same_day = day_of_year == day
        # Day 366 has no climatology entry; it uses the value of day 365.
        clim_day = min(day, 365)
        anomalies[var_name_xarray][{'time':same_day}] = \
            (dataset[var_name_xarray].isel(time = same_day) \
            - clim_field.sel(day_of_year = clim_day))
    anomalies = anomalies.rename({var_name_xarray:f'{var_name_xarray}_anomalies'})
    anomalies.to_netcdf(path_save_anomalies)
    return anomalies

In [7]:
# Scratch-space directories, all located under one common root.
path_scratch_root = '/glade/scratch/jhayron/Data4Predictability/'
path_daily_datasets = path_scratch_root + 'DailyDatasets/'
path_daily_anoms_datasets = path_scratch_root + 'DailyAnoms/'
path_daily_detrended_anoms = path_scratch_root + 'DailyDetrendedAnoms/'
path_daily_climatologies = path_scratch_root + 'DailyClimatologies/'
path_trends = path_scratch_root + 'trends/'

# Save individual datasets per variable

In [None]:
# Write one merged NetCDF file per (variable, origin) pair.
# 'SST' is listed for both SODA and OISSTv2, so the origin is part of the file
# name; with '<variable>.nc' alone the second SST entry overwrote the first.
# NOTE(review): anything that reads these files must use '<variable>_<origin>.nc'.
for variable, origin in zip(variables, origins):
    print(variable)
    dataset = get_full_dataset(path_files, variable, origin)
    dataset.to_netcdf(f'{path_daily_datasets}{variable}_{origin}.nc')

MLD
OHC100


# Compute climatologies and anomalies

In [None]:
# For every (variable, origin) pair: save the merged daily dataset, then its
# day-of-year climatology and the corresponding anomalies.
# 'SST' is listed for both SODA and OISSTv2, so the origin is part of every
# file name; with '<variable>.nc' alone the second SST entry overwrote the first.
# NOTE(review): anything that reads these files must use '<variable>_<origin>.nc'.
for variable, origin in zip(variables, origins):
    print(variable)
    dataset = get_full_dataset(path_files, variable, origin)
    # The first data variable is taken to be the field of interest.
    var_name_xarray = list(dataset.data_vars.keys())[0]
    file_name = f'{variable}_{origin}.nc'
    dataset.to_netcdf(f'{path_daily_datasets}{file_name}')
    climatology = get_climatology(dataset, var_name_xarray,
                                  f'{path_daily_climatologies}{file_name}')
    anomalies = get_anomalies(dataset, var_name_xarray, climatology,
                              f'{path_daily_anoms_datasets}{file_name}')

In [98]:
def get_trend(dataset,variable,unit):
    """Estimate the least-squares linear trend of a variable at every grid point.

    The regressor is the sample index 0, 1, ..., n-1 along the leading
    dimension, so each slope is expressed per step of that dimension (the
    output ``units`` attribute labels it as per day).

    The slope is computed in closed form with numpy. The earlier version
    called ``LinearRegression`` inside a bare ``except``; if that name was
    not imported, the resulting NameError was swallowed and every
    coefficient silently became 0.

    Parameters
    ----------
    dataset : xarray.Dataset
        Dataset with ``lat``/``lon`` dimension coordinates. ``variable`` is
        expected to have exactly one further dimension (the sample axis,
        presumably ``time`` - TODO confirm against callers).
    variable : str
        Name of the data variable to fit.
    unit : str
        Unit of the variable, used only to build the ``units`` attribute.

    Returns
    -------
    xarray.Dataset
        Variable ``'<variable>_trend'`` on ``(lat, lon)``. Grid points whose
        series contains NaN or infinite values, or that have fewer than two
        samples, get a slope of 0.
    """
    lat = dataset.lat.values
    lon = dataset.lon.values

    # Sample axis first, (lat, lon) last.
    field = dataset[variable].transpose(..., 'lat', 'lon').values
    n_samples = field.shape[0]

    array_coefs = np.zeros([len(lat),len(lon)])
    if n_samples >= 2:
        # OLS slope with intercept: sum((x - mean(x)) * y) / sum((x - mean(x))**2).
        steps = np.arange(n_samples, dtype=float)
        centred_steps = steps - steps.mean()
        denominator = np.sum(centred_steps ** 2)
        # One latitude row at a time keeps the float64 working copy small.
        for lati in range(len(lat)):
            row_series = np.asarray(field[:, lati, :], dtype=float)
            slopes = centred_steps @ row_series / denominator
            # A NaN/inf anywhere in a series makes its slope non-finite;
            # those points are reported as 0.
            array_coefs[lati, :] = np.where(np.isfinite(slopes), slopes, 0.0)

    trend = xr.Dataset({
                 f'{variable}_trend': (['lat','lon'], array_coefs),
                },
                 coords =
                {
                 'lat' : (['lat'], lat),
                 'lon' : (['lon'], lon)
                },
                attrs =
                {'File Author' : 'Jhayron S. Pérez-Carrasquilla','units':f'({unit})/day'})
    return trend

# Join ERA5 files

In [3]:
# Merge the per-file ERA5 data of each variable into one '<variable>_ERA5.nc'.
for variable in ['U10','U200','Z500']:
    print(variable)
    # Dedicated name on purpose: reusing `path_files` here would overwrite the
    # notebook-level input root that get_full_dataset is called with.
    path_variable_files = f'/glade/scratch/jhayron/Data4Predictability/DailyDatasets/{variable}/'
    files_list = np.sort(glob.glob(f'{path_variable_files}*'))
    dataset = xr.open_mfdataset(files_list,combine='nested',concat_dim="time")
    # Strict '>' comparison: a timestamp equal to 1981-01-01T00:00 is dropped.
    dataset = dataset.where(dataset.time>np.datetime64('1981-01-01'),drop=True)
    dataset = dataset.load()
    dataset.to_netcdf(f'/glade/scratch/jhayron/Data4Predictability/DailyDatasets/{variable}_ERA5.nc')

U10
U200
Z500


In [29]:
# Re-open the merged OHC700 file for inspection. A file with this path is
# written by the "Join OHC700 SODA" cell, so that cell has to be run first.
dataset = xr.open_dataset('/glade/scratch/jhayron/Data4Predictability/DailyDatasets/OHC700_SODA.nc')

In [30]:
# Calendar year of every timestamp on the time axis.
years = np.array([stamp.year for stamp in pd.to_datetime(dataset.time.values)])

In [31]:
# Distinct years present on the time axis; in the stored output 1992 is absent.
np.unique(years)

array([1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017, 2018, 2019, 2020])

In [32]:
# Number of distinct years on the time axis.
len(np.unique(years))

39

In [11]:
# First entry of the time axis.
dataset.time.values[0]

numpy.datetime64('1981-01-02T00:00:00.000000000')

# Join OHC700 SODA

In [3]:
# Merge the per-file SODA data of OHC700 into one '<variable>_SODA.nc'.
for variable in ['OHC700']:
    print(variable)
    # Dedicated name on purpose: reusing `path_files` here would overwrite the
    # notebook-level input root that get_full_dataset is called with.
    path_variable_files = f'/glade/scratch/jhayron/Data4Predictability/DailyDatasets/{variable}/'
    files_list = np.sort(glob.glob(f'{path_variable_files}*'))
    dataset = xr.open_mfdataset(files_list,combine='nested',concat_dim="time")
    # Strict '>' comparison: a timestamp equal to 1981-01-01T00:00 is dropped.
    dataset = dataset.where(dataset.time>np.datetime64('1981-01-01'),drop=True)
    dataset = dataset.load()
    dataset.to_netcdf(f'/glade/scratch/jhayron/Data4Predictability/DailyDatasets/{variable}_SODA.nc')

OHC700


In [None]:
# Scratch cell: an unfinished `glob.` expression stood here; it was a
# SyntaxError, so it has been replaced by this comment.