In [1]:
import numpy as np
import pandas as pd
import xarray as xr
from datetime import datetime

# Let pandas display up to 200 columns before it starts eliding them in the cell output.
pd.set_option('display.max_columns', 200)

The original header file is located at [https://met.wur.nl/veenkampen/data/10min_header.xlsm](https://met.wur.nl/veenkampen/data/10min_header.xlsm). It is also possible to read the Excel file directly from that link, i.e. `pd.read_excel("link")`, but for now I use a local, edited version of the file.

In [None]:
# One day of 10-minute observations, fetched straight from the Veenkampen server.
# The text file is read without a header row, so pandas assigns integer column labels.
data = pd.read_csv(
    'https://met.wur.nl/veenkampen/data/2021/02/10min_20210210.txt',
    sep=",",
    header=None,
)

# An existing NetCDF file opened alongside the raw data.
# NOTE(review): `ceasar_df` is not referenced by any other cell shown in this notebook.
ceasar_df = xr.open_dataset('../data/veenkampen/cesar_surface_flux_lc1_t10_v1.0_201909.nc')

# Column names for the raw table, read from a local Excel workbook.
headers = pd.read_excel(
    "../data/veenkampen/10min/headers_plain.xlsx",
    engine='openpyxl',
)

In [None]:
# Normalise the header names: every ".", "(", ")" and space becomes an underscore.
# A single translation table handles all four characters in one pass over the
# column names instead of rebuilding the whole list once per character.
_to_underscore = str.maketrans({char: "_" for char in ".() "})
headers.columns = [name.translate(_to_underscore) for name in headers.columns]

In [None]:
# Display the header table to check the column names.
headers

In [None]:
# Label the header-less observation columns with the names from the header workbook.
# The mapping is purely positional: column i of `data` receives the i-th header name,
# and pandas raises if the two column counts differ.
data.columns = headers.columns
# Display the labelled table.
data

---

## Coords

In [None]:
# Work on an independent copy of the two timestamp columns, so that columns added
# to `time_df` later are never written through to `data`.
time_df = data.loc[:, ['date', 'time']].copy()

In [None]:
# Peek at the first three rows of the date / time columns.
time_df.head(3)

In [None]:
# Build one "<date> <time>" string per row. `assign` returns a new frame, so
# `time_df` is rebound to a fresh object that carries the extra `datetime` column.
time_df = time_df.assign(datetime=time_df['date'] + ' ' + time_df['time'])

In [None]:
# Parse the combined "<date> <time>" strings into pandas timestamps
# (no explicit format is given, so pandas infers it).
time = pd.to_datetime(time_df['datetime'])
# Show the first five parsed values as a quick sanity check.
time.values[:5]

In [None]:
# Origin of the time axis (used in the "hours since ..." units string): midnight on the
# first day of the month that contains the first record.
# `normalize()` zeroes the whole time of day. Replacing only `minute` would leave the
# hour and seconds of the first record in place, so the origin would not be midnight
# if the first record is not within hour 0.
# `.iloc[0]` takes the first row by position, independent of the index labels.
reference_time = time.iloc[0].normalize().replace(day=1)
reference_time

In [None]:
# Preview the first three rows of the full observation table.
data.head(3)

In [None]:
# Column triplets for the ventilated dry and wet temperature readings.
# NOTE(review): the unsuffixed / "_1" / "_2" columns are bound to mean / min / max
# purely by position here; confirm that ordering against the header file.
dry_columns = ('T_Ventilated_dry', 'T_Ventilated_dry_1', 'T_Ventilated_dry_2')
wet_columns = ('T_ventilated_wet', 'T_ventilated_wet_1', 'T_ventilated_wet_2')

TventDry_mean, TventDry_min, TventDry_max = (data[column] for column in dry_columns)
TventWet_mean, TventWet_min, TventWet_max = (data[column] for column in wet_columns)

In [None]:
# Offset of every record from the reference time, then that offset in fractional hours.
# Series arithmetic and the `.dt` accessor handle the whole column at once, replacing
# two Python-level `apply` loops over individual timestamps.
td = time - reference_time
time_hrs = td.dt.total_seconds() / 3600

In [None]:
# Display the time axis expressed as hours elapsed since `reference_time`.
time_hrs

## Data variables

In [None]:
# Attributes for the one variable documented so far; `units` is still an empty string.
_tvent_dry_mean_attrs = {
    'standard_name': 'T ventilated dry',
    'measure': 'mean',
    'units': '',
}

# Per-variable metadata, keyed by data-variable name.
variables_attrs = {'TventDry_mean': _tvent_dry_mean_attrs}

In [None]:
# Metadata for the time coordinate. The units string embeds the reference timestamp,
# so the stored hour offsets can be related back to calendar time.
coord_attr = dict(
    units=f"hours since {reference_time}",
    standard_name="time",
)

In [None]:
# manual initialization
a = xr.Dataset(
    coords = {
        'time' : ("time", time_hrs.values.astype('float32'), coord_attr),
        'reference_time': (reference_time),
    },
    
    data_vars = {
        'TventDry_mean' : ("time", TventDry_mean, variables_attrs['TventDry_mean']),
        'TventDry_min' : ("time", TventDry_min,),
        'TventDry_max' : ("time", TventDry_max,),
        'TventWet_mean' : ("time", TventWet_mean,),
        'TventWet_min' : ("time", TventWet_min,),
        'TventWet_max' : ("time", TventWet_max,),
    },
    
    attrs = {
        'Conventions' : 'CF',
        'Title' : 'Veenkampen',
        'Institution' : 'Wageningen University',
    },
)

a

In [None]:
# Prepare every remaining column as an xarray data variable along the "time" dimension.
# Both preparation steps are written so the cell can be re-run without errors:
# dropping a `time` column that is already gone is ignored, and the date is only
# converted while it is not yet an integer.
data = data.drop(columns=['time'], errors='ignore')
if not pd.api.types.is_integer_dtype(data['date']):
    # Strip the dashes from the date strings and store the result as an int64
    # (vectorised string operation instead of a per-row lambda).
    data['date'] = data['date'].str.replace('-', '', regex=False).astype('int64')

# {column name: {row label: value}} view of the table, then one ("time", Series)
# pair per column, which is the form xr.Dataset accepts for `data_vars`.
d_no_time = data.to_dict()
d_fin = {name: ("time", pd.Series(values)) for name, values in d_no_time.items()}

In [None]:
# automatic initialization
b = xr.Dataset(
    coords = {
        'time' : ("time", time_hrs.values.astype('float32'), coord_attr),
    },
    
    data_vars = d_fin,
    
    attrs = {
        'Title' : 'Veenkampen',
        'Institution' : 'Wageningen University',
    },
)

b

Try to find standards for variable names (e.g. WMO, CMOR, CF),
as well as standards for units, attributes, etc.

Approach:

1. Keep the formatting as it is done now.
2. Focus on just some of the variables.
3. Changing some variables from e.g. float64 to float32 could be a best practice to save disk space.

In [None]:
# Display dataset `b` once more before it is written to disk.
b

## Writing NetCDF

In [None]:
# Compression settings (zlib, level 5) to apply per variable when writing.
comp = dict(zlib=True, complevel=5)

# Per-variable encoding table. Only numeric variables are listed.
# NOTE(review): text/object columns are skipped because some backend versions are
# reported to reject compression on variable-length strings; confirm for the engine in use.
encoding = {
    name: comp
    for name, variable in b.data_vars.items()
    if np.issubdtype(variable.dtype, np.number)
}

# The encoding table has to be passed explicitly; without it the file is written
# uncompressed and `comp` / `encoding` have no effect.
# NOTE(review): the file name says 20210208 while the data was read from
# 10min_20210210.txt; confirm which date is intended.
b.to_netcdf("../data/veenkampen/10min/10min_20210208.nc", encoding=encoding)

In [None]:
# Redundant with the import at the top of the notebook; kept so this read-back
# check does not gain a new dependency on earlier cells.
import xarray as xr

# Read the written file back to verify it. `decode_times` takes a boolean: the
# string 'true' only worked because any non-empty string is truthy.
xr.open_dataset("../data/veenkampen/10min/10min_20210208.nc", decode_times=True)