In [1]:
# Standard Python modules
import os, sys
import glob
import numpy as np
import pandas as pd
import xarray as xr
import re

# extras
%matplotlib inline
import metpy.calc as mpcalc
from metpy.units import units
from scipy import stats
import dask.dataframe as dd

# Make project-local modules under ../modules importable
# (no such module is imported in the cells shown here)
sys.path.append('../modules') # Path to modules


pd.options.display.float_format = "{:,.2f}".format # pandas tables display floats with a thousands separator and two decimals

In [2]:
# --- Directory layout used by this notebook ---
# figures
path_to_figs = '../figs/'
# output files (numerical results, intermediate datafiles) -- read & write
path_to_out = '../out/'
# data directory (absolute, machine-specific path)
path_to_data = '/home/dnash/SEAK_clim_data/'

In [3]:
# Glob pattern matching the MERRA2 AR-scale text files to process
filename_pattern = '/data/downloaded/Reanalysis/MERRA2/ARScale/TimeSeries/MERRA_ARCats_*.txt'

# glob returns matches in arbitrary order, so sort them (lexicographically)
# to get a reproducible file order
filenames = sorted(glob.glob(filename_pattern))
print(len(filenames))

207936


In [9]:
def preprocess_MERRA2_txt_file(fname):
    """Read one MERRA2 AR-scale text file into a single-grid-cell xarray Dataset.

    The file is parsed as space-delimited text without a header row; its eight
    columns are named year, month, day, hour, ivt, ar_scale, tIVT and duration.
    The four date columns are merged into one datetime ``time`` coordinate.

    Parameters
    ----------
    fname : str
        Path to the text file. The first two numbers found in the file's base
        name are used as latitude and longitude, in that order.

    Returns
    -------
    xarray.Dataset
        Variables ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` indexed by
        ``time``, with length-1 ``lat`` and ``lon`` dimensions holding the
        coordinates parsed from the file name.

    Raises
    ------
    ValueError
        If fewer than two numbers can be found in the file's base name.
    """
    columns = ['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration']
    date_columns = ['year', 'month', 'day', 'hour']

    ## read just one file
    df = pd.read_csv(fname, header=None, names=columns, delimiter=' ')
    ## put time info into single column in datetime format and use it as the
    ## index, so that to_xarray() yields a 'time' dimension directly
    df['time'] = pd.to_datetime(df[date_columns])
    df = df.drop(columns=date_columns).set_index('time')

    ## get lat and lon values from fname
    # Search only the base name: digits in directory names (e.g. "MERRA2")
    # would otherwise shift the positions of the lat/lon matches.
    numbers = re.findall(r"[-+]?(?:\d*\.*\d+)", os.path.basename(fname))
    if len(numbers) < 2:
        raise ValueError(f"could not parse lat/lon from file name: {fname!r}")
    lat_val = float(numbers[0])
    lon_val = float(numbers[1])

    # convert to xarray and attach the grid-cell location
    ds = df.to_xarray()
    ds = ds.assign_coords(lat=lat_val, lon=lon_val) # scalar lat and lon coords
    ds = ds.expand_dims(dim={"lat": 1, "lon": 1}) # promote them to length-1 dimensions

    return ds

def dask_2_xarray(ddf, indexname='index'):
    """Build an xarray Dataset from a dask DataFrame.

    The frame's index is stored under ``indexname``; every column becomes a
    variable along that dimension, backed by a dask array whose chunk sizes
    have been computed.
    """
    dataset = xr.Dataset()
    dataset[indexname] = ddf.index
    for column in ddf.columns:
        column_values = ddf[column].to_dask_array()
        # chunk sizes of a dask-dataframe column are unknown until computed
        dataset[column] = (indexname, column_values.compute_chunk_sizes())

    return dataset

def preprocess_MERRA2_txt_file_using_dask(fname, times=None):
    """Read one MERRA2 AR-scale text file into a dask-backed, single-grid-cell Dataset.

    The data columns (ivt, ar_scale, tIVT, duration) are loaded through dask,
    while the timestamps are read from the same file with pandas and used as
    the ``time`` coordinate.

    Parameters
    ----------
    fname : str
        Path to the text file. The first two numbers found in the file's base
        name are used as latitude and longitude, in that order.
    times : optional
        Not used. The timestamps are always taken from ``fname`` itself; the
        parameter is kept only so that existing calls passing it keep working.

    Returns
    -------
    xarray.Dataset
        Variables ``ivt``, ``ar_scale``, ``tIVT`` and ``duration`` indexed by
        ``time``, with length-1 ``lat`` and ``lon`` dimensions holding the
        coordinates parsed from the file name.

    Raises
    ------
    ValueError
        If fewer than two numbers can be found in the file's base name.
    """
    columns = ['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration']
    date_columns = ['year', 'month', 'day', 'hour']

    ## get time information from file
    df = pd.read_csv(fname, header=None, names=columns, delimiter=' ')
    ## put time info into single column in datetime format
    # (kept in its own local so the `times` argument is not silently overwritten)
    file_times = pd.to_datetime(df[date_columns])

    ## create a dask dataframe
    ddf = dd.read_csv(fname, header=None, names=columns, delimiter=' ',
                      dtype={"year": int, "month": int, "day": int, "hour": int, "ivt": float, "ar_scale": int, "tIVT": float, "duration": int})

    ddf = ddf.drop(date_columns, axis=1) # drop the date columns
    ## convert to xarray
    ds = dask_2_xarray(ddf)

    ## replace the positional index with the timestamps and call it 'time'
    # A plain array is passed (not the pandas Series), so no extra 'dim_0'
    # dimension is created that would have to be dropped afterwards.
    ds = ds.assign_coords(index=file_times.to_numpy())
    ds = ds.rename({'index': 'time'})

    ## get lat and lon values from fname
    # Search only the base name: digits in directory names (e.g. "MERRA2")
    # would otherwise shift the positions of the lat/lon matches.
    numbers = re.findall(r"[-+]?(?:\d*\.*\d+)", os.path.basename(fname))
    if len(numbers) < 2:
        raise ValueError(f"could not parse lat/lon from file name: {fname!r}")
    lat_val = float(numbers[0])
    lon_val = float(numbers[1])
    ds = ds.assign_coords(lat=lat_val, lon=lon_val) # scalar lat and lon coords
    ds = ds.expand_dims(dim={"lat": 1, "lon": 1}) # promote them to length-1 dimensions

    return ds

In [None]:
## read the first file and build its datetime series; it is handed to the
## per-file preprocessing function below as `times`
df = pd.read_csv(
    filenames[0],
    delimiter=' ',
    header=None,
    names=['year', 'month', 'day', 'hour', 'ivt', 'ar_scale', 'tIVT', 'duration'],
)
times = pd.to_datetime(df[['year', 'month', 'day', 'hour']])

## one single-grid-cell dataset per file
ds_lst = [preprocess_MERRA2_txt_file_using_dask(fname, times) for fname in filenames]

## combine ds_lst by coordinates
ds_final = xr.combine_by_coords(ds_lst)
ds_final

In [30]:
## TODO: add attributes (units, etc.)

## TODO: save monthly files