In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Location of the SWOB XML observation files. SWOB_DIR is concatenated directly
# with a relative glob pattern when the files are read, so a non-empty value
# must end with a path separator. With the empty default the glob is resolved
# relative to the current working directory.
SWOB_DIR = ''
# NOTE(review): SWOB_FILE is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
SWOB_FILE = ''

In [None]:
import dask
import dask.bag as db
import dask.dataframe as dd
import dask.distributed
import os
import pathlib
import pandas as pd
import xml.dom.minidom
import seaborn as sns

import itertools

In [None]:
import dask_jobqueue

In [None]:
# Root directory for project data, read from the DATA_DIR environment variable.
# os.getenv returns None when the variable is unset and pathlib.Path(None) then
# fails with an opaque TypeError, so check explicitly and fail with a message
# that tells the user what to fix.
_data_dir_env = os.getenv('DATA_DIR')
if _data_dir_env is None:
    raise RuntimeError(
        'The DATA_DIR environment variable is not set; '
        'export it to the project data directory before running this notebook.'
    )
DATA_DIR = pathlib.Path(_data_dir_env)

In [None]:
# Describe the SLURM job that Dask workers are launched in.
# NOTE(review): per the dask_jobqueue documentation, `cores` and `memory` are
# totals for one job and `processes` splits that job into this many worker
# processes — confirm against the installed version.
cluster = dask_jobqueue.SLURMCluster(
    cores=12,
    processes=6,
    memory='128G',
    # Shell commands intended to set up the `smc01` conda environment for the job.
    # NOTE(review): newer dask_jobqueue releases rename `env_extra` to
    # `job_script_prologue` — confirm which name the installed version expects.
    env_extra=['source ~/.bash_profile','conda activate smc01'],
    name='smc01-dask',
    # Worker scratch space is placed under the project data directory.
    local_directory=DATA_DIR / 'dask',
    walltime='3:00:00'
)

In [None]:
# Request two SLURM jobs, each shaped by the SLURMCluster settings above.
cluster.scale(jobs=2)  # Scale to two working nodes as configured.
# Connect this notebook session to the cluster's scheduler.
client = dask.distributed.Client(cluster)

In [None]:
# Display the client so its connection summary is rendered as the cell output.
client

# With dask bag

In [None]:
def _child_elements_to_dict(parent):
    """Map each child element's ``name`` attribute to its ``value`` attribute.

    Only element nodes are considered. Whitespace text nodes and XML comments
    (present whenever the document is indented or annotated) carry no
    attributes and are skipped instead of raising.
    """
    return {
        child.attributes['name'].value: child.attributes['value'].value
        for child in parent.childNodes
        if child.nodeType == child.ELEMENT_NODE
    }


def string_to_dict(obs_xml_string):
    """Flatten one observation XML document into a single dictionary.

    The direct children of the first ``identification-elements`` tag and of
    the first ``elements`` tag are read as ``name``/``value`` attribute pairs.
    Both groups are merged into one dict; when a name appears in both, the
    entry from ``elements`` wins.

    Parameters
    ----------
    obs_xml_string : str or bytes
        The XML document to parse.

    Returns
    -------
    dict
        Attribute ``name`` -> attribute ``value``. All values are strings
        exactly as they appear in the XML.

    Raises
    ------
    xml.parsers.expat.ExpatError
        If the input is not well-formed XML.
    IndexError
        If either the ``identification-elements`` or ``elements`` tag is absent.
    KeyError
        If a child element lacks a ``name`` or ``value`` attribute.
    """
    obs_data = xml.dom.minidom.parseString(obs_xml_string)

    metadata = obs_data.getElementsByTagName('identification-elements')[0]
    metadata_dict = _child_elements_to_dict(metadata)

    elements = obs_data.getElementsByTagName('elements')[0]
    obs_dict = _child_elements_to_dict(elements)

    return {**metadata_dict, **obs_dict}

In [None]:
# Lazily read every XML file matching the 20201125 glob under SWOB_DIR.
# NOTE(review): dask's read_text is line-oriented (one bag element per line of
# text), so parsing each element as a full XML document presumably relies on
# every file holding its document on a single line — confirm.
bag = db.read_text(SWOB_DIR + '20201125/*/*.xml')

In [None]:
# Display the (still lazy) bag as the cell output.
bag

In [None]:
# Parse each text element of the bag into a flat dict of observation fields.
obs_dicts = bag.map(string_to_dict)

In [None]:
# Display the mapped bag as the cell output.
obs_dicts

In [None]:
# Materialise roughly 1% of the parsed observations as a plain Python list.
# A fixed random_state makes the draw repeatable, so the inspection cells that
# follow show the same records after a kernel restart.
sample = obs_dicts.random_sample(0.01, random_state=0).compute()

In [None]:
# `sample` is the already-computed result of the bag (a list of dicts), which
# has no `.to_dataframe()` method; build the frame with pandas directly. Keys
# missing from a given record become NaN in that row.
sample_df = pd.DataFrame(sample)

In [None]:
# Number of observations that ended up in the sample.
len(sample)

In [None]:
# Print the air temperature of every sampled observation that reports one;
# records without an 'air_temp' entry are skipped.
air_temps = (record['air_temp'] for record in sample if 'air_temp' in record)
for air_temp in air_temps:
    print(air_temp)

In [None]:
# Convert the bag of observation dicts into a lazy dask DataFrame.
# NOTE(review): no `meta` is supplied, so the column set is presumably inferred
# from the first records; if observations carry differing keys this may not
# match later partitions — confirm.
obs_df = obs_dicts.to_dataframe()

In [None]:
# Execute the graph and pull the full result back into local memory.
computed = obs_df.compute()

In [None]:
# List the columns available in the computed frame.
computed.columns

In [None]:
# Values parsed from XML attributes are strings; convert this column to numbers.
# errors='coerce' turns any non-numeric placeholder into NaN instead of
# aborting the whole conversion on the first unparsable entry.
# NOTE(review): SWOB feeds reportedly mark missing readings with a sentinel
# string such as 'MSNG' — confirm that treating those as NaN is intended.
computed['avg_air_temp_pst10mts'] = pd.to_numeric(
    computed['avg_air_temp_pst10mts'], errors='coerce'
)

In [None]:
# Average the numeric columns for each wmo_synop_id. Every column parsed from
# XML is a string unless explicitly converted, and recent pandas versions raise
# on mean() over non-numeric columns rather than silently dropping them, so
# restrict the aggregation to numeric columns explicitly.
computed.groupby(by='wmo_synop_id').mean(numeric_only=True)

In [None]:
# Inspect the timestamp and air temperature columns side by side.
computed[['date_tm', 'air_temp']]

In [None]:
# Values parsed from XML attributes are strings; convert air_temp to numbers so
# it can be plotted. errors='coerce' turns any non-numeric placeholder into NaN
# instead of aborting the whole conversion on the first unparsable entry.
# NOTE(review): SWOB feeds reportedly mark missing readings with a sentinel
# string such as 'MSNG' — confirm that treating those as NaN is intended.
computed['air_temp'] = pd.to_numeric(computed['air_temp'], errors='coerce')

In [None]:
# Quick look at air_temp plotted against the frame's index.
computed['air_temp'].plot()

In [None]:
# Plot air temperature against the observation timestamp.
# NOTE(review): date_tm is never converted in this notebook, so it is still the
# string read from the XML attribute; parsing it to datetimes first would
# presumably give a proper time axis — confirm the timestamp format.
sns.lineplot(data=computed, x='date_tm', y='air_temp')