# Example Analysis

In [1]:
import xarray as xr
import pandas as pd
import dask.dataframe as dd
import hvplot.pandas  # noqa
import hvplot.dask  # noqa

# Build Dask Cluster
1. Use gui interface to create a new cluster with ~10 workers
2. Use < > to insert an "import Client" statement. This is critical because it is how your script knows to use yoru cluster. 
3. Execute client cell
4. Execute your Dask Cell. Note that once you have a cluster running you do not need to re-import the client. 
5. When finished, always shut down your cluster. 

In [2]:
from dask.distributed import Client

client = Client("tcp://10.0.128.146:42701")
client

0,1
Client  Scheduler: tcp://10.0.128.146:42701  Dashboard: /user/daxsoule/proxy/8787/status,Cluster  Workers: 8  Cores: 8  Memory: 10.00 GB


In [3]:
from pylab import rcParams

In [4]:
60*24*7

10080

## Read in RS03ECAL-MJ03E-06-BOTPTA302 Data

In [5]:
#!head /home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/deployment0001_RS03ECAL-MJ03E-06-BOTPTA302-streamed-botpt_nano_sample_20140904T000000-20141128T060000.100000.nc\#fillmisma_resampled.nc

In [7]:
ds =xr.open_dataset('/home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/deployment0001_RS03ECAL-MJ03E-06-BOTPTA302-streamed-botpt_nano_sample_20191012T060000-20191014T235959.950000_resampled.nc')
ds

<xarray.Dataset>
Dimensions:          (index: 3960)
Coordinates:
  * index            (index) datetime64[ns] 2019-10-12T06:00:00 ... 2019-10-14T23:59:00
Data variables:
    bottom_pressure  (index) float32 ...

In [10]:
ds = xr.open_mfdataset('/home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/*_resampled.nc', parallel=True).chunk(10080)
ds

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  """Entry point for launching an IPython kernel.


<xarray.Dataset>
Dimensions:          (index: 2368081)
Coordinates:
  * index            (index) datetime64[ns] 2015-02-09T05:59:00 ... 2019-10-14T23:59:00
Data variables:
    bottom_pressure  (index) float32 dask.array<chunksize=(10080,), meta=np.ndarray>

## Create Dask DataFrame

In [11]:
df = ds.to_dask_dataframe()
df = df.rename(columns={"index": "time"});
df = df.set_index('time')

In [12]:
df.head()

Unnamed: 0_level_0,bottom_pressure
time,Unnamed: 1_level_1
2015-02-09 05:59:00,2239.624512
2015-02-09 06:00:00,2239.624512
2015-02-09 06:01:00,2239.625488
2015-02-09 06:02:00,2239.632568
2015-02-09 06:03:00,2239.635742


In [13]:
df.describe()

Unnamed: 0_level_0,bottom_pressure
npartitions=1,Unnamed: 1_level_1
,float64
,...


## Plot RS03ECAL-MJ03E-06-BOTPTA302 using Dask

In [16]:
df.hvplot(y='bottom_pressure', datashade =True, height=200,
                       flip_yaxis=True)

## Read in F Data

In [19]:
ds =xr.open_dataset('/home/jovyan/data/botpt/RS03CCAL-MJ03F-05-BOTPTA301/deployment0001_RS03CCAL-MJ03F-05-BOTPTA301-streamed-botpt_nano_sample_20140904T000000-20141128T115959.950000_resampled.nc')
ds

<xarray.Dataset>
Dimensions:          (index: 121362)
Coordinates:
  * index            (index) datetime64[ns] 2014-09-04 ... 2014-11-28T11:59:00
Data variables:
    bottom_pressure  (index) float32 ...

In [20]:
del ds
ds = xr.open_mfdataset('/home/jovyan/data/botpt/RS03CCAL-MJ03F-05-BOTPTA301/*_resampled.nc', parallel=True).chunk(10080)
ds

will change. To retain the existing behavior, pass
combine='nested'. To use future default behavior, pass
combine='by_coords'. See
http://xarray.pydata.org/en/stable/combining.html#combining-multi

  
to use the new `combine_by_coords` function (or the
`combine='by_coords'` option to `open_mfdataset`) to order the datasets
before concatenation. Alternatively, to continue concatenating based
on the order the datasets are supplied in future, please use the new
`combine_nested` function (or the `combine='nested'` option to
open_mfdataset).
  from_openmfds=True,


<xarray.Dataset>
Dimensions:          (index: 2606374)
Coordinates:
  * index            (index) datetime64[ns] 2014-09-04 ... 2019-10-14T23:59:00
Data variables:
    bottom_pressure  (index) float32 dask.array<chunksize=(10080,), meta=np.ndarray>

## Create Dask DataFrame for F

In [21]:
dff = ds.to_dask_dataframe()
dff = dff.rename(columns={"index": "time"});
dff = dff.set_index('time')

## Plot RS03CCAL-MJ03F-05-BOTPTA301 using Dask

In [22]:
dff.hvplot(y='bottom_pressure', datashade =True, height=200,
                       flip_yaxis=True)

## Merge E and F

In [26]:
test = dd.merge(df, dff,how='outer', indicator=True, left_index=True, right_index=True, suffixes=('_E', '_F'))

In [27]:
df_botptMerge = test[test['_merge'] == 'both']
del df_botptMerge['_merge']

In [28]:
df_botptMerge.head()

Unnamed: 0_level_0,bottom_pressure_E,bottom_pressure_F
time,Unnamed: 1_level_1,Unnamed: 2_level_1


In [35]:
merge = dd.merge(df, dff, left_index=True, right_index=True)

MergeError: Must pass left_on or left_index=True

In [33]:
merge.head(10)

Unnamed: 0_level_0,bottom_pressure_x,bottom_pressure_y
time,Unnamed: 1_level_1,Unnamed: 2_level_1
