# Example Analysis

In [117]:
import xarray as xr
import pandas as pd
import dask.dataframe as dd
import hvplot.pandas  # noqa
import hvplot.dask  # noqa
import numpy as np
from scipy import stats

In [118]:
import warnings
warnings.filterwarnings('ignore')

# Build Dask Cluster
1. Use the GUI interface to create a new cluster with ~10 workers
2. Use < > to insert an "import Client" statement. This is critical because it is how your script knows to use your cluster. 
3. Execute client cell
4. Execute your Dask Cell. Note that once you have a cluster running you do not need to re-import the client. 
5. When finished, always shut down your cluster. 

In [119]:
from dask.distributed import Client

# Scheduler address of the cluster this notebook attaches to.
# NOTE(review): the address is hard-coded; per the steps above it is inserted
# for a specific cluster, so it presumably has to be replaced whenever a new
# cluster is created.
SCHEDULER_ADDRESS = "tcp://10.0.131.0:40485"

client = Client(SCHEDULER_ADDRESS)
client

0,1
Client  Scheduler: tcp://10.0.131.0:40485  Dashboard: /user/daxsoule/proxy/8787/status,Cluster  Workers: 5  Cores: 5  Memory: 6.25 GB


## Read in RS03ECAL-MJ03E-06-BOTPTA302 Data

In [120]:
#!head /home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/deployment0001_RS03ECAL-MJ03E-06-BOTPTA302-streamed-botpt_nano_sample_20140904T000000-20141128T060000.100000.nc\#fillmisma_resampled.nc

In [121]:
# ds =xr.open_dataset('/home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/deployment0001_RS03ECAL-MJ03E-06-BOTPTA302-streamed-botpt_nano_sample_20191012T060000-20191014T235959.950000_resampled.nc')
# ds

In [122]:
# Open every resampled eastern-caldera BOTPT file as a single dataset and
# re-chunk it into blocks of 10080 samples.
# 10080 = 7 * 24 * 60, i.e. one week per chunk if samples are one minute apart
# (the timestamps displayed in this notebook fall on whole minutes - TODO confirm).
ds1 = (
    xr.open_mfdataset(
        '/home/jovyan/data/botpt/RS03ECAL-MJ03E-06-BOTPTA302/*_resampled.nc',
        parallel=True,
    )
    .chunk(10080)
    # Rename in one step rather than copying the variable and deleting the
    # original; the suffix keeps the two sites apart after the later merge.
    .rename({'bottom_pressure': 'bottom_pressure_eastern'})
)
ds1

<xarray.Dataset>
Dimensions:                  (index: 2368081)
Coordinates:
  * index                    (index) datetime64[ns] 2015-02-09T05:59:00 ... 2019-10-14T23:59:00
Data variables:
    bottom_pressure_eastern  (index) float32 dask.array<chunksize=(10080,), meta=np.ndarray>

## Read in Central Caldera

In [123]:
# ds =xr.open_dataset('/home/jovyan/data/botpt/RS03CCAL-MJ03F-05-BOTPTA301/deployment0001_RS03CCAL-MJ03F-05-BOTPTA301-streamed-botpt_nano_sample_20140904T000000-20141128T115959.950000_resampled.nc')
# ds

In [124]:
# Open every resampled central-caldera BOTPT file as a single dataset and
# re-chunk it into blocks of 10080 samples.
# 10080 = 7 * 24 * 60, i.e. one week per chunk if samples are one minute apart
# (the timestamps displayed in this notebook fall on whole minutes - TODO confirm).
ds2 = (
    xr.open_mfdataset(
        '/home/jovyan/data/botpt/RS03CCAL-MJ03F-05-BOTPTA301/*_resampled.nc',
        parallel=True,
    )
    .chunk(10080)
    # Rename in one step rather than copying the variable and deleting the
    # original; the suffix keeps the two sites apart after the later merge.
    .rename({'bottom_pressure': 'bottom_pressure_central'})
)
ds2

<xarray.Dataset>
Dimensions:                  (index: 2606374)
Coordinates:
  * index                    (index) datetime64[ns] 2014-09-04 ... 2019-10-14T23:59:00
Data variables:
    bottom_pressure_central  (index) float32 dask.array<chunksize=(10080,), meta=np.ndarray>

## Create DataFrame

In [125]:
# Convert both xarray datasets to pandas DataFrames, then drop the dataset
# names, which are not referenced again in the cells below.
#df = ds1.to_dask_dataframe()
df1, df2 = ds1.to_dataframe(), ds2.to_dataframe()
del ds1, ds2

In [126]:
# Add each frame's timestamp index as a string column named 'Datetime';
# this column is the key used to merge the two frames in a later cell.
df1['Datetime'] = np.datetime_as_string(df1.index.values)
df2['Datetime'] = np.datetime_as_string(df2.index.values)

In [127]:
# Keep only the first 19 characters of each timestamp string, which is the
# length of 'YYYY-MM-DDTHH:MM:SS' (e.g. '2019-10-14T23:55:00').
ISO_SECONDS_LEN = 19
df1['Datetime'] = df1['Datetime'].str[:ISO_SECONDS_LEN]
df2['Datetime'] = df2['Datetime'].str[:ISO_SECONDS_LEN]

In [128]:
# Inner-join the two sites on the 'Datetime' string key, so only timestamps
# present in both frames survive. The result carries a fresh integer index;
# the time index is rebuilt from 'Datetime' in a later cell.
df_botpt = pd.merge(left=df1, right=df2, on='Datetime', how='inner')
del df1, df2
df_botpt.tail()

Unnamed: 0,bottom_pressure_eastern,Datetime,bottom_pressure_central
2362766,2240.960938,2019-10-14T23:55:00,2254.007324
2362767,2240.949463,2019-10-14T23:56:00,2253.995605
2362768,2240.938232,2019-10-14T23:57:00,2253.984375
2362769,2240.925781,2019-10-14T23:58:00,2253.970215
2362770,2240.910156,2019-10-14T23:59:00,2253.955811


## trimming data frame

In [129]:
# Re-index the merged frame by time, difference the two pressure series, trim
# to the analysis window, and mask samples that stray from a rolling median.

ROLLING_WINDOW = 1000    # number of rows in the trailing median window
SPIKE_TOLERANCE = 0.1    # max |depthDiff - rolling median| for a sample to be kept

# Guarded so the cell can be re-run: after the first run 'Datetime' is the
# index rather than a column, and looking it up again would raise KeyError.
if 'Datetime' in df_botpt.columns:
    df_botpt = df_botpt.set_index(pd.to_datetime(df_botpt['Datetime']))
    del df_botpt['Datetime']

# Eastern minus central bottom pressure (the column is named 'depthDiff', but
# it is computed directly from the two pressure columns).
df_botpt['depthDiff'] = df_botpt['bottom_pressure_eastern'] - df_botpt['bottom_pressure_central']

# .copy() makes the trimmed frame independent of the original, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
df_botpt = df_botpt.loc['2015-05-01 00:00:00':'2019-10-14 00:00:00'].copy()

# NOTE: despite its name, 'spikes' is True for samples that are NOT spikes,
# i.e. those within SPIKE_TOLERANCE of the rolling median. The rolling median
# is NaN until ROLLING_WINDOW rows are available, so the first
# ROLLING_WINDOW - 1 rows compare False and are masked out of 'cleanDiff'.
rolling_median = df_botpt['depthDiff'].rolling(ROLLING_WINDOW).median()
df_botpt['spikes'] = (df_botpt['depthDiff'] - rolling_median).abs() < SPIKE_TOLERANCE

# Keep depthDiff where the sample passed the check; everything else becomes NaN.
df_botpt['cleanDiff'] = df_botpt['depthDiff'].where(df_botpt['spikes'])
#df_botpt['bottom_pressure_eastern']= df_botpt.bottom_pressure_eastern.round(3)
#df_botpt['bottom_pressure_central']= df_botpt.bottom_pressure_central.round(3)
df_botpt.sample(100)

Unnamed: 0_level_0,bottom_pressure_eastern,bottom_pressure_central,depthDiff,spikes,cleanDiff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-05-09 05:42:00,2241.908936,2256.647705,-14.738770,True,-14.738770
2016-06-17 04:25:00,2243.109619,2257.038086,-13.928467,True,-13.928467
2017-07-17 09:23:00,2240.665527,2254.285156,-13.619629,True,-13.619629
2018-09-22 19:32:00,2242.506348,2255.766846,-13.260498,True,-13.260498
2017-06-20 14:48:00,2241.615967,2255.259521,-13.643555,True,-13.643555
...,...,...,...,...,...
2019-04-02 11:35:00,2240.718750,2253.823730,-13.104980,True,-13.104980
2015-11-27 21:27:00,2244.106934,2258.300049,-14.193115,True,-14.193115
2017-06-15 17:24:00,2240.117432,2253.759766,-13.642334,True,-13.642334
2016-01-05 19:04:00,2242.676270,2256.812256,-14.135986,True,-14.135986


In [73]:
# Show 100 randomly chosen rows as a spot check of the cleaned frame.
df_botpt.sample(n=100)

Unnamed: 0_level_0,bottom_pressure_eastern,bottom_pressure_central,depthDiff,spikes,cleanDiff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-06-18 20:59:00,2243.180420,2257.750488,-14.570068,True,-14.570068
2019-07-17 20:46:00,2242.213623,2255.305908,-13.092285,True,-13.092285
2018-03-19 04:26:00,2240.148193,2253.582275,-13.434082,True,-13.434082
2015-06-02 12:23:00,2240.910889,2255.552979,-14.642090,True,-14.642090
2017-08-25 05:58:00,2240.733398,2254.325684,-13.592285,True,-13.592285
...,...,...,...,...,...
2018-11-22 06:19:00,2242.085205,2255.277344,-13.192139,True,-13.192139
2017-05-06 05:50:00,2242.941162,2256.611328,-13.670166,True,-13.670166
2016-04-03 06:55:00,2242.550781,2256.576660,-14.025879,True,-14.025879
2017-06-05 12:14:00,2240.328369,2253.988770,-13.660400,True,-13.660400


In [14]:
# Wrap the in-memory pandas frame as a Dask DataFrame split into 30 partitions,
# then print its structure summary.
N_PARTITIONS = 30
sd_botpt = dd.from_pandas(df_botpt, npartitions=N_PARTITIONS)
print(sd_botpt)

Dask DataFrame Structure:
                    bottom_pressure_eastern bottom_pressure_central depthDiff spikes
npartitions=30                                                                      
2015-05-01 00:00:00                 float32                 float32   float32   bool
2015-06-22 14:07:00                     ...                     ...       ...    ...
...                                     ...                     ...       ...    ...
2019-08-21 02:29:00                     ...                     ...       ...    ...
2019-10-14 00:00:00                     ...                     ...       ...    ...
Dask Name: from_pandas, 30 tasks


## Plot RS03ECAL-MJ03E-06-BOTPTA302 and RS03CCAL-MJ03F-05-BOTPTA301 bottom pressure using Dask

In [15]:
# One datashaded panel per site, stacked in a single column. Each panel gets
# its own axes (shared_axes=False) and the y-axis is flipped.
pressure_columns = ['bottom_pressure_eastern', 'bottom_pressure_central']
sd_botpt.hvplot(
    x='Datetime',
    y=pressure_columns,
    datashade=True,
    height=400,
    flip_yaxis=True,
    subplots=True,
    shared_axes=False,
).cols(1)

## Plot the cleaned pressure difference (eastern minus central)

In [130]:
# Datashaded plot of the spike-masked pressure difference against the time index.
df_botpt.hvplot(
    y='cleanDiff',
    datashade=True,
    height=400,
    flip_yaxis=False,
)

In [None]:
# Datashaded plot of the boolean 'spikes' mask against the time index
# (True marks samples that were kept in 'cleanDiff').
df_botpt.hvplot(
    y='spikes',
    datashade=True,
    height=400,
    flip_yaxis=False,
)

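### Read in tide data Eastern Caldera

**TODO:** the cell below reads gravimeter `.proc` files from `bravoseis_data`, not tide data; confirm which source is intended and update this heading or the cell.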

In [None]:
# Read every gravimeter .proc file in the directory into one Dask DataFrame,
# index it by the 'fecha' column, and drop the columns that are not used.
#
# NOTE(review): `dateparse` is not defined in any cell visible in this
# notebook; unless it is defined elsewhere, this cell raises NameError on a
# fresh kernel.
# NOTE(review): 'fecha' is listed in parse_dates and also typed as object in
# the dtype mapping - confirm which of the two is intended.
grav_dtypes = {'fecha': object,'status': np.float64,
               'gravimetria_bruta': np.float64, 'spring_tension': np.float64,
               'longitud': np.float64, 'latitud': np.float64,
               'velocidad': np.float64,'rumbo': np.float64 }
df_grav = dd.read_csv(
    '/home/jovyan/data/bravoseis_data/SADO/jan_2019/gravimetro_bruto.proc/*.proc',
    parse_dates=['fecha'],
    date_parser=dateparse,
    dtype=grav_dtypes,
)
#df.partitions[5].compute()
df_grav = df_grav.set_index("fecha")

# Remove the unused columns one at a time, in the same order as before.
for unused_column in ['fecha_telegrama', 'rumbo', 'velocidad', 'spring_tension', 'status']:
    del df_grav[unused_column]
df_grav.head()