## Demo Notebook
# Coiled & MongoDB for Large-Scale Timeseries Analysis

In [16]:
import os
import urllib
import urllib.parse
from getpass import getpass

import coiled
import dask
import dask.bag as db
import dask.dataframe as dd
from dask_mongo import read_mongo
from dask_mongo import to_mongo
from distributed import Client

## 1. Spin up Coiled Cluster

In [5]:
# Launch the Coiled-managed Dask cluster used by the rest of the notebook.
cluster = coiled.Cluster(
    name="mongo",
    software="dask-nlp",
    n_workers=20,
    # Idle timeout handed to the scheduler; presumably it shuts itself down
    # after this much inactivity -- confirm against the Coiled docs.
    scheduler_options={"idle_timeout": "2 hours"},
    # Pass a real boolean: the string 'True' is not a bool, and a string such
    # as 'False' would still be truthy if the backend only checks truthiness.
    backend_options={"spot": True},
)

Output()

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-rrpelgr71-124769-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-124769-firewall -> coiled-dask-rrpelgr71-124769-firewall]
Created FW rules: coiled-dask-rrpelgr71-124769-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-124769-cluster-firewall -> coiled-dask-rrpelgr71-124769-cluster-firewall]
Created scheduler VM: coiled-dask-rrpelgr71-124769-scheduler (type: t3a.medium, ip: ['44.200.126.241'])


distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


In [6]:
# Connect a Dask client to the Coiled cluster; `Client` is imported in the
# notebook's import cell so all dependencies are visible in one place.
client = Client(cluster)
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://44.200.126.241:8787,

0,1
Dashboard: http://44.200.126.241:8787,Workers: 7
Total threads: 14,Total memory: 53.67 GiB

0,1
Comm: tls://10.4.8.88:8786,Workers: 7
Dashboard: http://10.4.8.88:8787/status,Total threads: 14
Started: Just now,Total memory: 53.67 GiB

0,1
Comm: tls://10.4.2.173:33689,Total threads: 2
Dashboard: http://10.4.2.173:37053/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.173:33135,
Local directory: /dask-worker-space/worker-8xfvkgpn,Local directory: /dask-worker-space/worker-8xfvkgpn

0,1
Comm: tls://10.4.15.201:37639,Total threads: 2
Dashboard: http://10.4.15.201:36367/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.201:44661,
Local directory: /dask-worker-space/worker-8bdu3f4q,Local directory: /dask-worker-space/worker-8bdu3f4q

0,1
Comm: tls://10.4.10.158:37415,Total threads: 2
Dashboard: http://10.4.10.158:36795/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.158:39823,
Local directory: /dask-worker-space/worker-qtbi5e84,Local directory: /dask-worker-space/worker-qtbi5e84

0,1
Comm: tls://10.4.14.162:35233,Total threads: 2
Dashboard: http://10.4.14.162:41021/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.162:41141,
Local directory: /dask-worker-space/worker-iw5gnmk0,Local directory: /dask-worker-space/worker-iw5gnmk0

0,1
Comm: tls://10.4.8.195:37337,Total threads: 2
Dashboard: http://10.4.8.195:38599/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.195:44673,
Local directory: /dask-worker-space/worker-etrgx3q7,Local directory: /dask-worker-space/worker-etrgx3q7

0,1
Comm: tls://10.4.13.210:42253,Total threads: 2
Dashboard: http://10.4.13.210:40549/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.210:40971,
Local directory: /dask-worker-space/worker-zk2pvup1,Local directory: /dask-worker-space/worker-zk2pvup1

0,1
Comm: tls://10.4.8.91:39705,Total threads: 2
Dashboard: http://10.4.8.91:42143/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.91:40039,
Local directory: /dask-worker-space/worker-vtyfvb3s,Local directory: /dask-worker-space/worker-vtyfvb3s


## 2. Write Small Synthetic Timeseries Data to MongoDB

This section tests writing a small (17MB) timeseries dataset to MongoDB using the `dask-mongo` connector.

### Create Synthetic Data

In [26]:
# Build Dask's demo timeseries DataFrame spanning 1990-01-01 to 1990-01-05.
ddf = dask.datasets.timeseries(start="1990-01-01", end="1990-01-05")

In [27]:
# Display the Dask DataFrame's structure (columns, dtypes, partition divisions).
ddf

Unnamed: 0_level_0,id,name,x,y
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01,int64,object,float64,float64
1990-01-02,...,...,...,...
1990-01-03,...,...,...,...
1990-01-04,...,...,...,...
1990-01-05,...,...,...,...


In [41]:
# Turn the datetime index into an ordinary `timestamp` column so that it is
# carried along when rows are converted to records (keeping the index as a
# column is the default behaviour of reset_index).
ddf2 = ddf.reset_index()

In [42]:
# Preview the first rows to confirm `timestamp` is now a regular column.
ddf2.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-01 00:00:00,979,Dan,0.019922,-0.442995
1,1990-01-01 00:00:01,1036,Yvonne,0.31086,0.963893
2,1990-01-01 00:00:02,971,Wendy,-0.369821,-0.45156
3,1990-01-01 00:00:03,955,Charlie,-0.569156,0.473523
4,1990-01-01 00:00:04,1035,Xavier,0.162889,0.798787


### Convert to Dask Bag

In [43]:
def _rows_as_dicts(partition):
    """Convert one DataFrame partition into a list of per-row dictionaries."""
    return partition.to_dict(orient="records")


# One delayed list of records per DataFrame partition, gathered into a Bag.
record_partitions = ddf2.map_partitions(_rows_as_dicts).to_delayed()
bag2 = db.from_delayed(record_partitions)

In [44]:
# Peek at a single element: each bag item is a dict describing one row.
bag2.take(1)

({'timestamp': Timestamp('1990-01-01 00:00:00'),
  'id': 979,
  'name': 'Dan',
  'x': 0.019922319736423555,
  'y': -0.4429951240621721},)

### Write to MongoDB

In [29]:
# Connection details are read from the environment so that no credentials are
# stored in the notebook. Set MONGO_USER and MONGO_HOST (e.g.
# "cluster0.xxxxx.mongodb.net") before running; the password is taken from
# MONGO_PASSWORD or, if that is unset, prompted for interactively.
# NOTE: any password that was previously committed in this cell should be rotated.
mongo_user = os.environ["MONGO_USER"]
mongo_host = os.environ["MONGO_HOST"]
mongo_password = os.environ.get("MONGO_PASSWORD") or getpass("MongoDB password: ")

# Username and password must be percent-escaped before being embedded in a
# MongoDB URI (characters such as '@' or ':' would otherwise break parsing).
host_uri = (
    "mongodb+srv://"
    + urllib.parse.quote_plus(mongo_user)
    + ":"
    + urllib.parse.quote_plus(mongo_password)
    + "@"
    + mongo_host
    + "/myFirstDatabase?retryWrites=true&w=majority"
)

In [45]:
%%time
# Write the records in `bag2` to the `test_with_timestamp` collection of the
# `test_timeseries` database, connecting with `host_uri`.
# NOTE(review): re-running this cell presumably inserts the same documents
# again -- confirm before re-executing against a populated collection.
to_mongo(
    bag2,
    database="test_timeseries",
    collection="test_with_timestamp",
    connection_kwargs={"host": host_uri},
)

CPU times: user 87.9 ms, sys: 10.4 ms, total: 98.3 ms
Wall time: 20.4 s


## 3. Load Timeseries Data from MongoDB

### Load Data

In [46]:
%%time
# Create a Dask Bag over the `test_with_timestamp` collection of the
# `test_timeseries` database.
bag = read_mongo(
    connection_kwargs={"host": host_uri},
    database="test_timeseries",
    collection="test_with_timestamp",
    # NOTE(review): the DataFrame built from this bag reports npartitions=3456,
    # presumably a consequence of this small chunksize -- consider a larger value.
    chunksize=100,
)

CPU times: user 109 ms, sys: 15.6 ms, total: 125 ms
Wall time: 1.74 s


In [47]:
# Inspect one document as read back; it carries an `_id` field in addition to
# the columns that were written.
bag.take(1)

({'_id': ObjectId('622f13fb49748a5684bd259d'),
  'timestamp': datetime.datetime(1990, 1, 3, 0, 0),
  'id': 1019,
  'name': 'Bob',
  'x': 0.6967979134873756,
  'y': 0.8138281335172173},)

### Convert to Dask DataFrame

In [48]:
# Convert the bag of documents into a Dask DataFrame (one column per key).
ddf3 = bag.to_dataframe()

In [49]:
# Preview the converted data; the `_id` column is still present at this point.
ddf3.head()

Unnamed: 0,_id,timestamp,id,name,x,y
0,622f13fb49748a5684bd259d,1990-01-03 00:00:00,1019,Bob,0.696798,0.813828
1,622f13fb49748a5684bd259e,1990-01-03 00:00:01,1007,Sarah,-0.213769,-0.899712
2,622f13fb49748a5684bd259f,1990-01-03 00:00:02,985,Patricia,-0.846256,-0.678458
3,622f13fb49748a5684bd25a0,1990-01-03 00:00:03,997,Yvonne,0.456602,0.635687
4,622f13fb49748a5684bd25a1,1990-01-03 00:00:04,986,Sarah,-0.742017,-0.963478


In [51]:
# Discard the `_id` column; it is not part of the timeseries itself.
ddf3 = ddf3.drop(columns=["_id"])

In [52]:
# Confirm that `_id` is gone and only the timeseries columns remain.
ddf3.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-03 00:00:00,1019,Bob,0.696798,0.813828
1,1990-01-03 00:00:01,1007,Sarah,-0.213769,-0.899712
2,1990-01-03 00:00:02,985,Patricia,-0.846256,-0.678458
3,1990-01-03 00:00:03,997,Yvonne,0.456602,0.635687
4,1990-01-03 00:00:04,986,Sarah,-0.742017,-0.963478


In [55]:
# Display the DataFrame's structure (columns, dtypes, number of partitions).
ddf3

Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=3456,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [61]:
# Make `timestamp` the index again so the frame is organised by time.
# NOTE(review): this rebinds `ddf3`, so the cell cannot be re-run as-is once
# `timestamp` is no longer a column; set_index is also presumably expensive
# with this many partitions -- confirm.
ddf3 = ddf3.set_index('timestamp')
ddf3.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01 00:00:00,979,Dan,0.019922,-0.442995
1990-01-01 00:00:01,1036,Yvonne,0.31086,0.963893
1990-01-01 00:00:02,971,Wendy,-0.369821,-0.45156
1990-01-01 00:00:03,955,Charlie,-0.569156,0.473523
1990-01-01 00:00:04,1035,Xavier,0.162889,0.798787


distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/tcp.py", line 426, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/core.py", l

## 4. Process Timeseries Data

In [60]:
# `statsmodels.tsa.arima_model` was deprecated in statsmodels 0.12 and its
# ARIMA class is no longer usable from 0.13 on; import the maintained
# implementation from `statsmodels.tsa.arima.model` instead.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf

In [None]:
lag_acf = 