## Demo Notebook
# Coiled & MongoDB for Large-Scale Timeseries Analysis

In [1]:
import urllib
import urllib.parse

import coiled
import dask
import dask.bag as db
import dask.dataframe as dd
from dask_mongo import read_mongo
from dask_mongo import to_mongo

## 1. Spin up Coiled Cluster

In [2]:
# Launch a Coiled cluster named "mongo" with 20 workers, using the "dask-nlp"
# software environment.
cluster = coiled.Cluster(
    name="mongo",
    software="dask-nlp",
    n_workers=20,
    # Scheduler option: idle timeout of two hours.
    scheduler_options={"idle_timeout": "2 hours"},
    # Pass a real boolean here. The string 'True' is merely truthy (as 'False'
    # would be too), so it cannot express the intended on/off switch.
    # Presumably this requests spot instances — confirm against the Coiled docs.
    backend_options={"spot": True},
)

Output()

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-rrpelgr71-127487-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-127487-firewall -> coiled-dask-rrpelgr71-127487-firewall]
Created FW rules: coiled-dask-rrpelgr71-127487-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-127487-cluster-firewall -> coiled-dask-rrpelgr71-127487-cluster-firewall]
Created scheduler VM: coiled-dask-rrpelgr71-127487-scheduler (type: t3.medium, ip: ['3.237.94.138'])


In [3]:
# Connect a Dask distributed client to the Coiled cluster created above.
# NOTE(review): the captured output below lists 7 workers although 20 were requested;
# presumably the remaining workers were still starting — consider waiting for them
# before timing any work.
from distributed import Client
client = Client(cluster)
# Bare expression: display the client summary (dashboard link, workers, memory).
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.237.94.138:8787,

0,1
Dashboard: http://3.237.94.138:8787,Workers: 7
Total threads: 14,Total memory: 52.34 GiB

0,1
Comm: tls://10.4.7.232:8786,Workers: 7
Dashboard: http://10.4.7.232:8787/status,Total threads: 14
Started: Just now,Total memory: 52.34 GiB

0,1
Comm: tls://10.4.2.146:36385,Total threads: 2
Dashboard: http://10.4.2.146:38073/status,Memory: 7.48 GiB
Nanny: tls://10.4.2.146:42397,
Local directory: /dask-worker-space/worker-6j7qu_q2,Local directory: /dask-worker-space/worker-6j7qu_q2

0,1
Comm: tls://10.4.1.184:42953,Total threads: 2
Dashboard: http://10.4.1.184:42919/status,Memory: 7.48 GiB
Nanny: tls://10.4.1.184:41689,
Local directory: /dask-worker-space/worker-_vacarxb,Local directory: /dask-worker-space/worker-_vacarxb

0,1
Comm: tls://10.4.1.24:44781,Total threads: 2
Dashboard: http://10.4.1.24:39603/status,Memory: 7.48 GiB
Nanny: tls://10.4.1.24:44765,
Local directory: /dask-worker-space/worker-sc8o_mfz,Local directory: /dask-worker-space/worker-sc8o_mfz

0,1
Comm: tls://10.4.9.113:34683,Total threads: 2
Dashboard: http://10.4.9.113:34601/status,Memory: 7.48 GiB
Nanny: tls://10.4.9.113:43469,
Local directory: /dask-worker-space/worker-tgmy09s1,Local directory: /dask-worker-space/worker-tgmy09s1

0,1
Comm: tls://10.4.1.40:46411,Total threads: 2
Dashboard: http://10.4.1.40:33505/status,Memory: 7.48 GiB
Nanny: tls://10.4.1.40:42149,
Local directory: /dask-worker-space/worker-kkt31t3x,Local directory: /dask-worker-space/worker-kkt31t3x

0,1
Comm: tls://10.4.8.142:36279,Total threads: 2
Dashboard: http://10.4.8.142:33583/status,Memory: 7.48 GiB
Nanny: tls://10.4.8.142:40861,
Local directory: /dask-worker-space/worker-a6j8ied6,Local directory: /dask-worker-space/worker-a6j8ied6

0,1
Comm: tls://10.4.7.108:33157,Total threads: 2
Dashboard: http://10.4.7.108:42143/status,Memory: 7.48 GiB
Nanny: tls://10.4.7.108:39953,
Local directory: /dask-worker-space/worker-tqhzj4d9,Local directory: /dask-worker-space/worker-tqhzj4d9


## 2. Write Small Synthetic Timeseries Data to MongoDB

This section tests writing a small (17MB) timeseries dataset to MongoDB using the `dask-mongo` connector.

### Create Synthetic Data

In [4]:
# Synthetic Dask timeseries DataFrame spanning 1990-01-01 to 1990-01-05.
ddf = dask.datasets.timeseries(start="1990-01-01", end="1990-01-05")

In [5]:
ddf

Unnamed: 0_level_0,id,name,x,y
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01,int64,object,float64,float64
1990-01-02,...,...,...,...
1990-01-03,...,...,...,...
1990-01-04,...,...,...,...
1990-01-05,...,...,...,...


In [6]:
# Move the timestamp index into a regular column so it is kept in each record
# (reset_index keeps the old index as a column by default).
ddf2 = ddf.reset_index()

In [7]:
ddf2.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-01 00:00:00,1048,Jerry,0.192962,-0.679646
1,1990-01-01 00:00:01,1026,Quinn,-0.912791,0.203186
2,1990-01-01 00:00:02,1010,Jerry,-0.20419,-0.454191
3,1990-01-01 00:00:03,1045,Patricia,-0.964195,-0.542405
4,1990-01-01 00:00:04,982,Norbert,-0.590039,0.060695


### Convert to Dask Bag (Dictionary Format)

In [9]:
# Convert the DataFrame into a Dask Bag with one dict per row (see the `take(1)`
# output below); this bag is what gets passed to `to_mongo`.
ddf_bag = ddf2.to_bag(format="dict")

In [10]:
ddf_bag.take(1)

({'timestamp': Timestamp('1990-01-01 00:00:00'),
  'id': 1048,
  'name': 'Jerry',
  'x': 0.1929615726454843,
  'y': -0.6796455922790694},)

### Write to MongoDB

In [11]:
import getpass

In [12]:
# Prompt for the MongoDB password interactively so it is never written into the notebook.
pw = getpass.getpass()

 ··············


In [24]:
# Replace the username, password, and cluster address with your own connection details.
# quote_plus percent-encodes every reserved character in the password (including "/",
# which plain `quote` leaves untouched), as PyMongo requires for credentials in a URI.
host_uri = (
    "mongodb+srv://richard:"
    + urllib.parse.quote_plus(pw)
    + "@cluster1.kmk7m.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
)

In [29]:
%%time
to_mongo(
    ddf_bag,
    database="test",
    collection="test_with_timestamp",
    connection_kwargs={"host": host_uri},
)

ServerSelectionTimeoutError: cluster1-shard-00-01.kmk7m.mongodb.net:27017: timed out,cluster1-shard-00-02.kmk7m.mongodb.net:27017: timed out,cluster1-shard-00-00.kmk7m.mongodb.net:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 6231c97b1101892f38eff014, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster1-shard-00-00.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-00.kmk7m.mongodb.net:27017: timed out')>, <ServerDescription ('cluster1-shard-00-01.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-01.kmk7m.mongodb.net:27017: timed out')>, <ServerDescription ('cluster1-shard-00-02.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-02.kmk7m.mongodb.net:27017: timed out')>]>

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/tcp.py", line 426, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/core.py", l

No server found. This may be because the MongoDB cluster is in an EU AWS region while my Coiled cluster is in us-east-1.

In [26]:
import pymongo

In [27]:
# Connect straight from the notebook kernel with pymongo to check that the URI works.
# The handle is named `mongo_client` so it does not overwrite `client`, the Dask
# distributed client created earlier in this notebook.
mongo_client = pymongo.MongoClient(host_uri)

# NOTE(review): `db` shadows the `dask.bag as db` alias imported at the top; the name is
# kept only because the next cell displays `db`.
db = mongo_client.test

In [28]:
db

Database(MongoClient(host=['cluster1-shard-00-01.kmk7m.mongodb.net:27017', 'cluster1-shard-00-02.kmk7m.mongodb.net:27017', 'cluster1-shard-00-00.kmk7m.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-k26qu7-shard-0', ssl=True), 'test')

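- Changing the MongoDB cluster region to us-east-1 did not resolve the issue.
- Running `db.test` through the pymongo client in this notebook does work, so the failure appears to be specific to connections made from the Coiled workers.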

## 3. Load Timeseries Data from MongoDB

### Load Data

In [26]:
%%time
bag = read_mongo(
    connection_kwargs={"host": host_uri},
    database="test_timeseries",
    collection="test_with_timestamp_2",
    chunksize=100,
)

CPU times: user 107 ms, sys: 19.7 ms, total: 126 ms
Wall time: 3.75 s


In [27]:
bag.take(1)

({'_id': ObjectId('6231b783396ae3778e46fa70'),
  'timestamp': datetime.datetime(1990, 1, 4, 0, 0),
  'id': 947,
  'name': 'Hannah',
  'x': -0.5290492981556099,
  'y': 0.5540242522201178},)

### Convert to Dask DataFrame

In [28]:
# Turn the bag of MongoDB documents into a Dask DataFrame (one column per document key,
# as the `head()` output below shows).
ddf3 = bag.to_dataframe()

In [29]:
ddf3.head()

Unnamed: 0,_id,timestamp,id,name,x,y
0,6231b783396ae3778e46fa70,1990-01-04 00:00:00,947,Hannah,-0.529049,0.554024
1,6231b783396ae3778e46fa71,1990-01-04 00:00:01,1050,Frank,0.534544,-0.016464
2,6231b783396ae3778e46fa72,1990-01-04 00:00:02,1065,Patricia,0.499794,0.110423
3,6231b783396ae3778e46fa73,1990-01-04 00:00:03,1054,Norbert,-0.555151,0.86296
4,6231b783396ae3778e46fa74,1990-01-04 00:00:04,992,Alice,-0.115568,-0.343412


In [30]:
# Discard MongoDB's `_id` column; only the timeseries fields are needed from here on.
ddf3 = ddf3.drop(columns=["_id"])

In [31]:
ddf3.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-04 00:00:00,947,Hannah,-0.529049,0.554024
1,1990-01-04 00:00:01,1050,Frank,0.534544,-0.016464
2,1990-01-04 00:00:02,1065,Patricia,0.499794,0.110423
3,1990-01-04 00:00:03,1054,Norbert,-0.555151,0.86296
4,1990-01-04 00:00:04,992,Alice,-0.115568,-0.343412


In [32]:
ddf3

Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=3456,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [33]:
# Make `timestamp` the index of the DataFrame, then preview the first rows.
# NOTE(review): this rebinds `ddf3` from itself, so re-running the cell on its own
# presumably fails once `timestamp` is no longer a column — verify.
ddf3 = ddf3.set_index('timestamp')
ddf3.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01 00:00:00,1046,Sarah,0.477859,0.254645
1990-01-01 00:00:01,966,George,0.039212,0.342549
1990-01-01 00:00:02,1015,Tim,0.329459,0.674911
1990-01-01 00:00:03,1044,Ray,-0.516542,0.586507
1990-01-01 00:00:04,995,Charlie,0.574594,0.970574


## 4. Process Timeseries Data

In [60]:
# ARIMA is imported from `statsmodels.tsa.arima.model`: the older
# `statsmodels.tsa.arima_model` module was deprecated in statsmodels 0.12 and its
# ARIMA class no longer works from 0.13 on.
# NOTE(review): the replacement class has a different `fit()` API — check any ARIMA usage below.
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf

In [None]:
lag_acf = 