## Demo Notebook
# Coiled & MongoDB for Large-Scale Timeseries Analysis

In [1]:
import urllib
import urllib.parse

import coiled
import dask
import dask.dataframe as dd
import dask.bag as db
from dask_mongo import read_mongo
from dask_mongo import to_mongo

## 1. Spin up Coiled Cluster

In [2]:
# Provision the 20-worker Coiled cluster that the rest of the notebook runs on.
cluster = coiled.Cluster(
    name="mongo",
    software="dask-nlp",
    n_workers=20,
    scheduler_options={"idle_timeout": "2 hours"},
    # `spot` is an on/off switch, so pass a real boolean. The string "True" is not
    # the boolean True, and the string "False" would be truthy as well.
    backend_options={"spot": True},
)

Output()

In [3]:
# Connect a Dask distributed client to the Coiled cluster created above.
from distributed import Client
client = Client(cluster)
# Bare last expression: renders the client / cluster summary as the cell output.
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.215.153.211:8787,

0,1
Dashboard: http://3.215.153.211:8787,Workers: 20
Total threads: 40,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.4.148:8786,Workers: 20
Dashboard: http://10.4.4.148:8787/status,Total threads: 40
Started: 54 minutes ago,Total memory: 153.34 GiB

0,1
Comm: tls://10.4.15.208:41883,Total threads: 2
Dashboard: http://10.4.15.208:33075/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.208:46675,
Local directory: /dask-worker-space/worker-pwp48kkk,Local directory: /dask-worker-space/worker-pwp48kkk

0,1
Comm: tls://10.4.15.34:38535,Total threads: 2
Dashboard: http://10.4.15.34:32923/status,Memory: 7.67 GiB
Nanny: tls://10.4.15.34:41907,
Local directory: /dask-worker-space/worker-rpsw5cgr,Local directory: /dask-worker-space/worker-rpsw5cgr

0,1
Comm: tls://10.4.10.14:41053,Total threads: 2
Dashboard: http://10.4.10.14:38451/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.14:45563,
Local directory: /dask-worker-space/worker-awfvewes,Local directory: /dask-worker-space/worker-awfvewes

0,1
Comm: tls://10.4.7.248:45673,Total threads: 2
Dashboard: http://10.4.7.248:34129/status,Memory: 7.67 GiB
Nanny: tls://10.4.7.248:45353,
Local directory: /dask-worker-space/worker-b1i1q209,Local directory: /dask-worker-space/worker-b1i1q209

0,1
Comm: tls://10.4.7.192:43939,Total threads: 2
Dashboard: http://10.4.7.192:42803/status,Memory: 7.67 GiB
Nanny: tls://10.4.7.192:34065,
Local directory: /dask-worker-space/worker-huz9_jdq,Local directory: /dask-worker-space/worker-huz9_jdq

0,1
Comm: tls://10.4.13.242:39899,Total threads: 2
Dashboard: http://10.4.13.242:43461/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.242:45313,
Local directory: /dask-worker-space/worker-et7mwhy_,Local directory: /dask-worker-space/worker-et7mwhy_

0,1
Comm: tls://10.4.3.41:44621,Total threads: 2
Dashboard: http://10.4.3.41:38657/status,Memory: 7.67 GiB
Nanny: tls://10.4.3.41:35401,
Local directory: /dask-worker-space/worker-4cgqh645,Local directory: /dask-worker-space/worker-4cgqh645

0,1
Comm: tls://10.4.2.143:34785,Total threads: 2
Dashboard: http://10.4.2.143:46627/status,Memory: 7.67 GiB
Nanny: tls://10.4.2.143:36527,
Local directory: /dask-worker-space/worker-woawdqb1,Local directory: /dask-worker-space/worker-woawdqb1

0,1
Comm: tls://10.4.6.255:46583,Total threads: 2
Dashboard: http://10.4.6.255:37353/status,Memory: 7.67 GiB
Nanny: tls://10.4.6.255:42793,
Local directory: /dask-worker-space/worker-l9azml02,Local directory: /dask-worker-space/worker-l9azml02

0,1
Comm: tls://10.4.8.176:38477,Total threads: 2
Dashboard: http://10.4.8.176:45329/status,Memory: 7.67 GiB
Nanny: tls://10.4.8.176:32823,
Local directory: /dask-worker-space/worker-6r38_nj_,Local directory: /dask-worker-space/worker-6r38_nj_

0,1
Comm: tls://10.4.0.175:33993,Total threads: 2
Dashboard: http://10.4.0.175:44797/status,Memory: 7.67 GiB
Nanny: tls://10.4.0.175:44269,
Local directory: /dask-worker-space/worker-q3g2lyct,Local directory: /dask-worker-space/worker-q3g2lyct

0,1
Comm: tls://10.4.4.245:35633,Total threads: 2
Dashboard: http://10.4.4.245:44321/status,Memory: 7.67 GiB
Nanny: tls://10.4.4.245:42993,
Local directory: /dask-worker-space/worker-4nlr85g3,Local directory: /dask-worker-space/worker-4nlr85g3

0,1
Comm: tls://10.4.10.156:34593,Total threads: 2
Dashboard: http://10.4.10.156:38223/status,Memory: 7.67 GiB
Nanny: tls://10.4.10.156:38395,
Local directory: /dask-worker-space/worker-6fh52wdc,Local directory: /dask-worker-space/worker-6fh52wdc

0,1
Comm: tls://10.4.7.13:45973,Total threads: 2
Dashboard: http://10.4.7.13:36905/status,Memory: 7.67 GiB
Nanny: tls://10.4.7.13:34423,
Local directory: /dask-worker-space/worker-ozrzhx1t,Local directory: /dask-worker-space/worker-ozrzhx1t

0,1
Comm: tls://10.4.14.172:45595,Total threads: 2
Dashboard: http://10.4.14.172:37225/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.172:38201,
Local directory: /dask-worker-space/worker-98hih76s,Local directory: /dask-worker-space/worker-98hih76s

0,1
Comm: tls://10.4.4.123:37207,Total threads: 2
Dashboard: http://10.4.4.123:40761/status,Memory: 7.67 GiB
Nanny: tls://10.4.4.123:33205,
Local directory: /dask-worker-space/worker-zcofcxub,Local directory: /dask-worker-space/worker-zcofcxub

0,1
Comm: tls://10.4.14.240:35199,Total threads: 2
Dashboard: http://10.4.14.240:45453/status,Memory: 7.67 GiB
Nanny: tls://10.4.14.240:39399,
Local directory: /dask-worker-space/worker-_4b17xyb,Local directory: /dask-worker-space/worker-_4b17xyb

0,1
Comm: tls://10.4.13.175:36439,Total threads: 2
Dashboard: http://10.4.13.175:41335/status,Memory: 7.67 GiB
Nanny: tls://10.4.13.175:42541,
Local directory: /dask-worker-space/worker-2e_44t5n,Local directory: /dask-worker-space/worker-2e_44t5n

0,1
Comm: tls://10.4.5.4:36591,Total threads: 2
Dashboard: http://10.4.5.4:39585/status,Memory: 7.67 GiB
Nanny: tls://10.4.5.4:44071,
Local directory: /dask-worker-space/worker-1n84vnvs,Local directory: /dask-worker-space/worker-1n84vnvs

0,1
Comm: tls://10.4.9.156:35965,Total threads: 2
Dashboard: http://10.4.9.156:40281/status,Memory: 7.67 GiB
Nanny: tls://10.4.9.156:44271,
Local directory: /dask-worker-space/worker-t3cm6dnz,Local directory: /dask-worker-space/worker-t3cm6dnz


## 2. Write Small Synthetic Timeseries Data to MongoDB

This section tests writing a small (17MB) timeseries dataset to MongoDB using the `dask-mongo` connector.

### Create Synthetic Data

In [4]:
# Build Dask's demo timeseries DataFrame covering 1990-01-01 up to 1990-01-05.
ddf = dask.datasets.timeseries(start="1990-01-01", end="1990-01-05")

In [5]:
# Show the Dask DataFrame summary (column dtypes and partition divisions).
ddf

Unnamed: 0_level_0,id,name,x,y
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01,int64,object,float64,float64
1990-01-02,...,...,...,...
1990-01-03,...,...,...,...
1990-01-04,...,...,...,...
1990-01-05,...,...,...,...


In [6]:
# Turn the timestamp index into a regular column so it travels with each record.
# (Keeping the old index as a column is the default behaviour of reset_index.)
ddf2 = ddf.reset_index()

In [7]:
# Preview the first rows; `timestamp` should now appear as a regular column.
ddf2.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-01 00:00:00,995,Edith,0.484207,0.528191
1,1990-01-01 00:00:01,1002,Alice,0.625093,-0.715067
2,1990-01-01 00:00:02,953,Quinn,-0.733051,0.362502
3,1990-01-01 00:00:03,977,Ingrid,0.628617,-0.403905
4,1990-01-01 00:00:04,1026,Patricia,-0.8591,-0.155408


### Convert to Dask Bag

In [8]:
# Convert the DataFrame into a Dask Bag of per-row dicts; this bag is the object
# that gets passed to to_mongo.
ddf_bag = ddf2.to_bag(index=False, format="dict")

In [9]:
# Peek at the first record to check the dict layout that will be written.
ddf_bag.take(1)

({'timestamp': Timestamp('1990-01-01 00:00:00'),
  'id': 995,
  'name': 'Edith',
  'x': 0.48420699040548376,
  'y': 0.5281913834289487},)

### Write to MongoDB

In [10]:
import getpass

In [11]:
# Read the MongoDB password interactively so it never appears in the notebook source.
pw = getpass.getpass(prompt="Password: ")

········


In [24]:
# Replace the username, password, and cluster address with your own connection details.
# The password is percent-escaped with quote_plus rather than quote: quote() leaves "/"
# unescaped by default, which would corrupt the URI for passwords containing it.
host_uri = (
    "mongodb+srv://richard:"
    + urllib.parse.quote_plus(pw)
    + "@cluster1.kmk7m.mongodb.net/myFirstDatabase?retryWrites=true&w=majority"
)

In [29]:
%%time
# Write every record of the bag to the `test_with_timestamp` collection of the
# `test` database through the dask-mongo connector.
# NOTE(review): the recorded output of this cell is a ServerSelectionTimeoutError,
# i.e. the MongoDB servers were not reached in this run.
to_mongo(
    ddf_bag,
    database="test",
    collection="test_with_timestamp",
    connection_kwargs={"host": host_uri},
)

ServerSelectionTimeoutError: cluster1-shard-00-01.kmk7m.mongodb.net:27017: timed out,cluster1-shard-00-02.kmk7m.mongodb.net:27017: timed out,cluster1-shard-00-00.kmk7m.mongodb.net:27017: timed out, Timeout: 30s, Topology Description: <TopologyDescription id: 6231c97b1101892f38eff014, topology_type: ReplicaSetNoPrimary, servers: [<ServerDescription ('cluster1-shard-00-00.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-00.kmk7m.mongodb.net:27017: timed out')>, <ServerDescription ('cluster1-shard-00-01.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-01.kmk7m.mongodb.net:27017: timed out')>, <ServerDescription ('cluster1-shard-00-02.kmk7m.mongodb.net', 27017) server_type: Unknown, rtt: None, error=NetworkTimeout('cluster1-shard-00-02.kmk7m.mongodb.net:27017: timed out')>]>

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/tcp.py", line 426, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/core.py", l

No server found. This may be because the MongoDB cluster is in an EU AWS region while my Coiled cluster is in us-east-1.

In [26]:
import pymongo

In [27]:
client = pymongo.MongoClient("mongodb+srv://richard:" + urllib.parse.quote(pw) + "@cluster1.kmk7m.mongodb.net/myFirstDatabase?retryWrites=true&w=majority")

db = client.test

In [28]:
db

Database(MongoClient(host=['cluster1-shard-00-01.kmk7m.mongodb.net:27017', 'cluster1-shard-00-02.kmk7m.mongodb.net:27017', 'cluster1-shard-00-00.kmk7m.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-k26qu7-shard-0', ssl=True), 'test')

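- Changing the MongoDB cluster region to us-east-1 didn't resolve the issue.
- Evaluating `.test` on the database handle created above does work locally. Note that PyMongo creates database and collection handles lazily, so this alone does not confirm that the servers are reachable.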

## 3. Load Timeseries Data from MongoDB

### Load Data

In [26]:
%%time
# Build a Dask Bag over the `test_with_timestamp_2` collection of the
# `test_timeseries` database.
# NOTE(review): this is a different database/collection than the to_mongo call earlier
# in the notebook targets (`test` / `test_with_timestamp`) — confirm which is intended.
# NOTE(review): the DataFrame derived from this bag reports npartitions=3456; presumably
# that follows from chunksize=100 and a larger value would give fewer partitions — verify.
bag = read_mongo(
    connection_kwargs={"host": host_uri},
    database="test_timeseries",
    collection="test_with_timestamp_2",
    chunksize=100,
)

CPU times: user 107 ms, sys: 19.7 ms, total: 126 ms
Wall time: 3.75 s


In [27]:
# Peek at the first document; note the extra `_id` field alongside the data fields.
bag.take(1)

({'_id': ObjectId('6231b783396ae3778e46fa70'),
  'timestamp': datetime.datetime(1990, 1, 4, 0, 0),
  'id': 947,
  'name': 'Hannah',
  'x': -0.5290492981556099,
  'y': 0.5540242522201178},)

### Convert to Dask DataFrame

In [28]:
# Convert the bag of MongoDB documents into a Dask DataFrame.
ddf3 = bag.to_dataframe()

In [29]:
# Preview the converted frame, which still carries the `_id` column.
ddf3.head()

Unnamed: 0,_id,timestamp,id,name,x,y
0,6231b783396ae3778e46fa70,1990-01-04 00:00:00,947,Hannah,-0.529049,0.554024
1,6231b783396ae3778e46fa71,1990-01-04 00:00:01,1050,Frank,0.534544,-0.016464
2,6231b783396ae3778e46fa72,1990-01-04 00:00:02,1065,Patricia,0.499794,0.110423
3,6231b783396ae3778e46fa73,1990-01-04 00:00:03,1054,Norbert,-0.555151,0.86296
4,6231b783396ae3778e46fa74,1990-01-04 00:00:04,992,Alice,-0.115568,-0.343412


In [30]:
# Discard the `_id` column; only the timeseries fields are needed from here on.
ddf3 = ddf3.drop(columns=["_id"])

In [31]:
# Confirm that `_id` is gone and only the timeseries columns remain.
ddf3.head()

Unnamed: 0,timestamp,id,name,x,y
0,1990-01-04 00:00:00,947,Hannah,-0.529049,0.554024
1,1990-01-04 00:00:01,1050,Frank,0.534544,-0.016464
2,1990-01-04 00:00:02,1065,Patricia,0.499794,0.110423
3,1990-01-04 00:00:03,1054,Norbert,-0.555151,0.86296
4,1990-01-04 00:00:04,992,Alice,-0.115568,-0.343412


In [32]:
# Show the Dask DataFrame summary (column dtypes and partition count).
ddf3

Unnamed: 0_level_0,timestamp,id,name,x,y
npartitions=3456,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,datetime64[ns],int64,object,float64,float64
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [33]:
# Make `timestamp` the index again.
# NOTE(review): the earlier previews start at 1990-01-04 while the preview below
# starts at 1990-01-01, so the rows appear to be reordered by timestamp here.
ddf3 = ddf3.set_index('timestamp')
ddf3.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1990-01-01 00:00:00,1046,Sarah,0.477859,0.254645
1990-01-01 00:00:01,966,George,0.039212,0.342549
1990-01-01 00:00:02,1015,Tim,0.329459,0.674911
1990-01-01 00:00:03,1044,Ray,-0.516542,0.586507
1990-01-01 00:00:04,995,Charlie,0.574594,0.970574


## 4. Process Timeseries Data

In [60]:
# ARIMA is imported from `statsmodels.tsa.arima.model`: the legacy
# `statsmodels.tsa.arima_model.ARIMA` was deprecated in statsmodels 0.12 and can no
# longer be used from 0.13 onwards.
from statsmodels.tsa.arima.model import ARIMA
# Autocorrelation / partial-autocorrelation helpers.
from statsmodels.tsa.stattools import acf, pacf

In [None]:
lag_acf = 