# Debugging: Parquet writes to S3 from a Coiled cluster

In [1]:
import coiled
import dask.dataframe as dd
from dask.distributed import Client
import dask

In [None]:
# One-off setup, left disabled: builds a Coiled software environment from a
# local conda YAML file.
# NOTE(review): the cluster cell requests
# 'rrpelgrim/create-coiled-datasets-m1-test'; presumably that is this
# environment under the account prefix — confirm.
# coiled.create_software_environment(
#     name='create-coiled-datasets-m1-test',
#     conda='/Users/rpelgrim/Documents/git/coiled-resources/create-coiled-datasets/environment-m1-test.yml'
# )

In [2]:
# Start (or attach to) the Coiled cluster named "data-import".
cluster = coiled.Cluster(
    name="data-import",
    software="rrpelgrim/create-coiled-datasets-m1-test",
    # Sizing requested for the workers and the scheduler.
    n_workers=20,
    worker_cpu=4,
    worker_memory="32Gb",
    scheduler_cpu=2,
    # Lifecycle.
    # NOTE(review): presumably the cluster outlives this session and the
    # scheduler idle timeout is the backstop that stops it — confirm.
    scheduler_options={"idle_timeout": "1hour"},
    shutdown_on_close=False,
)



In [3]:
# Attach a distributed client to the cluster; the bare `client` expression
# makes the notebook display its summary (dashboard link, workers, memory).
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: Cluster
Dashboard: http://3.94.169.2:8787,

0,1
Dashboard: http://3.94.169.2:8787,Workers: 2
Total threads:  8,Total memory:  58.00 GiB

0,1
Comm: tls://10.4.0.179:8786,Workers: 2
Dashboard: http://10.4.0.179:8787/status,Total threads:  8
Started:  18 minutes ago,Total memory:  58.00 GiB

0,1
Comm: tls://10.4.1.201:36217,Total threads: 4
Dashboard: http://10.4.1.201:43185/status,Memory: 29.00 GiB
Nanny: None,
Local directory: /dask-worker-space/worker-gaxhw5c_,Local directory: /dask-worker-space/worker-gaxhw5c_

0,1
Comm: tls://10.4.1.16:44469,Total threads: 4
Dashboard: http://10.4.1.16:38777/status,Memory: 29.00 GiB
Nanny: None,
Local directory: /dask-worker-space/worker-hwci3jz0,Local directory: /dask-worker-space/worker-hwci3jz0


## This works: writing a small, in-memory DataFrame to S3 as CSV and Parquet

In [7]:
from pandas import DataFrame, to_datetime
from dask.dataframe import from_pandas

# Seven-row toy dataset, wrapped straight into a two-partition Dask DataFrame
# and indexed by the string "index" column. The pandas frame is built inline
# so `df` is only ever bound to the Dask DataFrame.
df = from_pandas(
    DataFrame(
        {
            "index": ["1", "2", "3", "2", "3", "1", "4"],
            "date": to_datetime(
                [
                    "2017-01-01",
                    "2017-01-01",
                    "2017-01-01",
                    "2017-01-02",
                    "2017-01-02",
                    "2017-01-06",
                    "2017-01-09",
                ]
            ),
            "amount": [100, 200, 300, 400, 500, 600, 700],
        }
    ),
    npartitions=2,
).set_index("index")

In [8]:
# Write the toy frame to S3 as CSV; the recorded output lists one `.part`
# file per partition under this prefix.
df.to_csv("s3://coiled-datasets/dea-opioid/tests/mre-dummy-test.csv")

['coiled-datasets/dea-opioid/tests/mre-dummy-test.csv/0.part',
 'coiled-datasets/dea-opioid/tests/mre-dummy-test.csv/1.part']

In [10]:
# Same toy frame, written as Parquet with the schema inferred from the data.
df.to_parquet("s3://coiled-datasets/dea-opioid/tests/mre-dummy-test.parquet", schema="infer")

## This fails: writing large Dask DataFrames to Parquet on S3

In [11]:
# Synthetic time series starting in 1950.
# NOTE(review): no end date, frequency or seed is passed, so the span, the
# partition count and the random values all come from library defaults —
# confirm the intended size before re-running.
data = dask.datasets.timeseries(start="1950")
# Left disabled: data = data.repartition(partition_size='100MB')
data.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1950-01-01 00:00:00,1021,Frank,-0.431293,0.108497
1950-01-01 00:00:01,1073,Patricia,-0.25668,0.142504
1950-01-01 00:00:02,993,Michael,-0.551465,0.178341
1950-01-01 00:00:03,960,Victor,0.233866,0.495149
1950-01-01 00:00:04,1086,Tim,0.865092,0.44904


In [10]:
# Write the synthetic series to S3 with pyarrow. In the recorded run this
# emitted a large-graph warning and was then interrupted by hand
# (KeyboardInterrupt).
data.to_parquet(
    "s3://coiled-datasets/dea-opioid/tests/dummy.parquet",
    schema="infer",
    engine="pyarrow",
)

  [["('to-parquet-5105e5051cbc4914f72f9e483ba7826f', ... dummy.parquet']
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good


KeyboardInterrupt: 

In [12]:
# Lazily read every 2019 yellow-taxi CSV matching the glob, parsing the two
# pickup/dropoff columns as datetimes. The dtype overrides are grouped by
# target type rather than listed column by column.
df_csv = dd.read_csv(
    "s3://nyc-tlc/trip data/yellow_tripdata_2019-*.csv",
    dtype={
        **dict.fromkeys(
            ["VendorID", "passenger_count", "RatecodeID", "payment_type"],
            "UInt8",
        ),
        **dict.fromkeys(["PULocationID", "DOLocationID"], "UInt16"),
        "store_and_fwd_flag": "category",
    },
    parse_dates=["tpep_pickup_datetime", "tpep_dropoff_datetime"],
)


In [13]:
# Show the first rows to check that the CSVs parsed with the expected columns.
df_csv.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.0,1,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.0,2,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [17]:
# Write the full CSV-backed frame to Parquet on S3. In the recorded run this
# ended with a CancelledError after the client lost the scheduler.
df_csv.to_parquet("s3://coiled-datasets/dea-opioid/tests/nyc-test-from-csv.parquet", engine="pyarrow")

CancelledError: to-parquet-458ff93c1a61d3b80a70daf91be2dbc6

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


## This works: writing a `head(100000)` subset of the CSV data to Parquet

In [15]:
# Take up to the first 100,000 rows of the CSV-backed frame.
# NOTE(review): presumably `head` computes eagerly and returns an in-memory
# frame, so the write below would not go through the cluster — confirm.
df_csv_sub = df_csv.head(n=100_000)

In [16]:
# Write the small subset to S3 as Parquet.
df_csv_sub.to_parquet("s3://coiled-datasets/dea-opioid/tests/nyc-test-from-csv-sub.parquet", engine="pyarrow")

## This works: reading a single Parquet part file and writing it back to S3

In [8]:
# Read a single part file of the 2019 yellow-taxi Parquet dataset.
#
# The `dtype={...}` mapping that used to be passed here was copied from the
# `read_csv` cell. It is not a documented `read_parquet` parameter: column
# types come from the Parquet schema, so the mapping was at best silently
# ignored. If a cast really is needed, apply it explicitly after the read,
# e.g. `df = df.astype({"VendorID": "UInt8", ...})`.
#
# NOTE(review): `blocksize` is only a `read_parquet` parameter in newer Dask
# releases — confirm the installed version honours it.
df = dd.read_parquet(
    "s3://coiled-datasets/nyc-taxi/yellow_tripdata_2019.parquet/part.0.parquet",
    blocksize='16MB',
)

# Write the frame straight back out to the test location.
df.to_parquet(
    "s3://coiled-datasets/dea-opioid/tests/nyc-test.parquet",
    engine='pyarrow'
)

In [9]:
# Show the first rows of the frame read from the Parquet part file.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,1,2019-01-01 00:46:40,2019-01-01 00:53:20,1,1.5,1,N,151,239,1,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,1,2019-01-01 00:59:47,2019-01-01 01:18:59,1,2.6,1,N,239,246,1,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2,2018-12-21 13:48:30,2018-12-21 13:52:40,3,0.0,1,N,236,236,1,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2,2018-11-28 15:52:25,2018-11-28 15:55:45,5,0.0,1,N,193,193,2,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2,2018-11-28 15:56:57,2018-11-28 15:58:33,5,0.0,2,N,193,193,2,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


In [7]:
# NOTE(review): this repeats the write to the same S3 path that already ends
# the `read_parquet` cell above; it looks like a leftover from an earlier,
# out-of-order run — confirm whether it is still needed.
df.to_parquet("s3://coiled-datasets/dea-opioid/tests/nyc-test.parquet", engine="pyarrow")