# Dask read_csv

In [1]:
import dask.dataframe as dd

In [2]:
# Point a Dask DataFrame at a small local CSV. The rows are only materialized
# later, when .compute() is called in the following cells.
ddf = dd.read_csv("dogs.csv")

In [3]:
# Materialize the Dask DataFrame as an in-memory pandas DataFrame for display.
# Fine here because the file holds only three rows (see the output below).
ddf.compute()

Unnamed: 0,first_name,age
0,fido,3
1,lucky,4
2,gus,8


In [6]:
print(ddf.compute())

  first_name  age
0       fido    3
1      lucky    4
2        gus    8


In [5]:
# Render the computed pandas frame as a grid-style text table.
# DataFrame.to_markdown depends on the optional `tabulate` package.
print(ddf.compute().to_markdown(tablefmt="grid"))

+----+--------------+-------+
|    | first_name   |   age |
|  0 | fido         |     3 |
+----+--------------+-------+
|  1 | lucky        |     4 |
+----+--------------+-------+
|  2 | gus          |     8 |
+----+--------------+-------+


## Similar to the pandas syntax for reading CSV files

In [6]:
import pandas as pd

In [7]:
# Same call shape as dd.read_csv above, but pandas loads the whole file into
# memory immediately and returns a regular DataFrame.
df = pd.read_csv("dogs.csv")

In [8]:
df

Unnamed: 0,first_name,age
0,fido,3
1,lucky,4
2,gus,8


## Reading a large CSV file

In [9]:
# No blocksize is passed, so Dask chooses its default chunking for this file;
# the npartitions cell below reports 82 partitions for it.
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv")

In [14]:
ddf.dtypes

id1     object
id2     object
id3     object
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object

In [13]:
# Per-column memory footprint in bytes. deep=True also counts the string
# payloads of the object columns, which is why id1-id3 dominate the output.
ddf.memory_usage(deep=True).compute()

Index         10496
id1      6200000000
id2      6200000000
id3      6900000000
id4       800000000
id5       800000000
id6       800000000
v1        800000000
v2        800000000
v3        800000000
dtype: int64

In [12]:
# Total in-memory size of the dataset in bytes (about 24.1 GB per the output).
ddf.memory_usage(deep=True).sum().compute()

24100010496

In [10]:
ddf.npartitions

82

In [19]:
# blocksize sets how many bytes of the file go into each partition. With 16MB
# chunks this file splits into 325 partitions (vs. 82 with the default).
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv", blocksize="16MB")

In [20]:
ddf.npartitions

325

In [23]:
# Larger chunks mean fewer partitions: 128MB blocks give 41 partitions for
# this file (see the npartitions output below).
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv", blocksize="128MB")

In [24]:
ddf.npartitions

41

## Inferring dtypes

In [25]:
ddf.dtypes

id1     object
id2     object
id3     object
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object

In [26]:
# sample_rows sets how many rows Dask inspects when inferring column dtypes.
# With 5000 rows the inferred dtypes shown below match the earlier result.
ddf = dd.read_csv("data/G1_1e8_1e2_0_0.csv", sample_rows=5000)

In [27]:
ddf.dtypes

id1     object
id2     object
id3     object
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object

## Explicitly specify dtypes

In [29]:
# Skip dtype inference for the three id columns by declaring them up front as
# pyarrow-backed strings; the remaining columns are still inferred by Dask.
string_columns = ("id1", "id2", "id3")
ddf = dd.read_csv(
    "data/G1_1e8_1e2_0_0.csv",
    dtype={column: "string[pyarrow]" for column in string_columns},
)

In [30]:
ddf.dtypes

id1     string
id2     string
id3     string
id4      int64
id5      int64
id6      int64
v1       int64
v2       int64
v3     float64
dtype: object

## Read multiple CSVs to Dask DataFrame

In [34]:
# A glob pattern lets a single read_csv call pick up every matching file;
# the npartitions cell below reports 162 partitions for this directory.
ddf = dd.read_csv("data/csvs/*.part")

In [35]:
ddf.npartitions

162

## Read multiple CSVs to Pandas DataFrame

In [36]:
import glob
import pandas as pd

In [42]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so the files are always processed in the same lexicographic order and
# the concatenated result is reproducible across machines and runs.
all_files = sorted(glob.glob("./data/csvs/*.part"))

In [44]:
# pandas reads one file per call, so load each matched path separately and
# stack the resulting frames row-wise into a single in-memory DataFrame.
df = pd.concat(map(pd.read_csv, all_files))

## Read CSV in S3 to localhost

In [46]:
# Read one object directly from S3 via an s3:// URL.
# NOTE(review): presumably this relies on the s3fs package and on the bucket
# being readable from this machine -- confirm for your environment.
ddf = dd.read_csv("s3://coiled-datasets/timeseries/20-years/csv/0000.part")

In [47]:
ddf.head()

Unnamed: 0,timestamp,id,name,x,y
0,2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
1,2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2,2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
3,2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
4,2000-01-01 00:00:04,967,Michael,-0.25146,0.81093


In [49]:
%%time
# Point Dask at every .part file under the S3 prefix. The call returns in
# about 1.5 s (timing below), which suggests the bulk of the data is not
# loaded at this point.
ddf = dd.read_csv("s3://coiled-datasets/timeseries/20-years/csv/*.part")

CPU times: user 1.02 s, sys: 117 ms, total: 1.14 s
Wall time: 1.56 s


In [51]:
%%time
# Summary statistics over the whole S3 dataset, computed without a cluster.
# The recorded run was stopped by hand (KeyboardInterrupt in the output).
ddf.describe().compute()

KeyboardInterrupt: 

## Read CSV data in S3 to cluster

In [2]:
import coiled
import dask.distributed

In [3]:
# Launch a Coiled-managed Dask cluster named "powers" with 5 workers; the
# provisioning log is printed below.
# NOTE(review): the cluster name is hard-coded, and no cell in the reviewed
# portion of the notebook closes the cluster or client afterwards.
cluster = coiled.Cluster(name="powers", n_workers=5)

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-matthew24-101256-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-matthew24-101256-firewall -> coiled-dask-matthew24-101256-firewall]
Created FW rules: coiled-dask-matthew24-101256-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-matthew24-101256-cluster-firewall -> coiled-dask-matthew24-101256-cluster-firewall]
Created scheduler VM: coiled-dask-matthew24-101256-scheduler (type: t3a.medium, ip: ['3.239.222.81'])


In [4]:
# Connect this session to the cluster created above.
# NOTE(review): the version table printed below shows the local client on
# dask/distributed 2021.11.2 while the scheduler and workers run 2022.01.0
# (numpy differs too) -- align the environments to avoid mismatch issues.
client = dask.distributed.Client(cluster)


+-------------+-----------+-----------+-----------+
| Package     | client    | scheduler | workers   |
+-------------+-----------+-----------+-----------+
| dask        | 2021.11.2 | 2022.01.0 | 2022.01.0 |
| distributed | 2021.11.2 | 2022.01.0 | 2022.01.0 |
| numpy       | 1.22.0    | 1.21.5    | 1.21.5    |
+-------------+-----------+-----------+-----------+


In [5]:
ddf = dd.read_csv("s3://coiled-datasets/timeseries/20-years/csv/*.part")

In [6]:
%%time
# Same summary statistics as the interrupted local attempt, this time with a
# cluster client connected; the recorded run finished in 5min 10s wall time.
ddf.describe().compute()

CPU times: user 2.27 s, sys: 329 ms, total: 2.6 s
Wall time: 5min 10s


Unnamed: 0,id,x,y
count,662256000.0,662256000.0,662256000.0
mean,1000.0,-1.027206e-05,1.435176e-05
std,31.62324,0.577363,0.5773649
min,815.0,-1.0,-1.0
25%,979.0,-0.4963742,-0.4959386
50%,1000.0,0.004206484,0.003999444
75%,1021.0,0.5037865,0.504401
max,1193.0,1.0,1.0


In [7]:
ddf.npartitions

1095

In [8]:
# Total in-memory size of the S3 dataset in bytes (about 107.5 GB per the
# output below).
ddf.memory_usage(deep=True).sum().compute()

107514843852

In [9]:
ddf.memory_usage(deep=True).compute()

Index             140160
id            5298048000
name         41289103692
timestamp    50331456000
x             5298048000
y             5298048000
dtype: int64

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/distributed/comm/tcp.py", line 398, in connect
    stream = await self.client.connect(
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/powers/opt/miniconda3/envs/standard-coiled/lib/python3.9/asyncio/tasks.py", line 492, in wait_for
    fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of th