In [2]:
import dask
import pandas as pd
import dask.dataframe as dd

In [3]:
# Build a six-row pandas frame, then wrap it in a Dask DataFrame split into
# two partitions.
df = pd.DataFrame(
    {
        "nums": [1, 2, 3, 4, 5, 6],
        "letters": ["a", "b", "c", "d", "e", "f"],
    }
)
ddf = dd.from_pandas(df, npartitions=2)

In [4]:
# Materialize and print each partition on its own to show how the rows
# were distributed across the partitions.
for i in range(ddf.npartitions):
    print(ddf.partitions[i].compute())

   nums letters
0     1       a
1     2       b
2     3       c
   nums letters
3     4       d
4     5       e
5     6       f


## Grab some rows

In [5]:
# Select rows by a list of index labels, then materialize the lazy result.
print(ddf.loc[[2, 5]].compute())

   nums letters
2     3       c
5     6       f


In [6]:
# Label-based slice: both endpoints are included (rows 3 through 5 are
# returned in the output below).
print(ddf.loc[3:5].compute())

   nums letters
3     4       d
4     5       e
5     6       f


In [7]:
# Check whether the partition index boundaries (divisions) are known.
ddf.known_divisions

True

In [8]:
# Show the index boundaries of the partitions.
ddf.divisions

(0, 3, 5)

## Grab a single column for certain rows

This returns a Dask Series object

In [9]:
# Select the "letters" column for index labels 2 and 5 and materialize it.
ddf.loc[[2, 5], "letters"].compute()

2    c
5    f
Name: letters, dtype: object

In [10]:
# Without compute() the selection stays lazy; only its structure is shown.
ddf.loc[[2, 5], "letters"]

Dask Series Structure:
npartitions=2
2    object
5       ...
5       ...
Name: letters, dtype: object
Dask Name: loc, 4 tasks

## Filter on column values

In [11]:
# Boolean-mask filter: keep only rows whose "nums" value is even.
print(ddf.loc[ddf["nums"] % 2 == 0].compute())

   nums letters
1     2       b
3     4       d
5     6       f


In [12]:
# Combine two conditions with &: "nums" must be even AND "letters" must be
# one of "b" or "f". Each condition is parenthesized because & binds more
# tightly than ==.
print(
    ddf.loc[
        (ddf["nums"] % 2 == 0) & (ddf["letters"].isin(["b", "f"]))
    ].compute()
)

   nums letters
1     2       b
5     6       f


## Big data filtering on cloud

Start by creating a cloud cluster.

In [13]:
# Create a Coiled cluster named "demo-cluster" with 5 workers.
# NOTE(review): no visible cell shuts this cluster down — consider calling
# cluster.close() once the notebook is finished.
import coiled
cluster = coiled.Cluster(name="demo-cluster", n_workers=5)

Output()

Found software environment build


In [14]:
# Import Client explicitly instead of reaching it through the attribute path
# `dask.distributed`: `import dask` by itself does not import that submodule,
# so the attribute lookup only works if some other import already loaded it.
from dask.distributed import Client

# Connect this session to the cluster so later computations run on it.
client = Client(cluster)

### Read in big dataset and explore

In [15]:
# Read the public timeseries Parquet dataset from S3. The storage options
# request anonymous (unsigned) access over SSL.
ddf = dd.read_parquet(
    "s3://coiled-datasets/timeseries/20-years/parquet",
    storage_options={
        "anon": True,
        "use_ssl": True,
    },
)

In [16]:
# Total number of rows in the dataset.
len(ddf)

662256000

In [17]:
# Number of partitions in the loaded Dask DataFrame.
ddf.npartitions

1095

In [18]:
# Preview the first few rows.
ddf.head()

Unnamed: 0_level_0,id,name,x,y
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-01 00:00:00,1008,Dan,-0.259374,-0.118314
2000-01-01 00:00:01,987,Patricia,0.069601,0.755351
2000-01-01 00:00:02,980,Zelda,-0.281843,-0.510507
2000-01-01 00:00:03,1020,Ursula,-0.569904,0.523132
2000-01-01 00:00:04,967,Michael,-0.25146,0.81093


### Filter based on column value

In [19]:
# Keep only rows whose "id" is greater than 1150. No compute() is called
# here, so this only defines the filtered frame.
res = ddf.loc[ddf["id"] > 1150]

In [20]:
# Count the rows that pass the filter.
len(res)

1103

The number of partitions is unchanged after filtering

In [21]:
# Partition count of the filtered frame.
res.npartitions

1095

Look at the number of rows in each partition. Lots of partitions are empty or contain only a couple of rows of data.

In [22]:
# Apply len to every partition and collect the per-partition row counts.
length_partitions = res.map_partitions(len).compute()

In [23]:
# Display the per-partition row counts.
length_partitions

0       0
1       1
2       0
3       0
4       2
       ..
1090    0
1091    2
1092    0
1093    0
1094    0
Length: 1095, dtype: int64

In [24]:
# Check what type compute() returned for the per-partition counts.
type(length_partitions)

pandas.core.series.Series

In [25]:
# Count the empty partitions with a vectorized comparison: the boolean mask
# sums to the number of zero-length partitions. int() keeps the displayed
# value a plain Python integer.
int((length_partitions == 0).sum())

390

### Performance benchmarking

Running computations on the data without repartitioning or persisting is slow. Let's see how much query times can be improved with these performance optimizations.

In [26]:
%%time
# Baseline: time the row count on the filtered frame before repartitioning
# or persisting.
len(res)

CPU times: user 235 ms, sys: 40.2 ms, total: 275 ms
Wall time: 57.4 s


1103

In [27]:
# Consolidate the filtered data into 2 partitions and keep it in cluster
# memory. npartitions is passed by keyword because the first positional
# parameter of repartition() is `divisions`, not the partition count.
# NOTE(review): with a distributed client, persist() appears to return before
# the work finishes — the first timing of len(res2) that follows is still
# slow, and only the repeat run is fast.
res2 = res.repartition(npartitions=2).persist()

In [28]:
%%time
# Time the row count on the repartitioned, persisted frame.
len(res2)

CPU times: user 148 ms, sys: 21.3 ms, total: 170 ms
Wall time: 56.4 s


1103

In [32]:
%%time
# Repeat of the previous measurement on the same persisted frame.
len(res2)

CPU times: user 22.7 ms, sys: 3.97 ms, total: 26.7 ms
Wall time: 298 ms


1103

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError


In [29]:
# Materialize the filtered result with compute().
res3 = res.compute()

In [30]:
%%time
# Time the row count on the already-computed result.
len(res3)

CPU times: user 9 µs, sys: 0 ns, total: 9 µs
Wall time: 11.2 µs


1103