# Dask Example

This notebook follows the Dask example outlined in the [Dask tutorial overview](https://tutorial.dask.org/00_overview.html).

In [None]:
import dask.dataframe as dd
from dask.distributed import Client

In [None]:
# Create a dask.distributed Client with default arguments (no scheduler
# address is given here).
client = Client()
# Bare expression as the last line so the notebook renders the client summary.
client

In [None]:
# Open the tutorial's NYC taxi 2015 dataset from S3 as a Dask DataFrame.
# The glob matches every `part.*.parquet` file under the dataset prefix.
ddf = dd.read_parquet(
    "s3://dask-data/nyc-taxi/nyc-2015.parquet/part.*.parquet",
    # Only the two columns used by the groupby in the next cell are requested.
    columns=["passenger_count", "tip_amount"],
    # Passed through to the S3 filesystem layer; presumably requests anonymous
    # (credential-less) access to the public bucket — confirm against s3fs docs.
    storage_options={"anon": True},
)

In [None]:
# Average tip for each passenger count. `.compute()` triggers execution and
# materialises the answer, which the bare `result` line then displays.
result = (
    ddf.groupby("passenger_count")["tip_amount"]
    .mean()
    .compute()
)
result

In [5]:
!pip install findspark

In [6]:
import findspark

# init() is called before pyspark is imported in the following cell.
# Presumably it locates the local Spark installation and makes pyspark
# importable from this kernel — confirm against the findspark documentation.
findspark.init()


In [8]:
from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists, otherwise build a new one.
# NOTE(review): the recorded run of this cell raised "RuntimeError: Java gateway
# process exited before sending its port number". That usually points to a
# missing or misconfigured Java runtime (check the Java install / JAVA_HOME) —
# confirm and fix the environment before relying on `spark`.
spark = SparkSession.builder.getOrCreate()

RuntimeError: Java gateway process exited before sending its port number

## Testing with our data

This section departs from the tutorial and tries to replicate its example using our own Parquet files.

In [None]:
# Read our local copy of the January 2009 yellow-taxi trips. Every column is
# loaded here; un-commenting `columns=` would restrict the read itself to the
# two columns used below.
# No `storage_options` are passed: `{"anon": True}` is an S3 setting carried
# over from the tutorial cell and has no meaning for a local filesystem path.
ddf = dd.read_parquet(
    '../../datasets/yellow_taxi_tripdata_2009-01.parquet',
    # columns=["Passenger_Count", "Tip_Amt"],
)

# Observation: without narrowing the columns first, the groupby below did not
# complete because it needed too much memory.
# result = ddf.groupby("Passenger_Count").Tip_Amt.mean().compute()

# Selecting only the two columns involved in the aggregation keeps the memory
# footprint small enough for Dask to compute the result.
result = ddf[['Passenger_Count', 'Tip_Amt']].groupby("Passenger_Count").Tip_Amt.mean().compute()
# Show the computed means, as the tutorial cell does.
result