# Benchmark: Koalas (PySpark) and Dask - Data Preparation
The benchmark was run against the 2009-2013 Yellow Taxi Trip Records (157 GB) from the NYC Taxi and Limousine Commission (TLC) Trip Record Data.

The CSV files were downloaded into the Databricks File System (DBFS) and then converted into Parquet files via Koalas for better efficiency.

Download URL: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page

Data dictionary: https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

The scenario used in this benchmark was inspired by https://github.com/xdssio/big_data_benchmarks

# Download CSV files to DBFS

In [4]:
import os

# Build a stand-in for __file__ (presumably because a notebook does not
# provide one): the working directory plus a dummy "file" component, joined
# with whichever separator the working directory already uses.
_cwd = os.getcwd()
if '\\' in _cwd:
  __file__ = _cwd + '\\file'
else:
  __file__ = _cwd + '/file'

# Normalise to forward slashes, drop the last three path components (the
# dummy "file" plus two directory levels) and append "datasets".
_parts = __file__.replace('\\', '/').split('/')
OUTPUT_FOLDER = '/'.join(_parts[:-3]) + '/datasets'

In [6]:
# Map each monthly download URL to the path its CSV is saved to.
#
# The files are placed in the ``taxi_csv`` sub-folder because that is the
# directory the later cells list (dbutils.fs.ls) and read (ks.read_csv).
# Saving them directly into OUTPUT_FOLDER left that directory unpopulated.
CSV_FOLDER = f'{OUTPUT_FOLDER}/taxi_csv'
# urlretrieve opens the target file directly and does not create missing
# parent folders, so make sure the destination exists up front.
os.makedirs(CSV_FOLDER, exist_ok=True)

url_loc = {}
for year in range(2009, 2014):  # 2009 through 2013 inclusive
  for m in range(1, 13):
    month = f'{m:02d}'  # zero-padded, e.g. "01"
    fname = f'yellow_tripdata_{year}-{month}.csv'
    url = f'https://s3.amazonaws.com/nyc-tlc/trip+data/{fname}'
    loc = f'{CSV_FOLDER}/{fname}'
    url_loc[url] = loc

In [7]:
import urllib.request
import tqdm

# Fetch every mapped URL to its target path, showing a tqdm progress bar.
progress = tqdm.tqdm(url_loc.items())
for url, loc in progress:
  urllib.request.urlretrieve(url, loc)

  0%|          | 0/60 [00:00<?, ?it/s]


HTTPError: HTTP Error 403: Forbidden

In [None]:
import dbutils

# Add up the sizes of all entries listed under the CSV folder and report the
# total in gigabytes (bytes * 1e-9).
csv_listing = dbutils.fs.ls(f'{OUTPUT_FOLDER}/taxi_csv')
total_bytes = sum(entry.size for entry in csv_listing)
print('%s GBs data in total' % (total_bytes * 1e-9))

# Convert to Parquet files
Convert downloaded CSV files into Parquet files via Koalas for better efficiency.

In [None]:
import databricks.koalas as ks
 
# Select the 'distributed-sequence' default index type before any Koalas
# DataFrame is created in the cells below.
index_option = 'compute.default_index_type'
ks.set_option(index_option, 'distributed-sequence')

In [None]:
# Explicit dtypes for the numeric columns: the passenger count is an integer,
# and the coordinate and money columns are floats.
float_columns = [
  'Start_Lon',
  'Start_Lat',
  'End_Lon',
  'End_Lat',
  'Fare_Amt',
  'Tip_Amt',
  'Tolls_Amt',
  'Total_Amt',
]
dtype_dict = {'Passenger_Count': 'int64'}
dtype_dict.update(dict.fromkeys(float_columns, 'float64'))

# Load every CSV in the folder into a single Koalas DataFrame.
csv_dir = f'{OUTPUT_FOLDER}/taxi_csv'
ks_df = ks.read_csv(csv_dir, dtype=dtype_dict)


In [None]:
# Lower-case all column names; the filtering cell at the end of the notebook
# accesses the tip column as `tip_amt`.
lowered_names = ks_df.columns.str.lower()
ks_df.columns = lowered_names

In [None]:
# Display the column dtypes to check the result of the dtype mapping passed
# to read_csv and of the column renaming.
ks_df.dtypes

In [None]:
# Remove any previous Parquet output before rewriting it.
# The earlier shell command deleted a hard-coded /dbfs/FileStore path, which
# is not the folder the next cells write to and list. Target the same
# OUTPUT_FOLDER-based path they use instead; True requests recursive removal.
dbutils.fs.rm(f'{OUTPUT_FOLDER}/ks_taxi_parquet', True)

In [None]:
# Persist the DataFrame as Parquet, passing 'index' as the index column name.
parquet_dir = f'{OUTPUT_FOLDER}/ks_taxi_parquet'
ks_df.to_parquet(parquet_dir, index_col='index')

In [None]:
# Add up the sizes of all entries listed under the Parquet folder and report
# the total in gigabytes (bytes * 1e-9).
parquet_listing = dbutils.fs.ls(f'{OUTPUT_FOLDER}/ks_taxi_parquet')
total_bytes = sum(entry.size for entry in parquet_listing)
print('%s GBs data in total' % (total_bytes * 1e-9))

# Note: Filtering Size
(Size of filtered data / Size of total data) in the benchmark

In [None]:
import databricks.koalas as ks
# Load the Parquet dataset written earlier in the notebook.
parquet_dir = f'{OUTPUT_FOLDER}/ks_taxi_parquet'
koalas_data = ks.read_parquet(parquet_dir)

In [None]:
# Rows whose tip amount lies in the closed range [1, 5].
tips = koalas_data.tip_amt
at_least_one = tips >= 1
at_most_five = tips <= 5
expr_filter = at_least_one & at_most_five

# Report the filtered row count as a percentage of all rows.
print(f'In the benchmark, filtered data is {len(koalas_data[expr_filter]) / len(koalas_data) * 100}% of total data')