# Benchmark: Koalas (PySpark) and Dask - Local execution
The benchmark was performed against the 2009 - 2013 Yellow Taxi Trip Records (157 GB) from NYC Taxi and Limousine Commission (TLC) Trip Record Data. We identified common operations from our pandas workloads such as basic calculations of statistics, join, filtering and grouping on this dataset.

The operations were measured with/without filter operations to consider real world workloads.

## Set-up


In [1]:
from pyspark.sql import SparkSession

# Initialize the Spark session (if not already initialized)
# spark = SparkSession.builder \
#     .appName("bdcc") \
#     .config("spark.driver.maxResultSize", "2g") \
#     .config("spark.executor.memory", "10g") \
#     .config("spark.executor.instances", "2") \
#     .getOrCreate()

# spark = SparkSession.builder \
#     .appName("bdcc") \
#     .config("spark.executor.memory", "4g") \
#     .config("spark.driver.maxResultSize", "4g") \
#     .config("spark.driver.memory", "20g") \
#     .config("spark.driver.cores", "4") \
#     .config('PYSPARK_PYTHON', 'python') \
#     .getOrCreate()


# Spark session running in local mode with 6 worker threads ("local[6]").
# NOTE(review): in local mode the executor presumably runs inside the driver JVM,
# so the spark.executor.* settings may have no effect — confirm.
spark = (
    SparkSession.builder
    .appName("bdcc")
    .master("local[6]")
    .config("spark.driver.memory", "20g")
    .config("spark.executor.memory", "20g")
    .config("spark.executor.cores", "6")
    .config("spark.driver.cores", "1")
    .getOrCreate()
)


# Switch the Databricks IO cache setting off, then echo the stored value back
spark.conf.set("spark.databricks.io.cache.enabled", "false")
print("spark.databricks.io.cache.enabled is %s" % spark.conf.get("spark.databricks.io.cache.enabled"))
print("Access spark session: http://localhost:4040 %s" % spark)

spark.databricks.io.cache.enabled is false
Access spark session: http://localhost:4040 <pyspark.sql.session.SparkSession object at 0x000001908C9FCD88>


In [2]:
# %pip install -U koalas dask[complete] numpy pandas pyarrow

In [3]:
import time

import pandas as pd
import numpy as np
import databricks.koalas as ks
import dask
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster

# Record the library versions this benchmark was run against
print('pandas version: %s' % pd.__version__)
print('numpy version: %s' % np.__version__)
print('koalas version: %s' % ks.__version__)
print('dask version: %s' % dask.__version__)

def benchmark(f, df, benchmarks, name, **kwargs):
    """Benchmark the given function against the given DataFrame.

    Parameters
    ----------
    f: function to benchmark; it is called as ``f(df, **kwargs)``
    df: data frame
    benchmarks: container for benchmark results; the elapsed time is appended
        to ``benchmarks['duration']`` and ``name`` to ``benchmarks['task']``
    name: task name
    **kwargs: extra keyword arguments forwarded unchanged to ``f``

    Returns
    -------
    Duration (in seconds) of the given operation
    """
    # perf_counter() is monotonic and high-resolution, whereas time.time()
    # follows the wall clock, which can be adjusted while the task is running.
    start_time = time.perf_counter()
    # Keep a reference to the result so that releasing it is not part of the
    # timed region.
    result = f(df, **kwargs)
    duration = time.perf_counter() - start_time
    benchmarks['duration'].append(duration)
    benchmarks['task'].append(name)
    print(f"{name} took: {duration} seconds")
    return duration
 
def get_results(benchmarks):
    """Build a pandas DataFrame with one column per key of ``benchmarks``."""
    results = pd.DataFrame(benchmarks)
    return results

pandas version: 1.1.5
numpy version: 1.19.5
koalas version: 1.7.0
dask version: 2021.03.0


## Dask

### Preparation

In [4]:
# Root folder of the benchmark datasets (relative to the notebook location)
DATASETS_DIR = '../../datasets'

# One local worker with 4 threads; the memory limit is set per worker
cluster = LocalCluster(n_workers=1, threads_per_worker=4, memory_limit='20GiB')
client = Client(cluster)

dask_data = dd.read_parquet(f'{DATASETS_DIR}/ks_taxi_parquet')

# Collects one duration (in seconds) and one task name per benchmarked operation
dask_benchmarks = dict(duration=[], task=[])
client

0,1
Client  Scheduler: tcp://127.0.0.1:61288  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 21.47 GB


### Standard operations

In [5]:
def read_file_parquet(df=None):
    """Open the taxi parquet dataset with Dask.

    ``df`` is not used; it is accepted so the function can be passed to
    ``benchmark``, which always calls ``f(df, **kwargs)``.
    """
    parquet_path = f'{DATASETS_DIR}/ks_taxi_parquet'
    return dd.read_parquet(parquet_path)
  
def count(df=None):
    """Return ``len(df)``, i.e. the number of rows of the frame."""
    n_rows = len(df)
    return n_rows
 
def count_index_length(df=None):
    """Return the length of the frame's index (``len(df.index)``)."""
    index = df.index
    return len(index)
 
def mean(df):
    """Compute the mean of the ``fare_amt`` column and return the computed value."""
    fare_mean = df["fare_amt"].mean()
    return fare_mean.compute()
 
def standard_deviation(df):
    """Compute the standard deviation of ``fare_amt`` and return the computed value."""
    fare_std = df["fare_amt"].std()
    return fare_std.compute()
 
def mean_of_sum(df):
    """Compute the mean of the row-wise sum ``fare_amt + tip_amt``."""
    total = df["fare_amt"] + df["tip_amt"]
    return total.mean().compute()
 
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def sum_columns(df):
#     return (df.fare_amt + df.tip_amt).compute()
 
def mean_of_product(df):
    """Compute the mean of the row-wise product ``fare_amt * tip_amt``."""
    product = df["fare_amt"] * df["tip_amt"]
    return product.mean().compute()
 
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def product_columns(df):
#     return (df.fare_amt * df.tip_amt).compute()
  
def value_counts(df):
    """Count the occurrences of each distinct ``fare_amt`` value and compute the result."""
    fare_counts = df["fare_amt"].value_counts()
    return fare_counts.compute()
  
def mean_of_complicated_arithmetic_operation(df):
    """Compute the mean of a trigonometric expression over the trip coordinates.

    The expression combines ``start_lon``/``start_lat``/``end_lon``/``end_lat``
    (converted with ``np.pi/180``) through sin, cos, sqrt and arctan2.
    NOTE(review): it resembles the haversine central-angle formula, but the
    cosine factors are taken of the longitudes — presumably irrelevant because
    it only serves as an arithmetic workload; confirm.
    """
    lon_start, lat_start = df.start_lon, df.start_lat
    lon_end, lat_end = df.end_lon, df.end_lat
    # The operation order inside each term is kept exactly as written so the
    # floating-point results do not change.
    lon_term = np.sin((lon_end-lon_start)/2*np.pi/180)**2
    cos_product = np.cos(lon_start*np.pi/180)*np.cos(lon_end*np.pi/180)
    lat_term = np.sin((lat_end-lat_start)/2*np.pi/180)**2
    inner = lon_term + cos_product * lat_term
    angle = 2 * np.arctan2(np.sqrt(inner), np.sqrt(1-inner))
    return angle.mean().compute()
  
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def complicated_arithmetic_operation(df):
#     theta_1 = df.start_lon
#     phi_1 = df.start_lat
#     theta_2 = df.end_lon
#     phi_2 = df.end_lat
#     temp = (np.sin((theta_2-theta_1)/2*np.pi/180)**2
#            + np.cos(theta_1*np.pi/180)*np.cos(theta_2*np.pi/180) * np.sin((phi_2-phi_1)/2*np.pi/180)**2)
#     ret = 2 * np.arctan2(np.sqrt(temp), np.sqrt(1-temp))
#     return ret.compute()
  
def groupby_statistics(df):
    """Group by ``passenger_count`` and compute mean and std of ``fare_amt`` and ``tip_amt``.

    Returns the computed aggregation result.
    """
    agg_spec = {column: ['mean', 'std'] for column in ('fare_amt', 'tip_amt')}
    grouped = df.groupby(by='passenger_count')
    return grouped.agg(agg_spec).compute()
  
# Per-passenger-count statistics, passed as ``other`` to the join benchmarks
other = groupby_statistics(dask_data)
# Flatten the two-level (column, statistic) labels into "column_statistic" names
other.columns = pd.Index(['_'.join(label) for label in other.columns.tolist()])
 
def join_count(df, other):
    """Merge ``df`` with ``other`` on their indexes and return the length of the result."""
    joined = dd.merge(df, other, left_index=True, right_index=True)
    return len(joined)
 
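# In our opinion, it doesn't make sense to materialize a full result in memory,
# as we usually save these onto files or something similar (pagination), and this
# method requires the whole joined result to be in memory.
# NOTE: unlike the commented-out column-materializing variants above, this
# function is kept active and is still benchmarked.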
def join_data(df, other):
    """Merge ``df`` with ``other`` on their indexes and return the computed result."""
    joined = dd.merge(df, other, left_index=True, right_index=True)
    return joined.compute()

In [6]:
# Run every standard operation once against the full Dask dataset.
# Each call appends one duration and one task name to dask_benchmarks; the
# calls are kept in this exact order because it is part of the measurement.
# 'read file' passes df=None since read_file_parquet ignores its argument.
benchmark(read_file_parquet, df=None, benchmarks=dask_benchmarks, name='read file')
benchmark(count, df=dask_data, benchmarks=dask_benchmarks, name='count')
benchmark(count_index_length, df=dask_data, benchmarks=dask_benchmarks, name='count index length')
benchmark(mean, df=dask_data, benchmarks=dask_benchmarks, name='mean')
benchmark(standard_deviation, df=dask_data, benchmarks=dask_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=dask_data, benchmarks=dask_benchmarks, name='mean of columns addition')
# benchmark(sum_columns, df=dask_data, benchmarks=dask_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=dask_data, benchmarks=dask_benchmarks, name='mean of columns multiplication')
# benchmark(product_columns, df=dask_data, benchmarks=dask_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=dask_data, benchmarks=dask_benchmarks, name='value counts')
benchmark(mean_of_complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='mean of complex arithmetic ops')
# benchmark(complicated_arithmetic_operation, df=dask_data, benchmarks=dask_benchmarks, name='complex arithmetic ops')
benchmark(groupby_statistics, df=dask_data, benchmarks=dask_benchmarks, name='groupby statistics')
# `other=other` is forwarded through benchmark's **kwargs to the join functions
benchmark(join_count, dask_data, benchmarks=dask_benchmarks, name='join count', other=other)
# Last expression of the cell: its returned duration is shown as the cell output
benchmark(join_data, dask_data, benchmarks=dask_benchmarks, name='join', other=other)

read file took: 0.012467145919799805 seconds
count took: 3.402935266494751 seconds
count index length took: 21.7066810131073 seconds
mean took: 4.680126428604126 seconds
standard deviation took: 6.021027326583862 seconds
mean of columns addition took: 6.579668283462524 seconds
mean of columns multiplication took: 6.468385934829712 seconds
value counts took: 4.679720878601074 seconds
mean of complex arithmetic ops took: 20.302884340286255 seconds
groupby statistics took: 25.221879243850708 seconds
join count took: 29.049068927764893 seconds
join took: 25.331951141357422 seconds


25.331951141357422

### Operations with filtering

In [7]:
# Boolean mask over dask_data: rows whose tip_amt is between 1 and 5 (inclusive)
expr_filter = (dask_data.tip_amt >= 1) & (dask_data.tip_amt <= 5)
 
def filter_data(df, low=1, high=5):
    """Keep only the rows of ``df`` whose ``tip_amt`` lies in ``[low, high]``.

    The mask is built from ``df`` itself, so the function really filters the
    frame it is given instead of depending on a module-level mask that was
    computed from one particular frame.

    Parameters
    ----------
    df: data frame with a ``tip_amt`` column
    low: inclusive lower bound on ``tip_amt`` (default 1)
    high: inclusive upper bound on ``tip_amt`` (default 5)

    Returns
    -------
    The rows of ``df`` that satisfy ``low <= tip_amt <= high``
    """
    tip = df.tip_amt
    return df[(tip >= low) & (tip <= high)]
  
# Filtered version of the full dataset, used as input for the "filtered" benchmarks
dask_filtered = filter_data(dask_data)

In [8]:
# Repeat the standard operations on the filtered Dask frame; results are
# appended to dask_benchmarks under 'filtered ...' task names.
benchmark(count, dask_filtered, benchmarks=dask_benchmarks, name='filtered count')
benchmark(count_index_length, dask_filtered, benchmarks=dask_benchmarks, name='filtered count index length')
benchmark(mean, dask_filtered, benchmarks=dask_benchmarks, name='filtered mean')
benchmark(standard_deviation, dask_filtered, benchmarks=dask_benchmarks, name='filtered standard deviation')
benchmark(mean_of_sum, dask_filtered, benchmarks=dask_benchmarks, name ='filtered mean of columns addition')
# benchmark(sum_columns, df=dask_filtered, benchmarks=dask_benchmarks, name='filtered addition of columns')
benchmark(mean_of_product, dask_filtered, benchmarks=dask_benchmarks, name ='filtered mean of columns multiplication')
# benchmark(product_columns, df=dask_filtered, benchmarks=dask_benchmarks, name='filtered multiplication of columns')
benchmark(mean_of_complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered mean of complex arithmetic ops')
# benchmark(complicated_arithmetic_operation, dask_filtered, benchmarks=dask_benchmarks, name='filtered complex arithmetic ops')
benchmark(value_counts, dask_filtered, benchmarks=dask_benchmarks, name ='filtered value counts')
benchmark(groupby_statistics, dask_filtered, benchmarks=dask_benchmarks, name='filtered groupby statistics')
 
# Rebuild the join's right-hand side from the filtered data (outside the timed
# calls) and flatten its two-level column labels into "column_statistic" names
other = groupby_statistics(dask_filtered)
other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
 
benchmark(join_count, dask_filtered, benchmarks=dask_benchmarks, name='filtered join count', other=other)
# Last expression of the cell: its returned duration is shown as the cell output
benchmark(join_data, dask_filtered, benchmarks=dask_benchmarks, name='filtered join', other=other)

filtered count took: 25.656850814819336 seconds
filtered count index length took: 24.860188961029053 seconds
filtered mean took: 26.241296768188477 seconds
filtered standard deviation took: 26.413559198379517 seconds
filtered mean of columns addition took: 25.802118062973022 seconds
filtered mean of columns multiplication took: 25.537301540374756 seconds
filtered mean of complex arithmetic ops took: 29.425952196121216 seconds
filtered value counts took: 25.944531202316284 seconds
filtered groupby statistics took: 27.237637758255005 seconds
filtered join count took: 27.612762451171875 seconds
filtered join took: 27.089094638824463 seconds


27.089094638824463

In [9]:
# Restart the Dask client's workers before the Koalas section starts.
# NOTE(review): the client/cluster is restarted, not closed, so the Dask worker
# presumably stays alive while the Koalas benchmarks run — confirm this is intended.
client.restart()



0,1
Client  Scheduler: tcp://127.0.0.1:61288  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 21.47 GB


## Koalas

### Preparation

In [10]:
koalas_data = ks.read_parquet(f'{DATASETS_DIR}/ks_taxi_parquet')

# Collects one duration (in seconds) and one task name per benchmarked operation
koalas_benchmarks = dict(duration=[], task=[])

### Standard Operations

In [11]:
def read_file_parquet(df=None):
    """Open the taxi parquet dataset with Koalas.

    ``df`` is not used; it is accepted so the function can be passed to
    ``benchmark``, which always calls ``f(df, **kwargs)``.
    """
    parquet_path = f'{DATASETS_DIR}/ks_taxi_parquet'
    return ks.read_parquet(parquet_path)
  
def count(df=None):
    """Return ``len(df)``, i.e. the number of rows of the frame."""
    row_count = len(df)
    return row_count
 
def count_index_length(df=None):
    """Return the length of the frame's index (``len(df.index)``)."""
    frame_index = df.index
    return len(frame_index)
 
def mean(df):
    """Return the mean of the ``fare_amt`` column."""
    fares = df["fare_amt"]
    return fares.mean()
 
def standard_deviation(df):
    """Return the standard deviation of the ``fare_amt`` column."""
    fares = df["fare_amt"]
    return fares.std()
 
def mean_of_sum(df):
    """Return the mean of the row-wise sum ``fare_amt + tip_amt``."""
    total = df["fare_amt"] + df["tip_amt"]
    return total.mean()
 
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def sum_columns(df):
#     x = df.fare_amt + df.tip_amt
#     x.to_pandas()
#     return x
 
def mean_of_product(df):
    """Return the mean of the row-wise product ``fare_amt * tip_amt``."""
    product = df["fare_amt"] * df["tip_amt"]
    return product.mean()
 
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def product_columns(df):
#     x = df.fare_amt * df.tip_amt
#     x.to_pandas()
#     return x
 
def value_counts(df):
    """Count the occurrences of each distinct ``fare_amt`` value.

    ``to_pandas()`` is called on the counts and its result discarded —
    presumably to force the computation to run inside the timed call. The
    object returned is the original (non-converted) counts object.
    """
    fare_counts = df["fare_amt"].value_counts()
    fare_counts.to_pandas()
    return fare_counts
  
# In our opinion, it doesn't make sense to compute a column and show it in memory
# As we usually save these onto files or something similar (pagination) and this
# method would require for all the column to be in memory
# def complicated_arithmetic_operation(df):
#     theta_1 = df.start_lon
#     phi_1 = df.start_lat
#     theta_2 = df.end_lon
#     phi_2 = df.end_lat
#     temp = (np.sin((theta_2 - theta_1) / 2 * np.pi / 180) ** 2
#            + np.cos(theta_1 * np.pi / 180) * np.cos(theta_2 * np.pi / 180) * np.sin((phi_2 - phi_1) / 2 * np.pi / 180) ** 2)
#     ret = np.multiply(np.arctan2(np.sqrt(temp), np.sqrt(1-temp)),2)
#     ret.to_pandas()
#     return ret
  
def mean_of_complicated_arithmetic_operation(df):
    """Return the mean of a trigonometric expression over the trip coordinates.

    The expression combines ``start_lon``/``start_lat``/``end_lon``/``end_lat``
    (converted with ``np.pi / 180``) through sin, cos, sqrt and arctan2.
    NOTE(review): it resembles the haversine central-angle formula, but the
    cosine factors are taken of the longitudes — presumably irrelevant because
    it only serves as an arithmetic workload; confirm.
    """
    lon_start, lat_start = df.start_lon, df.start_lat
    lon_end, lat_end = df.end_lon, df.end_lat
    # The operation order inside each term is kept exactly as written so the
    # floating-point results do not change.
    lon_term = np.sin((lon_end - lon_start) / 2 * np.pi / 180) ** 2
    cos_product = np.cos(lon_start * np.pi / 180) * np.cos(lon_end * np.pi / 180)
    lat_term = np.sin((lat_end - lat_start) / 2 * np.pi / 180) ** 2
    inner = lon_term + cos_product * lat_term
    angle = np.multiply(np.arctan2(np.sqrt(inner), np.sqrt(1-inner)),2)
    return angle.mean()
  
def groupby_statistics(df):
    """Group by ``passenger_count`` and aggregate mean and std of ``fare_amt`` and ``tip_amt``.

    ``to_pandas()`` is called on the aggregate and its result discarded —
    presumably to force the computation to run inside the timed call. The
    object returned is the original (non-converted) aggregate.
    """
    agg_spec = {column: ['mean', 'std'] for column in ('fare_amt', 'tip_amt')}
    stats = df.groupby(by='passenger_count').agg(agg_spec)
    stats.to_pandas()
    return stats
  
# Per-passenger-count statistics, converted to pandas and wrapped again as a
# Koalas frame; passed as ``other`` to the join benchmarks
stats_pdf = groupby_statistics(koalas_data).to_pandas()
other = ks.DataFrame(stats_pdf)
# Flatten the two-level (column, statistic) labels into "column_statistic" names
other.columns = pd.Index(['_'.join(label) for label in other.columns.tolist()])
 
# The "other.spark.hint('broadcast')" requires the environment variable PYSPARK_PYTHON=python for it to work
# issue: https://stackoverflow.com/questions/53252181/python-worker-failed-to-connect-back
def join_count(df, other):
    """Merge ``df`` with a broadcast-hinted ``other`` on their indexes and return the row count."""
    hinted_other = other.spark.hint("broadcast")
    joined = df.merge(hinted_other, left_index=True, right_index=True)
    return joined.shape[0]
 
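# In our opinion, it doesn't make sense to materialize a full result in memory,
# as we usually save these onto files or something similar (pagination), and this
# method requires the whole joined result to be in memory.
# NOTE: unlike the commented-out column-materializing variants above, this
# function is kept active and is still benchmarked.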
def join_data(df, other):
    """Merge ``df`` with a broadcast-hinted ``other`` on their indexes.

    ``to_pandas()`` is called on the merge result and its output discarded —
    presumably to force the join to run inside the timed call. The object
    returned is the original (non-converted) merge result.
    """
    hinted_other = other.spark.hint("broadcast")
    joined = df.merge(hinted_other, left_index=True, right_index=True)
    joined.to_pandas()
    return joined


In [12]:
# Run every standard operation once against the full Koalas dataset.
# Each call appends one duration and one task name to koalas_benchmarks; the
# calls are kept in this exact order because it is part of the measurement.
# 'read file' passes df=None since read_file_parquet ignores its argument.
benchmark(read_file_parquet, df=None, benchmarks=koalas_benchmarks, name='read file')
benchmark(count, df=koalas_data, benchmarks=koalas_benchmarks, name='count')
benchmark(count_index_length, df=koalas_data, benchmarks=koalas_benchmarks, name='count index length')
benchmark(mean, df=koalas_data, benchmarks=koalas_benchmarks, name='mean')
benchmark(standard_deviation, df=koalas_data, benchmarks=koalas_benchmarks, name='standard deviation')
benchmark(mean_of_sum, df=koalas_data, benchmarks=koalas_benchmarks, name='mean of columns addition')
# benchmark(sum_columns, df=koalas_data, benchmarks=koalas_benchmarks, name='addition of columns')
benchmark(mean_of_product, df=koalas_data, benchmarks=koalas_benchmarks, name='mean of columns multiplication')
# benchmark(product_columns, df=koalas_data, benchmarks=koalas_benchmarks, name='multiplication of columns')
benchmark(value_counts, df=koalas_data, benchmarks=koalas_benchmarks, name='value counts')
# benchmark(complicated_arithmetic_operation, df=koalas_data, benchmarks=koalas_benchmarks, name='complex arithmetic ops')
benchmark(mean_of_complicated_arithmetic_operation, df=koalas_data, benchmarks=koalas_benchmarks, name='mean of complex arithmetic ops')
benchmark(groupby_statistics, df=koalas_data, benchmarks=koalas_benchmarks, name='groupby statistics')
# `other=other` is forwarded through benchmark's **kwargs to the join functions
benchmark(join_count, koalas_data, benchmarks=koalas_benchmarks, name='join count', other=other)    # For this one to work, the environment variable "PYSPARK_PYTHON" must be set to "python"
# Last expression of the cell: its returned duration is shown as the cell output
benchmark(join_data, koalas_data, benchmarks=koalas_benchmarks, name='join', other=other)

read file took: 0.20592594146728516 seconds
count took: 0.4872777462005615 seconds
count index length took: 0.3835182189941406 seconds
mean took: 0.7849950790405273 seconds
standard deviation took: 1.1820640563964844 seconds
mean of columns addition took: 1.2565875053405762 seconds
mean of columns multiplication took: 1.2703442573547363 seconds
value counts took: 10.476154088973999 seconds
mean of complex arithmetic ops took: 21.048777103424072 seconds
groupby statistics took: 5.161369323730469 seconds
join count took: 197.5554277896881 seconds
join took: 374.29529333114624 seconds


374.29529333114624

### Operations with filtering

In [13]:
# Boolean mask over koalas_data: rows whose tip_amt is between 1 and 5 (inclusive)
expr_filter = (koalas_data.tip_amt >= 1) & (koalas_data.tip_amt <= 5)
 
def filter_data(df, low=1, high=5):
    """Keep only the rows of ``df`` whose ``tip_amt`` lies in ``[low, high]``.

    The mask is built from ``df`` itself, so the function really filters the
    frame it is given instead of depending on a module-level mask that was
    computed from one particular frame.

    Parameters
    ----------
    df: data frame with a ``tip_amt`` column
    low: inclusive lower bound on ``tip_amt`` (default 1)
    high: inclusive upper bound on ``tip_amt`` (default 5)

    Returns
    -------
    The rows of ``df`` that satisfy ``low <= tip_amt <= high``
    """
    tip = df.tip_amt
    return df[(tip >= low) & (tip <= high)]
 
# Filtered version of the full dataset, used as input for the "filtered" benchmarks
koalas_filtered = filter_data(koalas_data)

In [14]:
# Repeat the standard operations on the filtered Koalas frame; results are
# appended to koalas_benchmarks under 'filtered ...' task names.
benchmark(count, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered count')
benchmark(count_index_length, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered count index length')
benchmark(mean, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered mean')
benchmark(standard_deviation, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered standard deviation')
benchmark(mean_of_sum, koalas_filtered, benchmarks=koalas_benchmarks, name ='filtered mean of columns addition')
# benchmark(sum_columns, df=koalas_filtered, benchmarks=koalas_benchmarks, name='filtered addition of columns')
benchmark(mean_of_product, koalas_filtered, benchmarks=koalas_benchmarks, name ='filtered mean of columns multiplication')
# benchmark(product_columns, df=koalas_filtered, benchmarks=koalas_benchmarks, name='filtered multiplication of columns')
benchmark(mean_of_complicated_arithmetic_operation, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered mean of complex arithmetic ops')
# benchmark(complicated_arithmetic_operation, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered complex arithmetic ops')
benchmark(value_counts, koalas_filtered, benchmarks=koalas_benchmarks, name ='filtered value counts')
benchmark(groupby_statistics, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered groupby statistics')
 
# Rebuild the join's right-hand side from the filtered data (outside the timed
# calls) and flatten its two-level column labels into "column_statistic" names
other = ks.DataFrame(groupby_statistics(koalas_filtered).to_pandas())
other.columns = pd.Index([e[0]+'_' + e[1] for e in other.columns.tolist()])
benchmark(join_count, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered join count', other=other)
# Last expression of the cell: its returned duration is shown as the cell output
benchmark(join_data, koalas_filtered, benchmarks=koalas_benchmarks, name='filtered join', other=other)

filtered count took: 1.3020622730255127 seconds
filtered count index length took: 0.9751574993133545 seconds
filtered mean took: 1.5368273258209229 seconds
filtered standard deviation took: 1.6064865589141846 seconds
filtered mean of columns addition took: 1.440826177597046 seconds
filtered mean of columns multiplication took: 1.4657783508300781 seconds
filtered mean of complex arithmetic ops took: 10.1757173538208 seconds
filtered value counts took: 8.297699451446533 seconds
filtered groupby statistics took: 3.2496275901794434 seconds
filtered join count took: 183.7692358493805 seconds
filtered join took: 401.5474445819855 seconds


401.5474445819855

## Result

In [15]:
# Index both result tables by task name so the durations line up row by row
koalas_res_temp = get_results(koalas_benchmarks).set_index('task')
dask_res_temp = get_results(dask_benchmarks).set_index('task')
# One duration column per engine, labelled 'koalas' and 'dask'
df = pd.concat(
    [koalas_res_temp['duration'], dask_res_temp['duration']],
    axis=1,
    keys=['koalas', 'dask'],
)

In [16]:
import os
from datetime import datetime
 
# The result file is named after the time of day (HHMMSS) at which the cell ran.
# NOTE(review): the name carries no date, so runs on different days could
# collide — consider adding the date if results are kept across days.
filename = f'{DATASETS_DIR}/../results/koalas-benchmark-no-parquet-cache/single_node_' + datetime.now().strftime("%H%M%S")
print(filename)
 
# Writing fails if the results folder is missing (e.g. on a fresh checkout),
# so create it first; exist_ok keeps re-runs harmless.
os.makedirs(os.path.dirname(filename), exist_ok=True)
df.to_parquet(filename)

../../datasets/../results/koalas-benchmark-no-parquet-cache/single_node_162240
