In [1]:
import dask.dataframe as dd
import time
import random
import pandas as pd

In [2]:
ddf = dd.read_csv('random_people.csv')

In [3]:
ddf.head()

Unnamed: 0,name,surname,salary
0,Henry,Joneson,5000
1,Albert,Goodman,10000
2,William,Goodman,10000
3,John,Joneson,10000
4,Albert,Black,10000


In [4]:
df = pd.read_csv('random_people.csv')

In [5]:
df.head()

Unnamed: 0,name,surname,salary
0,Henry,Joneson,5000
1,Albert,Goodman,10000
2,William,Goodman,10000
3,John,Joneson,10000
4,Albert,Black,10000


In [6]:
def benchmark(function, df, compute=False):
    start = time.time()
    function(df, compute)
    end = time.time()
    print("{} seconds for '{}' '{}'".format((end - start), df.__module__.split('.')[0], function.__name__))

In [7]:
def add_bonus(df, compute=False):
    df['bonus'] = df['salary'] * 0.2

#### simple test for adding new column 

In [8]:
benchmark(add_bonus, ddf)
benchmark(add_bonus, df)

0.011136054992675781 seconds for 'dask' 'add_bonus'
0.002496004104614258 seconds for 'pandas' 'add_bonus'


#### create large dataframe 

In [9]:
df = pd.concat([df for _ in range(1000)])
df = pd.concat([df for _ in range(500)])

df.head()

Unnamed: 0,name,surname,salary,bonus
0,Henry,Joneson,5000,1000.0
1,Albert,Goodman,10000,2000.0
2,William,Goodman,10000,2000.0
3,John,Joneson,10000,2000.0
4,Albert,Black,10000,2000.0


In [10]:
df.shape

(25000000, 4)

In [27]:
ddf = dd.from_pandas(df, npartitions=16)

In [28]:
ddf.shape[0].compute(), ddf.shape[1]

(25000000, 5)

In [29]:
def get_mean(df, compute=False):
    x = df.salary.mean()
    if compute:
        x = x.compute()
    return x

def get_max(df, compute=False):
    x = df.salary.max()
    if compute:
        x = x.compute()
    return x

def get_sum(df, compute=False):
    x = df.salary.sum()
    if compute:
        x = x.compute()
    return x

def filter_test(df, compute=False):
    x = df[df['salary']>5000]
    return x

def run_benchmarks():
    for i,f in enumerate([add_bonus,
                          get_mean,
                          get_max,
                          get_sum,
                          filter_test
                         ]):
        benchmark(f, ddf, True)
        benchmark(f, df)
                          

In [30]:
def f(x):
    return (13*x+5)%7

def apply_func(df, compute=False):
    df['random']= df['salary'].apply(f)
    if compute:
        df.compute()
    
def value_count(df, compute=False):
    x = df.salary.value_counts()
    if compute:
        x.compute()
    return x

In [31]:
run_benchmarks()

0.008981943130493164 seconds for 'dask' 'add_bonus'
0.36530303955078125 seconds for 'pandas' 'add_bonus'
2.7083661556243896 seconds for 'dask' 'get_mean'
0.04411911964416504 seconds for 'pandas' 'get_mean'
2.500271797180176 seconds for 'dask' 'get_max'
0.03551602363586426 seconds for 'pandas' 'get_max'
2.437288999557495 seconds for 'dask' 'get_sum'
0.020888090133666992 seconds for 'pandas' 'get_sum'
0.002231121063232422 seconds for 'dask' 'filter_test'
2.2981791496276855 seconds for 'pandas' 'filter_test'


In [32]:
benchmark(apply_func, ddf, True)
benchmark(apply_func, df)
benchmark(value_count, ddf, True)
benchmark(value_count, df)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('salary', 'int64'))



25.038264989852905 seconds for 'dask' 'apply_func'
15.947418928146362 seconds for 'pandas' 'apply_func'
20.31215000152588 seconds for 'dask' 'value_count'
0.45989203453063965 seconds for 'pandas' 'value_count'


In [21]:
ddf.salary.value_counts().compute()

10000    8000000
12000    4000000
11000    3500000
9500     3000000
13500    2500000
5500     2500000
5000     1500000
Name: salary, dtype: int64

In [33]:
df.salary.value_counts()

10000    8000000
12000    4000000
11000    3500000
9500     3000000
13500    2500000
5500     2500000
5000     1500000
Name: salary, dtype: int64