In [2]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import datetime
import random
import string
import matplotlib.pyplot as plt
import seaborn as sns

from numba import jit
from multiprocessing import cpu_count
from dask.multiprocessing import get
from functools import reduce

# Number of CPUs reported by multiprocessing for this machine.
nCores = cpu_count()
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

# Load the line_profiler IPython extension (provides the %lprun magic).
%load_ext line_profiler

## Initializing Data

The computer I primarily use has 32GB of RAM (DDR4-3200, CAS14).  Additionally, I created a 64GB swap file partition on a Samsung 960 Evo M.2 flash drive (if anyone has experience using an Intel Optane drive for this, let me know how it went).  I wanted to create a dataframe that exceeds 32GB in memory to test the efficacy of Pandas vs Dask vs Spark.  The following parameters should accomplish that with 100 million rows.

The "data" contains a random date between 1900 and 2000, a random float between 0 and 1, a random int between 0 and 3333, and a "categorical" string of [a-z].

The random int column, c2, contains NaN values, 2% of the total.

The categorical column, c3, also contains NaN values, 2% of the total.

In [3]:
def gen_data(nRows, startdate, seed=None):
    '''
    input: number of rows, start date (datetime.date or datetime), optional RNG seed
    output: dataframe with columns
        date - datetime, startdate plus a random offset of 1..36523 days
        c1   - random float in [0, 1)
        c2   - random integer value in [0, 3332] stored as float, 2% set to NaN
        c3   - random letter a-z, 2% set to NaN
    note: when seed is None the global numpy random state is used, as before;
          passing a seed makes the generated frame reproducible
    '''
    # The generation is plain vectorised numpy; it is intentionally not wrapped
    # in numba's @jit, which cannot compile a random choice over strings.
    rng = np.random if seed is None else np.random.RandomState(seed)

    c1 = rng.rand(nRows)
    # stored as float from the start so NaN can be assigned without an implicit upcast
    c2 = rng.randint(0, 3333, nRows).astype(np.float64)
    c3 = rng.choice(list('abcdefghijklmnopqrstuvwxyz'), size=nRows)

    # vectorised date arithmetic instead of one timedelta object per row
    day_offsets = rng.randint(1, 36524, size=nRows)
    date = pd.Timestamp(startdate) + pd.to_timedelta(day_offsets, unit='D')

    data = pd.DataFrame({'date': date,
                         'c1': c1,
                         'c2': c2,
                         'c3': c3
                        })

    ### picking random null values (two independent 2% samples)
    sample_state = None if seed is None else rng
    data.loc[data.sample(frac=.02, random_state=sample_state).index, 'c2'] = np.nan
    data.loc[data.sample(frac=.02, random_state=sample_state).index, 'c3'] = np.nan

    return data

## Python, Pandas, Single-Processor

The following function extracts the various parts of time from a datetime column and returns the original dataframe with the new features:

In [4]:
def date_extractor(df, dateCol, interval=('year','month','week','day','dayofweek')):
    '''
    input: dataframe, name of a datetime column, desired time-interval features
           (names of attributes of the pandas .dt accessor, e.g. 'year', 'quarter')
    output: the same dataframe, modified in place: one new column per requested
            feature is appended and the original datetime column is dropped
    note: 'week' / 'weekofyear' are the ISO week number; on pandas versions that
          provide .dt.isocalendar() the column has the nullable UInt32 dtype
    '''
    # Look the accessor up once; column access by key also works for column
    # names that are not valid Python identifiers (no eval needed).
    accessor = df[dateCol].dt
    for i in interval:
        if i in ('week', 'weekofyear') and hasattr(accessor, 'isocalendar'):
            # .dt.week / .dt.weekofyear are gone from recent pandas releases
            df[i] = accessor.isocalendar().week
        else:
            df[i] = getattr(accessor, i)
    df.drop(dateCol, axis=1, inplace=True)

    return df

This next function fills our missing values and creates an additional feature: `feat_isnull`

In [5]:
def impute_null(df, cols):
    '''
    input: dataframe, numerical columns with null values (list of names, or a single name)
    output: the same dataframe, modified in place: nulls are imputed with the column
            median and a new int8 feature '<col>_isnull' flags entries that were null
    '''
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        # creating the new feature which indicates isnull (before the nulls are filled)
        feat_is_null = col + '_isnull'
        df[feat_is_null] = df[col].isnull().astype(np.int8)

        # imputing median; Series.median skips NaN
        impute_value = df[col].median()
        # assign the filled column back: an in-place fillna on a df.loc[...] result
        # may act on a temporary and leave df untouched
        df[col] = df[col].fillna(impute_value)

    return df

For categorical features, we'll treat them slightly differently.

In [6]:
def impute_null_cat(df, cols, naStr):
    '''
    input: dataframe, categorical columns with null values (list of names, or a
           single name), string to signify a missing value
    output: the same dataframe, modified in place: nulls are replaced with naStr and
            a new int8 feature '<col>_isnull' flags entries that were null
    '''
    if isinstance(cols, str):
        cols = [cols]

    for col in cols:
        # creating the new feature which indicates isnull (before the nulls are filled)
        feat_is_null = col + '_isnull'
        df[feat_is_null] = df[col].isnull().astype(np.int8)

        # imputing missing code; assign back rather than relying on an in-place
        # fillna of a df.loc[...] result, which may act on a temporary
        df[col] = df[col].fillna(naStr)

    return df

The following functions deal with encoding categorical features.  We will not be using the sklearn encoder as it has issues dealing with previously unseen values.  For example, let's say you have a feature with 6 levels: `{A, B, C, D, E, F}`, however, `F` is relatively rare.  After creating a validation set, it turns out that `F` is not seen in the `x_train` but is in `x_valid`.  

Sklearn's encoder will not be able to "train" on `x_train` and properly convert `x_valid` due to the varying number of values.  In the functions below, instead, we pass `unkStr` to recode a previously unseen value.  For this example, we will encode these as we did missing values.

In [102]:
def cat_encode_train(df, col, unkStr):
    '''
    input: dataframe, name of single categorical column, string used for unknown values
    output: dictionary mapping every observed level (plus unkStr) to an integer code
    note: codes are assigned in sorted order of the levels, so the same data always
          yields the same mapping; null entries are not given a code of their own
    '''
    keys = set(df[col].dropna().unique())
    keys.add(unkStr)

    try:
        ordered = sorted(keys)
    except TypeError:
        # levels of mixed, mutually unorderable types: fall back to a stable textual order
        ordered = sorted(keys, key=repr)

    return {key: code for code, key in enumerate(ordered)}


def cat_encoder(df, col, dict_enc, unkStr):
    '''
    input: dataframe, name of single categorical column, dictionary of encodings, string to use as unknown
    output: the same dataframe, modified in place, with the categorical values replaced by their codes
    raises: KeyError if the column holds values missing from dict_enc and unkStr has no code either
    note: you probably want to match the unknown str with the string used as null in impute_null_cat
    '''
    # values (including nulls) that have no encoding are treated as unkStr
    unseen = ~df[col].isin(list(dict_enc))
    if unseen.any() and unkStr not in dict_enc:
        raise KeyError(unkStr)

    # replace the whole column so it takes the dtype of the codes instead of
    # keeping the original object dtype
    df[col] = df[col].where(~unseen, unkStr).map(dict_enc)

    return df

In [201]:
# One row per dataset size: seconds spent in each pandas stage, their sum,
# the row count, and the memory footprint of the processed frame.
results = pd.DataFrame(columns = ['pd_date',
                                  'pd_impute',
                                  'pd_encode',
                                  'pd_total',
                                  'nRows',
                                  'memory'
                                 ])

# parameters for generating data
startdate = datetime.date(1900,1,1)

# Benchmark at 10**5, 10**6 and 10**7 rows.
for i,nIter in enumerate(10 ** np.arange(5,8)): 
    data = gen_data(nIter, startdate)
    
    print('Size: %s'% nIter)
    # Every stage is timed with a single run (-n 1 -r 1); -o returns the timing
    # object so its average can be recorded. The stages are called on the same
    # `data` object one after another.
    tmp_ = %timeit -n 1 -r 1 -o date_extractor(data, 'date')
    out1_ = tmp_.average
    
    # Imputation time = numerical imputation + categorical imputation.
    tmp_ = %timeit -n 1 -r 1 -o impute_null(data, ['c2'])
    out2_ = tmp_.average
    tmp_ = %timeit -n 1 -r 1 -o impute_null_cat(data, ['c3'], 'UNK')
    out2_ += tmp_.average
    
    # Encoding time = building the mapping + applying it. The mapping is built a
    # second time outside the timer because the timed call's return value is not kept.
    tmp_ = %timeit -n 1 -r 1 -o cat_encode_train(data, 'c3', 'UNK')
    out3_ = tmp_.average
    dict_ = cat_encode_train(data, 'c3', 'UNK')
    tmp_ = %timeit -n 1 -r 1 -o cat_encoder(data, 'c3', dict_, 'UNK')
    out3_ += tmp_.average
    
    # memory: bytes reported by memory_usage() divided by 1e6
    row = {'pd_date':out1_,
           'pd_impute':out2_,
           'pd_encode':out3_,
           'pd_total':out1_ + out2_ + out3_,
           'nRows':nIter,
           'memory':data.memory_usage().sum() / 1e6
          }
    
    results.loc[i] = row

Size: 100000
39.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
3.86 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
8.48 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.17 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
39.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 1000000
323 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
22.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
67.5 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
12.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
338 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 10000000
4.58 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
387 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
825 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
137 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
3.51 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 l

In [50]:
results

Unnamed: 0,pd_date,pd_impute,pd_encode,pd_total,nRows,memory
0,0.043549,0.01344,0.041927,0.098916,100000.0,6.60008


## Dask, Multi-Processor

For this next section, we'll be using `map_partitions` to apply the previously written functions across data sets to parallelize their operations.  While we could rewrite these functions, this is the quickest way to get the benefit of multi-core parallel operation.

However, we're going to have to rewrite our `cat_encode_train`.  Our label encoder takes all of the possible values within a feature and creates a numerical mapping.  However, since our data will be split randomly into partitions, we cannot be certain that each partition will have an identical set of values.  In this example, I split the data into 6 partitions, but for this function, I do not want 6 potential encodings.

We'll change the function to, instead, aggregate all the potential encodings before creating a numerical mapping.

In [192]:
def cat_encode_train_dask(dd, col, unkStr):
    '''
    input: Dask Dataframe, name of single categorical column, string used for unknown values
    output: Dictionary containing categorical-to-numerical mappings
    note: The distinct levels are collected across all partitions before codes are
          assigned, so every partition is encoded with the same mapping. Codes follow
          the sorted order of the levels; null entries get no code of their own.
    note: The parameter name `dd` hides the dask.dataframe module alias inside this
          function; it is kept so existing keyword callers continue to work.
    '''
    # unique() is evaluated by Dask over the whole column, not per partition
    keys = set(dd[col].dropna().unique().compute())
    keys.add(unkStr)

    try:
        ordered = sorted(keys)
    except TypeError:
        # levels of mixed, mutually unorderable types: fall back to a stable textual order
        ordered = sorted(keys, key=repr)

    return {key: code for code, key in enumerate(ordered)}

In [203]:
# One row per dataset size: seconds attributed to each Dask stage, the total,
# and the row count. This rebinds the `results` name used by the pandas benchmark.
results = pd.DataFrame(columns = ['dd_date',
                                  'dd_impute',
                                  'dd_encode',
                                  'dd_total',
                                  'nRows'
                                 ])

# parameters for generating data
startdate = datetime.date(1900,1,1)

# Benchmark at 10**5, 10**6 and 10**7 rows.
for i,nIter in enumerate(10 ** np.arange(5,8)): 
    data_ = gen_data(nIter, startdate)
    # The Dask frame is rebuilt from the pandas frame `data_` before every
    # measurement so each stage starts from a fresh 6-partition collection.
    data = dd.from_pandas(data_,npartitions=6)
    
    print('Size: %s'% nIter)
    # Stage 1: date feature extraction on its own.
    # NOTE(review): this stage and the next pass num_workers=12 while the later
    # ones call compute() with defaults - confirm whether that is intentional.
    tmp_ = %timeit -n 1 -r 1 -o  data.\
                                    map_partitions(lambda df : date_extractor(df,'date')).\
                                    compute(num_workers=12)
    out1_ = tmp_.average
    data = dd.from_pandas(data_,npartitions=6)   
    
    # Stage 2: numerical imputation plus categorical imputation, timed separately and summed.
    tmp_ = %timeit -n 1 -r 1 -o  data.\
                                    map_partitions(lambda df : impute_null(df, ['c2'])).\
                                    compute(num_workers=12)
    out2_ = tmp_.average
    data = dd.from_pandas(data_,npartitions=6)
    tmp_ = %timeit -n 1 -r 1 -o  data.\
                                    map_partitions(lambda df : impute_null_cat(df, ['c3'], 'UNK')).\
                                    compute()
    out2_ += tmp_.average
    data = dd.from_pandas(data_,npartitions=6)
    
    # Stage 3a: time building the encoding dictionary on the un-imputed column.
    tmp_ = %timeit -n 1 -r 1 -o cat_encode_train_dask(data, 'c3', 'UNK')
    out3_ = tmp_.average
    data = dd.from_pandas(data_,npartitions=6)
    # The dictionary actually used below is built (untimed) from the imputed column.
    dict_ = cat_encode_train_dask(data.\
                                   map_partitions(lambda df : impute_null_cat(df, ['c3'], 'UNK')),
                                  'c3',
                                  'UNK')
    data = dd.from_pandas(data_,npartitions=6)
    # Stage 3b: time the complete pipeline (dates, both imputations, encoding) in one graph.
    tmp_ = %timeit -n 1 -r 1 -o data.\
                                    map_partitions(lambda df : date_extractor(df,'date')).\
                                    map_partitions(lambda df : impute_null(df, ['c2'])).\
                                    map_partitions(lambda df : impute_null_cat(df, ['c3'], 'UNK')).\
                                    map_partitions(lambda df: cat_encoder(df, 'c3', dict_, 'UNK')).\
                                    compute()
    out3_ += tmp_.average
    
    # out3_ = dictionary-building time + full-pipeline time; the encode figure is
    # derived by subtracting the separately measured date and impute timings.
    row = {'dd_date':out1_,
           'dd_impute':out2_,
           'dd_encode':out3_-out2_-out1_,
           'dd_total':out3_,
           'nRows':nIter,
          }
    
    results.loc[i] = row

Size: 100000
106 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
24.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
43.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
6.16 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
278 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 1000000
187 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
39.4 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
164 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
16 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
813 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 10000000
1.71 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
310 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
981 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
223 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
6.74 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop e

In [204]:
results

Unnamed: 0,dd_date,dd_impute,dd_encode,dd_total,nRows,memory
0,0.106063,0.067379,0.110574,0.284016,100000,"dd.Scalar<truediv..., dtype=float64>"
1,0.187082,0.202911,0.439158,0.829151,1000000,"dd.Scalar<truediv..., dtype=float64>"
2,1.709617,1.291369,3.959555,6.96054,10000000,"dd.Scalar<truediv..., dtype=float64>"


## Spark

For this section, we will need to rewrite much of the previous work to utilize Spark.  Additionally, we'll need to configure the various components of Spark when we initialize a Spark Context.

In [323]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Spark configuration for a single-machine run that uses every local core.
# The executor count is set through 'spark.executor.instances', the property name
# Spark reads for this setting.
# NOTE(review): with a local[*] master the work presumably runs inside the driver
# process, so the executor settings may have no effect here - confirm.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('parallel_preprocessing')
    .set('spark.driver.cores', '1')
    .set('spark.executor.instances', '5')
    .set('spark.driver.memory', '4G')
    .set('spark.executor.memory', '5G')
    .set('spark.driver.maxResultSize', '5G')
)

# Reuse an already running context if there is one; otherwise start one with `conf`.
sc = SparkContext.getOrCreate(conf=conf)
sqc = SQLContext.getOrCreate(sc)

In [210]:
# Small pandas sample used to build the Spark DataFrame.
data = gen_data(100_000,startdate)

# Explicit schema: c2 is a double because it carries NaN for missing entries.
data_schema = StructType([
                          StructField('c1',DoubleType(),True),
                          StructField('c2',DoubleType(),True),
                          StructField('c3',StringType(),True),
                          StructField('date',DateType(),True)
                         ])

# Hand the pandas columns over in schema order so each field lines up with its
# column regardless of the order in which gen_data created them.
schema_columns = [field.name for field in data_schema.fields]
data_spark = sqc.createDataFrame(data[schema_columns],data_schema)

Date extraction

In [326]:
# Preview of the date features computed with Spark's built-in column functions;
# 'dayofweek' is rendered as the full day name by the 'EEEE' format pattern.
(
    data_spark
    .withColumn('year', year('date'))
    .withColumn('month', month('date'))
    .withColumn('day', dayofmonth('date'))
    .withColumn('week', weekofyear('date'))
    .withColumn('dayofweek', date_format('date','EEEE'))
    .show()
)

+-------------------+------+---+----------+----+-----+---+----+---------+
|                 c1|    c2| c3|      date|year|month|day|week|dayofweek|
+-------------------+------+---+----------+----+-----+---+----+---------+
| 0.3366087583763079| 360.0|  g|1938-10-28|1938|   10| 28|  43|   Friday|
| 0.3527312924524688| 459.0|  v|1901-04-11|1901|    4| 11|  15| Thursday|
| 0.6492605202705966| 148.0|  e|1971-04-04|1971|    4|  4|  13|   Sunday|
| 0.5057923874073894|2765.0|NaN|1961-09-14|1961|    9| 14|  37| Thursday|
| 0.8096906716183114|1117.0|  n|1931-05-04|1931|    5|  4|  19|   Monday|
|  0.861992255824543|2375.0|  e|1952-10-14|1952|   10| 14|  42|  Tuesday|
| 0.5179602062487393|1551.0|  b|1950-04-15|1950|    4| 15|  15| Saturday|
| 0.7914145094368675|2967.0|  q|1938-12-27|1938|   12| 27|  52|  Tuesday|
| 0.3636863424960113|2996.0|  c|1908-12-09|1908|   12|  9|  50|Wednesday|
|  0.385381987473396| 960.0|  w|1916-07-27|1916|    7| 27|  30| Thursday|
|  0.299120805229263| 275.0|  o|1949-0

In [None]:
def spark_date_extractor(df, dateCol, interval=('year','month','week','day','dayofweek')):
    '''
    input: Spark Dataframe, name of a date column, desired list of time-interval features
    output: new Spark Dataframe with the time-interval features appended and the
            date column dropped
    raises: ValueError if a requested feature is not supported
    note: 'dayofweek' follows the pandas convention used by date_extractor
          (Monday=0 ... Sunday=6)
    '''
    from pyspark.sql import functions as F

    extractors = {
        'year': F.year,
        'quarter': F.quarter,
        'month': F.month,
        'week': F.weekofyear,
        'weekofyear': F.weekofyear,
        'day': F.dayofmonth,
        'dayofyear': F.dayofyear,
        # Spark's dayofweek counts 1 (Sunday) .. 7 (Saturday); shift it to 0 (Monday) .. 6 (Sunday)
        'dayofweek': lambda c: (F.dayofweek(c) + 5) % 7,
    }

    unsupported = [i for i in interval if i not in extractors]
    if unsupported:
        raise ValueError('unsupported interval(s): %s' % ', '.join(map(str, unsupported)))

    date_column = F.col(dateCol)
    for i in interval:
        # Spark Dataframes are immutable: each withColumn returns a new frame
        df = df.withColumn(i, extractors[i](date_column))

    return df.drop(dateCol)

Imputing nulls

In [317]:
def spark_impute_null(sdf, cols):
    '''
    input: Spark Dataframe, names of numerical columns with missing values
    output: Spark Dataframe in which each listed column has been passed through
            na.fill using the average of that column's non-NaN rows
    '''
    for name in cols:
        # average computed over the rows whose value is not NaN
        without_nan = sdf.filter(~isnan(name))
        first_row = without_nan.select(avg(name)).head()
        fill_value = first_row[0]

        sdf = sdf.na.fill({name: fill_value})

    return sdf


def spark_impute_null_cat(sdf, cols, unkStr):
    '''
    input: Spark Dataframe, categorical columns missing values, new value for missing
    output: Spark Dataframe with missing value replaced with unkStr
    note: an entry counts as missing when it is null or is exactly the string 'NaN';
          values that merely contain 'NaN' are left untouched
    '''
    from pyspark.sql import functions as F

    for name in cols:
        column = F.col(name)
        is_missing = column.isNull() | (column == 'NaN')
        # use the caller's unkStr as a literal value (no regex replacement syntax involved)
        sdf = sdf.withColumn(name, F.when(is_missing, F.lit(unkStr)).otherwise(column))

    return sdf

In [None]:
# Timing curve for the pandas date extraction on a log-scaled row axis.
# NOTE(review): `results_dates` is not defined in any visible cell - confirm where it comes from.
fig, ax = plt.subplots()
row_counts = results_dates.nRows
elapsed = results_dates.pandas
ax.plot(row_counts, elapsed, label='Python')
ax.set_xlabel("Rows")
ax.set_ylabel("Time(Seconds)")
ax.set_xscale('log')
# axis limits: x from 1 to 100000 rows, y from 0 to 0.05
axis_limits = [1,100000,0,.05]
ax.axis(axis_limits)
ax.legend()
#fig.savefig('./python_numba_only.png',dpi=300)

In [None]:
# Scratch cell: wrap `data` in a Dask DataFrame with 4 partitions.
test = dd.from_pandas(data,npartitions=4)

In [None]:
def addone(x):
    '''
    input: any value that supports `+ 1`
    output: the value plus one
    '''
    incremented = x + 1
    return incremented