In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import datetime
import random
import string
import matplotlib.pyplot as plt
import seaborn as sns


from numba import jit
from multiprocessing import cpu_count
from dask.multiprocessing import get

nCores = cpu_count()
sns.set()

## Initializing Data

The computer I primarily use has 32GB of RAM (DDR4-3200,CAS14).  Additionally, I created a 64GB swap file partition on a Samsung 960 Evo M.2 Flash Drive (if anyone has any experience using Intel Optane drive for this, let me know about your experiences).  I wanted to create a dataframe that exceeded 32GB in memory to test the efficacy of Pandas vs Dask vs Spark.  The following parameters should accomplish that with 100 million rows.

In [28]:
nRows = 10000000
startdate = datetime.date(1900,1,1)
geoCenter = (40.723270, -73.988371)

The "data" contains a random date between 1900 and 2000, a random float between 0 and 1, a random int between 0 and 3333, a "categorical" string of [a-z], and two random geo points, one slightly off the other.

The random int column, c2, contains NaN values, 2% of the total.

The categorical column, c3, also contains NaN values, 2% of the total.

In [29]:
### initializing random data
data = pd.DataFrame({'date':[startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)],
                     'c1':np.random.rand(nRows),
                     'c2':np.random.randint(0,3333,nRows),
                     'c3':[random.choice(string.ascii_letters).lower() for i in range(nRows)],
                     'lat_1':np.random.rand(nRows) + geoCenter[0],
                     'lon_1':np.random.rand(nRows) + geoCenter[1]
                    })

data['lat_2'] = data['lat_1']
data['lon_2'] = data['lon_1'] + 0.003

data.date = pd.to_datetime(data.date)

### picking random null values
data.loc[data.sample(frac=.02).index,'c2'] = np.nan
data.loc[data.sample(frac=.02).index,'c3'] = np.nan

In [30]:
data.to_feather('/tmp/data')

In [None]:
data = pd.read_feather('/tmp/data')

## Data Pre-Processing Steps

### Dates

In column `date` we have datetime objects as if they had been parsed by pandas `read_csv` with `parse_dates`.  These are YYYY-MM-DD style, with no additional time information.

In most machine learning algorithims, we would want to separate out the individual components of a date into their own feature.  For this date, we will create five features: the year, month, week, day, and day of the week.

### Filling Missing Values

For missing values in column `c2` we're going to impute the mean value.  Additionally, we're going to create a new feature `c2_isnull` so that our models will be able to, hopefully, pick up on the fact that value is artificial.

We'll deal with the missing values in column `c3` slightly differently, using a keyword to signify an unknown quantity.

### Encoding Categorical Variables

Scikit-learn comes with a `LabelEncoder` but we'll be using one from scratch.  Unfortunately, scikit-learn's encoder does not play well with new/missing values.  

We know that column `c3` contains letters `a-z`, but let's say that in our train-test split, our training set was missing the letter `z`.  `LabelEncoder` throws an error when attempting to encode `z` in our test set.  Instead, we would rather a separate value be encoded, like `-1`, to signify that the true encoding is unknown.  Likewise, since this column contains missing values, we would also impute `-1` for nulls in our training and test set.

## Python, Pandas, Single-Processor

The following function extracts the various parts of time from a datetime column and returns the original dataframe with the new features:

In [36]:
### input: dataframe, column of datetime objects, desired list of time-interval features
### output: dataframe with new time-interval features appended
def date_extractor(df, dateCol, interval=['year','month','week','day','dayofweek']):
    df.is_copy = False # we're not dealing with copies, this avoids the warning
    for i in interval:
        df.loc[:, i] = eval('df.%s.dt.%s'% (dateCol, i))
    df.drop(dateCol, axis=1, inplace=True)
    
    return 'Added %s as Features' % ' '.join(interval)

This next function fills our missing values and creates an addition feature: `feat_isnull`

In [5]:
### input: dataframe, numerical columns with null values
### output: dataframe with null values imputed with median and a new feature indicating that entry was null
def impute_null(df, cols):
    for col in cols:
        # creating the new feature which indicates isnull
        feat_is_null = col + '_isnull'
        df.loc[:, feat_is_null] = np.int8(df[col].isnull())
        
         # imputing median
        impute_value = np.nanmedian(df[col])
        df.loc[:, col].fillna(impute_value, inplace=True)
    
    return 'Nulls Imputed on %s' % ' '.join(cols)

For categorical features, we'll treat them slightly differently.

In [6]:
### input: dataframe, categorical column with null values, string to signify missing value
### output: dataframe with null values imputed with 'UNK' and a new feature indicating entry was null
def impute_null_cat(df, cols, naStr):
    for col in cols:
        # creating the new feature which indicates isnull
        feat_is_null = col + '_isnull'
        df.loc[:, feat_is_null] = np.int8(df[col].isnull())
        
        # imputing missing code
        df.loc[:, col].fillna(naStr, inplace=True)
        
    return 'Nulls Imputed on %s' % ' '.join(cols)

The following functions deal with encoding categorical features.

In [7]:
### input: dataframe, name of single categorical column
### output: dictionary representing previous levels and new numerical encodings
def cat_encode_train(df, col):
    keys = set(df[col])
    values = np.arange(0, len(keys))
    
    return dict(zip(keys,values))

### input: dataframe, name of single categorical column, dictionary of encodings, string to use as unknown
### output: dataframe with categorical values encoded
### note: you probably want to match the unknown str with the string used as null in impute_null_cat
def cat_encoder(df, col, dict_enc, unkStr):
    # need to replace unknown values with unkStr
    df.loc[~data[col].isin(set(dict_enc.keys())), col] = unkStr

    df.loc[:,col].replace(dict_enc, inplace=True)
    
    return '%s Encoded' % col

In [31]:
results = pd.DataFrame(columns = ['pd_date',
                                  'pd_impute',
                                  'pd_encode',
                                  'pd_total',
                                  'nRows'
                                 ])

for i,nIter in enumerate(10 ** np.arange(5,8)): 
    print('Size: %s'% nIter)
    tmp_ = %timeit -n 1 -r 3 -o date_extractor(data.head(nIter), 'date')
    out1_ = tmp_.average
    
    tmp_ = %timeit -n 1 -r 3 -o impute_null(data, ['c2'])
    out2_ = tmp_.average
    tmp_ = %timeit -n 1 -r 3 -o impute_null_cat(data, ['c3'], 'UNK')
    out2_ += tmp_.average
    
    tmp_ = %timeit -n 1 -r 3 -o dict_ = cat_encode_train(data, 'c3')
    out3_ = tmp_.average
    tmp_ = %timeit -n 1 -r 3 -o cat_encoder(data, 'c3', cat_encode_train(data,'c3'), 'UNK')
    out3_ += tmp_.average
    
    row = {'pd_date':out1_,
           'pd_impute':out2_,
           'pd_encode':out3_,
           'pd_total':out1_ + out2_ + out3_,
           'nRows':nIter
          }
    
    results.loc[i] = row

Size: 100000
73.2 ms ± 7.31 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
The slowest run took 7.54 times longer than the fastest. This could mean that an intermediate result is being cached.
470 ms ± 452 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
987 ms ± 470 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
326 ms ± 78.7 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
The slowest run took 7.95 times longer than the fastest. This could mean that an intermediate result is being cached.
5.92 s ± 5.79 s per loop (mean ± std. dev. of 3 runs, 1 loop each)
Size: 1000000
745 ms ± 53.2 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
153 ms ± 12.4 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
14.3 ms ± 485 µs per loop (mean ± std. dev. of 3 runs, 1 loop each)
189 ms ± 3.33 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
1.7 s ± 16.4 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)
Size: 10000000
10.3 s ± 504 ms per loop (mean ± std. dev. of

In [None]:
fig, ax = plt.subplots()
ax.plot((results_dates.nRows), results_dates.pandas, label='Python')
ax.set_xlabel("Rows")
ax.set_ylabel("Time(Seconds)")
ax.set_xscale('log')
ax.axis([1,100000,0,.05])
ax.legend()
#fig.savefig('./python_numba_only.png',dpi=300)

In [None]:
from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster('local[*]').setAppName('parallel_preprocessing')
sc = SparkContext.getOrCreate(conf=conf)