In [62]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import datetime
import random
import string
import matplotlib.pyplot as plt
import seaborn as sns


from numba import jit
from multiprocessing import cpu_count
from dask.multiprocessing import get

nCores = cpu_count()
sns.set()

%load_ext line_profiler

## Initializing Data

The computer I primarily use has 32GB of RAM (DDR4-3200,CAS14).  Additionally, I created a 64GB swap file partition on a Samsung 960 Evo M.2 Flash Drive (if anyone has any experience using Intel Optane drive for this, let me know about your experiences).  I wanted to create a dataframe that exceeded 32GB in memory to test the efficacy of Pandas vs Dask vs Spark.  The following parameters should accomplish that with 100 million rows.

The "data" contains a random date between 1900 and 2000, a random float between 0 and 1, a random int between 0 and 3333, a "categorical" string of [a-z], and two random geo points, one slightly off the other.

The random int column, c2, contains NaN values, 2% of the total.

The categorical column, c3, also contains NaN values, 2% of the total.

In [120]:
@jit
def gen_data(nRows, startdate, geoCenter):
    #date = [startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)]
    c1 = np.random.rand(nRows)
    c2 = np.random.randint(0,3333,nRows)
    c3 = np.random.choice(['a','b','c','d','e',
               'f','g','h','i','j',
               'k','l','m','n','o',
               'p','q','r','s','t',
               'u','v','w','x','y','z'],size=nRows)
    lat_1 = np.random.rand(nRows) + geoCenter[0]
    lon_1 = np.random.rand(nRows) + geoCenter[1]
                        
    return c1, c2, c3, lat_1, lon_1

gen_data(nRows,startdate,geoCenter)

(array([0.04105638, 0.62226792, 0.07476097, 0.98413162, 0.70467256,
        0.37125388, 0.20766254, 0.15509163, 0.31475989, 0.97976033]),
 array([  80, 3087, 1059, 2892, 2739, 2069, 2028,  104,  278,  736]),
 array(['i', 'z', 'j', 's', 'a', 'u', 'u', 'k', 'u', 'l'], dtype='<U1'),
 array([41.29579226, 41.00267357, 41.68695102, 41.06827429, 41.30578892,
        40.82994656, 41.41195584, 41.72318209, 41.0725217 , 41.33174169]),
 array([-73.42558559, -73.06823442, -73.63846969, -73.75346369,
        -73.8743908 , -73.25780152, -73.75313524, -73.19495687,
        -73.42089353, -73.95370145]))

In [124]:
nRows = 10000000

In [129]:
### initializing random data
%time data = pd.DataFrame({'date':[startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)],'c1':np.random.rand(nRows),'c2':np.random.randint(0,3333,nRows),'c3':[random.choice(string.ascii_letters).lower() for i in range(nRows)],'lat_1':np.random.rand(nRows) + geoCenter[0],'lon_1':np.random.rand(nRows) + geoCenter[1]})

CPU times: user 16.3 s, sys: 540 ms, total: 16.8 s
Wall time: 16.8 s


In [130]:
@jit
def gen_data(nRows, startdate, geoCenter):
    #date = [startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)]
    c1 = np.random.rand(nRows)
    c2 = np.random.randint(0,3333,nRows)
    c3 = np.random.choice(['a','b','c','d','e',
               'f','g','h','i','j',
               'k','l','m','n','o',
               'p','q','r','s','t',
               'u','v','w','x','y','z'],size=nRows)
    lat_1 = np.random.rand(nRows) + geoCenter[0]
    lon_1 = np.random.rand(nRows) + geoCenter[1]
                        
    return c1, c2, c3, lat_1, lon_1
    
### initializing random data
#data = pd.DataFrame({'date':[startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)],
#                     'c1':np.random.rand(nRows),
#                     'c2':np.random.randint(0,3333,nRows),
#                     'c3':[random.choice(string.ascii_letters).lower() for i in range(nRows)],
#                     'lat_1':np.random.rand(nRows) + geoCenter[0],
#                     'lon_1':np.random.rand(nRows) + geoCenter[1]
#                    })
%time c1, c2, c3, lat_1, lon_1 = gen_data(nRows, startdate, geoCenter)
%time date = [startdate + datetime.timedelta(int(i)) for i in np.random.randint(1,36524,size=nRows)]
data = pd.DataFrame({'date':date,
                     'c1':c1,
                     'c2':c2,
                     'c3':c3,
                     'lat_1':lat_1,
                     'lon_1':lon_1
                    })

data['lat_2'] = data['lat_1']
data['lon_2'] = data['lon_1'] + 0.003

data.date = pd.to_datetime(data.date)

### picking random null values
data.loc[data.sample(frac=.02).index,'c2'] = np.nan
data.loc[data.sample(frac=.02).index,'c3'] = np.nan

CPU times: user 980 ms, sys: 156 ms, total: 1.14 s
Wall time: 1.14 s
CPU times: user 6.23 s, sys: 159 ms, total: 6.39 s
Wall time: 6.39 s


In [123]:
data

Unnamed: 0,c1,c2,c3,date,lat_1,lon_1,lat_2,lon_2
0,0.475479,1.0,s,1938-09-17,41.581084,-73.593358,41.581084,-73.590358
1,0.362571,2061.0,u,1945-04-20,41.695716,-73.841167,41.695716,-73.838167
2,0.815786,836.0,a,1931-02-03,41.321212,-73.943495,41.321212,-73.940495
3,0.774659,2575.0,a,1999-12-13,41.321807,-73.906446,41.321807,-73.903446
4,0.226543,1266.0,g,1973-12-20,41.554218,-73.64911,41.554218,-73.64611
5,0.216186,565.0,a,1951-09-02,41.412864,-73.465834,41.412864,-73.462834
6,0.261915,3280.0,t,1976-12-22,40.835462,-73.335047,40.835462,-73.332047
7,0.438692,2589.0,l,1944-04-28,40.744131,-73.235372,40.744131,-73.232372
8,0.900466,692.0,i,1918-08-17,41.30757,-73.380536,41.30757,-73.377536
9,0.911113,1601.0,l,1983-11-01,41.390585,-73.776743,41.390585,-73.773743


In [10]:
data.to_feather('/tmp/data')

In [37]:
data = pd.read_feather('/tmp/data')

## Data Pre-Processing Steps

### Dates

In column `date` we have datetime objects as if they had been parsed by pandas `read_csv` with `parse_dates`.  These are YYYY-MM-DD style, with no additional time information.

In most machine learning algorithims, we would want to separate out the individual components of a date into their own feature.  For this date, we will create five features: the year, month, week, day, and day of the week.

### Filling Missing Values

For missing values in column `c2` we're going to impute the mean value.  Additionally, we're going to create a new feature `c2_isnull` so that our models will be able to, hopefully, pick up on the fact that value is artificial.

We'll deal with the missing values in column `c3` slightly differently, using a keyword to signify an unknown quantity.

### Encoding Categorical Variables

Scikit-learn comes with a `LabelEncoder` but we'll be using one from scratch.  Unfortunately, scikit-learn's encoder does not play well with new/missing values.  

We know that column `c3` contains letters `a-z`, but let's say that in our train-test split, our training set was missing the letter `z`.  `LabelEncoder` throws an error when attempting to encode `z` in our test set.  Instead, we would rather a separate value be encoded, like `-1`, to signify that the true encoding is unknown.  Likewise, since this column contains missing values, we would also impute `-1` for nulls in our training and test set.

## Python, Pandas, Single-Processor

The following function extracts the various parts of time from a datetime column and returns the original dataframe with the new features:

In [None]:
date_extractor()

In [131]:
def date_extractor(df, dateCol, interval=['year','month','week','day','dayofweek']):
    '''
    input: dataframe, column of datetime objects, desired list of time-interval features
    output: dataframe with new time-interval features appended
    '''
    df.is_copy = False # we're not dealing with copies, this avoids the warning
    for i in interval:
        df.loc[:, i] = eval('df.%s.dt.%s'% (dateCol, i))
    df.drop(dateCol, axis=1, inplace=True)
    
    return 'Added %s as Features' % ' '.join(interval)

This next function fills our missing values and creates an addition feature: `feat_isnull`

In [30]:
def impute_null(df, cols):
    '''
    input: dataframe, numerical columns with null values
    output: dataframe with null values imputed with median and a new feature indicating that entry was null
    '''
    df.is_copy = False # we're not dealing with copies, this avoids the warning
    for col in cols:
        # creating the new feature which indicates isnull
        feat_is_null = col + '_isnull'
        df.loc[:, feat_is_null] = np.int8(df[col].isnull())
        
         # imputing median
        impute_value = np.nanmedian(df[col])
        df.loc[:, col].fillna(impute_value, inplace=True)
    
    return 'Nulls Imputed on %s' % ' '.join(cols)

For categorical features, we'll treat them slightly differently.

In [31]:
def impute_null_cat(df, cols, naStr):
    '''
    input: dataframe, categorical column with null values, string to signify missing value
    output: dataframe with null values imputed with 'UNK' and a new feature indicating entry was null
    '''
    df.is_copy = False # we're not dealing with copies, this avoids the warning
    for col in cols:
        # creating the new feature which indicates isnull
        feat_is_null = col + '_isnull'
        df.loc[:, feat_is_null] = np.int8(df[col].isnull())
        
        # imputing missing code
        df.loc[:, col].fillna(naStr, inplace=True)
        
    return 'Nulls Imputed on %s' % ' '.join(cols)

The following functions deal with encoding categorical features.

In [53]:
def cat_encode_train(df, col):
    '''
    input: dataframe, name of single categorical column
    output: dictionary representing previous levels and new numerical encodings
    '''
    keys = set(df[col])
    values = np.arange(0, len(keys))
    
    return dict(zip(keys,values))


def cat_encoder(df, col, dict_enc, unkStr):
    '''
    input: dataframe, name of single categorical column, dictionary of encodings, string to use as unknown
    output: dataframe with categorical values encoded
    note: you probably want to match the unknown str with the string used as null in impute_null_cat
    '''
    df.is_copy = False # we're not dealing with copies, this avoids the warning
    # need to replace unknown values with unkStr
    df.loc[~data[col].isin(set(dict_enc.keys())), col] = unkStr

    df.loc[:,col] = df.loc[:,col].map(lambda x : dict_enc[x])
    
    return '%s Encoded' % col

In [77]:
results = pd.DataFrame(columns = ['pd_date',
                                  'pd_impute',
                                  'pd_encode',
                                  'pd_total',
                                  'nRows'
                                 ])

for i,nIter in enumerate(10 ** np.arange(5,8)): 
    ### Parameters for generating data
    startdate = datetime.date(1900,1,1)
    geoCenter = (40.723270, -73.988371)
    data = gen_data(nIter, startdate, geoCenter)
    
    print('Size: %s'% nIter)
    tmp_ = %timeit -n 1 -r 1 -o date_extractor(data, 'date')
    out1_ = tmp_.average
    
    tmp_ = %timeit -n 1 -r 1 -o impute_null(data, ['c2'])
    out2_ = tmp_.average
    tmp_ = %timeit -n 1 -r 1 -o impute_null_cat(data, ['c3'], 'UNK')
    out2_ += tmp_.average
    
    tmp_ = %timeit -n 1 -r 1 -o dict_ = cat_encode_train(data, 'c3')
    out3_ = tmp_.average
    tmp_ = %timeit -n 1 -r 1 -o cat_encoder(data, 'c3', cat_encode_train(data,'c3'), 'UNK')
    out3_ += tmp_.average
    
    row = {'pd_date':out1_,
           'pd_impute':out2_,
           'pd_encode':out3_,
           'pd_total':out1_ + out2_ + out3_,
           'nRows':nIter
          }
    
    results.loc[i] = row

Size: 100000
52.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
5.69 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
22.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
9.98 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
61.8 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 1000000
555 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
64.3 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
143 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
53.2 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
427 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Size: 10000000
6.17 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
643 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
1.34 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
477 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
4.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loo

In [78]:
results

Unnamed: 0,pd_date,pd_impute,pd_encode,pd_total,nRows
0,0.052303,0.02777,0.071768,0.151841,100000.0
1,0.555055,0.20699,0.480368,1.242412,1000000.0
2,6.166593,1.986323,4.575245,12.728162,10000000.0


In [67]:
%lprun -f cat_encoder cat_encoder(test_,'c3',dict_,'UNK')

In [43]:
%time dict_ = cat_encode_train(data.head(10000), 'c3')
#%time cat_encoder(data.head(10000), 'c3', dict_, 'UNK')

CPU times: user 1.13 ms, sys: 0 ns, total: 1.13 ms
Wall time: 1 ms


In [None]:
results.to_csv('./results.csv')

In [None]:
fig, ax = plt.subplots()
ax.plot((results_dates.nRows), results_dates.pandas, label='Python')
ax.set_xlabel("Rows")
ax.set_ylabel("Time(Seconds)")
ax.set_xscale('log')
ax.axis([1,100000,0,.05])
ax.legend()
#fig.savefig('./python_numba_only.png',dpi=300)

In [None]:
from pyspark import SparkContext, SparkConf

conf = SparkConf().setMaster('local[*]').setAppName('parallel_preprocessing')
sc = SparkContext.getOrCreate(conf=conf)