# Embarrasingly parallel date operations

each row is processed independently

I want to calculate the number of days to and after the next holiday. As I am new to python I am unsure how to perform such a calculation efficiently

In [1]:
%load_ext Cython

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import datetime as DT

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import time

load dates and holidays

In [39]:
datesFrame = pd.read_csv('myDates.csv')
datesFrame.myDates = pd.to_datetime(datesFrame.myDates)

holidays = pd.read_csv('holidays.csv')
holidays.day = pd.to_datetime(holidays.day)
holidays.type = holidays.type.astype("category")
holidays.name = holidays.name.astype("category")

holidays = holidays[holidays.apply(lambda x: (x.type == 'National holiday'), axis=1)]

In [14]:
def get_nearest_date(dates, pivot):
    nearest = min(dates, key=lambda x: abs(x - pivot))
    difference = abs(nearest - pivot)
    differenceAsDay = difference / np.timedelta64(1, 'D')
    return differenceAsDay.astype(int)

## approach 1 - too slow

In [4]:
datesFrameSmall= datesFrame[: 10]
datesFrameSmall= datesFrameSmall.copy()
datesFrameSmall= datesFrameSmall.reindex()

In [15]:
#time1 = time.time()
%time datesFrameSmall['daysBeforeHoliday'] = datesFrameSmall.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day < x], x))

% time datesFrameSmall['daysAfterHoliday']  =  datesFrameSmall.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day > x], x))
#time2 = time.time()
#print ('function took %0.3f ms' % ((time2-time1)*1000.0))
datesFrameSmall

CPU times: user 14.4 ms, sys: 1.76 ms, total: 16.2 ms
Wall time: 15.2 ms
CPU times: user 37 ms, sys: 875 µs, total: 37.9 ms
Wall time: 38.4 ms


Unnamed: 0,myDates,daysBeforeHoliday,daysAfterHoliday
0,2014-09-01,17,55
1,2014-03-01,54,51
2,2014-03-01,54,51
3,2014-01-18,12,93
4,2014-01-14,8,97
5,2014-01-23,17,88
6,2014-12-01,30,7
7,2014-03-01,54,51
8,2014-03-01,54,51
9,2014-04-06,90,15


In [16]:
# disabled - takes too long
#time1 = time.time()
#datesFrame['daysBeforeHoliday'] = datesFrame.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day < x], x))
#datesFrame['daysAfterHoliday']  =  datesFrame.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day > x], x))
#time2 = time.time()
#print ('function took %0.3f ms' % ((time2-time1)*1000.0))
#datesFrame

## approach 2 - noReturnValues  -> parallel processing + cython

In [19]:
%%cython
################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import datetime as DT
import skutils

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import time
## strange - > have to re import all the things for cython. 
datesFrame = pd.read_csv('myDates.csv')
datesFrame.myDates = pd.to_datetime(datesFrame.myDates)

holidays = pd.read_csv('holidays.csv')
holidays.day = pd.to_datetime(holidays.day)
holidays.type = holidays.type.astype("category")
holidays.name = holidays.name.astype("category")

holidays = holidays[holidays.apply(lambda x: (x.type == 'National holiday'), axis=1)]

def get_nearest_date(dates, pivot):
    nearest = min(dates, key=lambda x: abs(x - pivot))
    difference = abs(nearest - pivot)
    return difference / np.timedelta64(1, 'D')
################
import multiprocessing
num_cpus = multiprocessing.cpu_count()

time1 = time.time()
n_entries = datesFrame.shape[0]
n_entrie_perfold = round(n_entries / num_cpus)
folds = [datesFrame[start:start+n_entrie_perfold] for start in range(0, n_entries + 1, n_entrie_perfold)]

def process_fold(df_per_fold):
    df_per_fold['daysBeforeHoliday'] = df_per_fold.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day < x], x))
    df_per_fold['daysAfterHoliday'] = df_per_fold.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day > x], x))
    return df_per_fold
    
pool = multiprocessing.Pool(processes=num_cpus)  
pool.map(process_fold, folds)

time2 = time.time()
print ('function took %0.3f ms' % ((time2-time1)*1000.0))
datesFrame

Seems to parallelize well. Still it is not really "quick". Strange that cython does require everything in a single jupyter cell. 

However, there is no output in the datesframe. What am I doing wrong here?

## approach 3 - parallelApply

only for grouped data --> how to port to rows?

In [10]:
# from http://stackoverflow.com/questions/26187759/parallelize-apply-after-pandas-groupby/29281494#29281494
from joblib import Parallel, delayed
import multiprocessing

def tmpFunc(df):
    df['c'] = df.a + df.b
    return df

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

df = pd.DataFrame({'a': [6, 2, 2], 'b': [4, 5, 6]},index= ['g1', 'g1', 'g2'])
print ('parallel version: ')
print( applyParallel(df.groupby(df.index), tmpFunc))

print ('regular version: ')
print (df.groupby(df.index).apply(tmpFunc))

parallel version: 
    a  b
g1  6  4
g1  2  5
    a  b
g2  2  6
    a  b
g1  6  4
g1  2  5
g2  2  6
regular version: 
    a  b
g1  6  4
g1  2  5
    a  b
g1  6  4
g1  2  5
    a  b
g2  2  6
    a  b
g1  6  4
g1  2  5
g2  2  6


here my try to port it. But so far I do not know how to pass both 2 parameters

In [16]:
from joblib import Parallel, delayed
import multiprocessing

def get_nearest_dateParallel(df):
    df.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day < x], x))
    return df

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

print ('parallel version: ')
# 2 min 30 seconds
%time result = applyParallel(datesFrame.groupby(datesFrame.index), get_nearest_dateParallel)
result

parallel version: 
CPU times: user 2min 17s, sys: 4.24 s, total: 2min 21s
Wall time: 2min 26s


Unnamed: 0,myDates
0,2014-09-01
1,2014-03-01
2,2014-03-01
3,2014-01-18
4,2014-01-14
5,2014-01-23
6,2014-12-01
7,2014-03-01
8,2014-03-01
9,2014-04-06


In [17]:
datesFrame

Unnamed: 0,myDates
0,2014-09-01
1,2014-03-01
2,2014-03-01
3,2014-01-18
4,2014-01-14
5,2014-01-23
6,2014-12-01
7,2014-03-01
8,2014-03-01
9,2014-04-06


## approach 4 - pool

did not have too much time to look into that option. But if approach 3 works that looks much cleaner

In [None]:
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [None]:
def multiply_columns(data):
    data['length_of_word'] = data['species'].apply(lambda x: len(x))
    return data
    
iris = parallelize_dataframe(iris, multiply_columns)

## approach 5 sorted 
http://stackoverflow.com/questions/39284989/parallelize-pandas-apply?noredirect=1#comment65962782_39284989

In [14]:
import pandas as pd

dates = pd.to_datetime(['2016-01-03', '2016-09-09', '2016-12-12', '2016-03-03'])
from pandas.tseries.holiday import USFederalHolidayCalendar

holiday_calendar = USFederalHolidayCalendar()
holidays2 = holiday_calendar.holidays('2016-01-01')

In [42]:
#holidays = holidays.set_index('day')
#holidays = holidays.drop(['name','type'], axis=1)
#holidays

In [49]:
h2 = holidays.copy()
h2 = h2.set_index('day')
h2.index

DatetimeIndex(['2013-01-01', '2013-01-06', '2013-04-01', '2013-05-01',
               '2013-05-09', '2013-05-20', '2013-05-30', '2013-08-15',
               '2013-10-26', '2013-11-01', '2013-12-08', '2013-12-25',
               '2013-12-26', '2014-01-01', '2014-01-06', '2014-04-21',
               '2014-05-01', '2014-05-29', '2014-06-09', '2014-06-19',
               '2014-08-15', '2014-10-26', '2014-11-01', '2014-12-08',
               '2014-12-25', '2014-12-26', '2015-01-01', '2015-01-06',
               '2015-04-06', '2015-05-01', '2015-05-14', '2015-05-25',
               '2015-06-04', '2015-08-15', '2015-10-26', '2015-11-01',
               '2015-12-08', '2015-12-25', '2015-12-26', '2016-01-01',
               '2016-01-06', '2016-03-28', '2016-05-01', '2016-05-05',
               '2016-05-16', '2016-05-26', '2016-08-15', '2016-10-26',
               '2016-11-01', '2016-12-08', '2016-12-25', '2016-12-26',
               '2017-01-01', '2017-01-06', '2017-04-17', '2017-05-01',
      

In [50]:
indices = holidays.index.searchsorted(dates)
# array([1, 6, 9, 3])
next_nearest = holidays[indices]

TypeError: invalid type promotion

In [4]:
next_nearest

DatetimeIndex(['2016-01-18', '2016-10-10', '2016-12-26', '2016-05-30'], dtype='datetime64[ns]', freq=None)

In [5]:
next_nearest_diff = pd.to_timedelta(next_nearest.values - dates.values).days

In [6]:
next_nearest_diff

array([15, 31, 14, 88])

In [11]:
from datetime import date
from workalendar.europe import Germany
cal = Germany()
cal.holidays(2012)
[(datetime.date(2012, 1, 1), 'New year'),
 (datetime.date(2012, 4, 9), 'Easter Monday'),
 (datetime.date(2012, 5, 1), 'Labour Day'),
 (datetime.date(2012, 5, 8), 'Victory in Europe Day'),
 (datetime.date(2012, 5, 17), 'Ascension Day'),
 (datetime.date(2012, 5, 28), 'Whit Monday'),
 (datetime.date(2012, 7, 14), 'Bastille Day'),
 (datetime.date(2012, 8, 15), 'Assumption of Mary to Heaven'),
 (datetime.date(2012, 11, 1), "All Saints' Day"),
 (datetime.date(2012, 11, 11), 'Armistice Day'),
 (datetime.date(2012, 12, 25), 'Christmas')]
#cal.is_working_day(date(2012, 12, 25))  # it's Christmas
#cal.is_working_day(date(2012, 12, 30))  # it's Sunday
#cal.is_working_day(date(2012, 12, 26))
#cal.add_working_days(date(2012, 12, 23), 5)  # 5 working days after Xmas
#datetime.date(2012, 12, 31)

NameError: name 'datetime' is not defined