# Embarrasingly parallel date operations

each row is processed independently

I want to calculate the number of days to and after the next holiday. As I am new to python I am unsure how to perform such a calculation efficiently

In [1]:
%load_ext Cython

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.cm as cm
import datetime as DT

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import time

load dates and holidays

In [2]:
datesFrame = pd.read_csv('myDates.csv')
datesFrame.myDates = pd.to_datetime(datesFrame.myDates)

holidays = pd.read_csv('holidays.csv')
holidays.day = pd.to_datetime(holidays.day)
holidays.type = holidays.type.astype("category")
holidays.name = holidays.name.astype("category")

holidays = holidays[holidays.apply(lambda x: (x.type == 'National holiday'), axis=1)]

In [3]:
def get_nearest_date(dates, pivot):
    nearest = min(dates, key=lambda x: abs(x - pivot))
    difference = abs(nearest - pivot)
    differenceAsDay = difference / np.timedelta64(1, 'D')
    return differenceAsDay.astype(int)

## approach 3 - parallelApply

Takes 4 min 18 seconds

In [4]:
from joblib import Parallel, delayed
import multiprocessing

def get_nearest_dateParallel(df):
    df['daysBeforeHoliday'] = df.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day < x], x))
    df['daysAfterHoliday']  =  df.myDates.apply(lambda x: get_nearest_date(holidays.day[holidays.day > x], x))
    return df

def applyParallel(dfGrouped, func):
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for name, group in dfGrouped)
    return pd.concat(retLst)

print ('parallel version: ')
# 2 min 30 seconds
%time result = applyParallel(datesFrame.groupby(datesFrame.index), get_nearest_dateParallel)

parallel version: 
CPU times: user 4min 6s, sys: 3.46 s, total: 4min 10s
Wall time: 5min 27s


In [29]:
result

Unnamed: 0,myDates,daysBeforeHoliday,daysAfterHoliday
0,2014-09-01,17,55
1,2014-03-01,54,51
2,2014-03-01,54,51
3,2014-01-18,12,93
4,2014-01-14,8,97
5,2014-01-23,17,88
6,2014-12-01,30,7
7,2014-03-01,54,51
8,2014-03-01,54,51
9,2014-04-06,90,15


## approach 5 sorted 
http://stackoverflow.com/questions/39284989/parallelize-pandas-apply?noredirect=1#comment65962782_39284989

Further improvement: How can I utilize pandas.tseries.holiday import USFederalHolidayCalendar for other national calendars instead of my CSV?

In [5]:
holidays = holidays.set_index('day')
holidays.index
holidays = holidays.drop(['name','type'], axis=1)

In [6]:
holidays.index

DatetimeIndex(['2013-01-01', '2013-01-06', '2013-04-01', '2013-05-01',
               '2013-05-09', '2013-05-20', '2013-05-30', '2013-08-15',
               '2013-10-26', '2013-11-01', '2013-12-08', '2013-12-25',
               '2013-12-26', '2014-01-01', '2014-01-06', '2014-04-21',
               '2014-05-01', '2014-05-29', '2014-06-09', '2014-06-19',
               '2014-08-15', '2014-10-26', '2014-11-01', '2014-12-08',
               '2014-12-25', '2014-12-26', '2015-01-01', '2015-01-06',
               '2015-04-06', '2015-05-01', '2015-05-14', '2015-05-25',
               '2015-06-04', '2015-08-15', '2015-10-26', '2015-11-01',
               '2015-12-08', '2015-12-25', '2015-12-26', '2016-01-01',
               '2016-01-06', '2016-03-28', '2016-05-01', '2016-05-05',
               '2016-05-16', '2016-05-26', '2016-08-15', '2016-10-26',
               '2016-11-01', '2016-12-08', '2016-12-25', '2016-12-26',
               '2017-01-01', '2017-01-06', '2017-04-17', '2017-05-01',
      

In [7]:
datesFrame['next_date'] = holidays.index[datesFrame.myDates.apply(holidays.index.searchsorted)]

In [8]:
result['checkCompare'] = datesFrame['next_date'] - datesFrame['myDates']

In [22]:
pd.to_datetime(result.checkCompare).day

AttributeError: 'Series' object has no attribute 'day'

In [34]:
result[result.checkCompare > pd.Timedelta('200 days')]

Unnamed: 0,myDates,daysBeforeHoliday,daysAfterHoliday,checkCompare


In [32]:
holidays[holidays.index[datesFrame.myDates.apply(holidays.index.searchsorted)]].value_counts()

KeyError: "DatetimeIndex(['2014-10-26', '2014-04-21', '2014-04-21', '2014-04-21',\n               '2014-04-21', '2014-04-21', '2014-12-08', '2014-04-21',\n               '2014-04-21', '2014-04-21',\n               ...\n               '2015-04-06', '2015-04-06', '2015-08-15', '2015-04-06',\n               '2015-04-06', '2015-04-06', '2015-04-06', '2015-04-06',\n               '2015-04-06', '2015-04-06'],\n              dtype='datetime64[ns]', name='day', length=178664, freq=None) not in index"

In [23]:
result.checkCompare

0        55 days
1        51 days
2        51 days
3        93 days
4        97 days
5        88 days
6         7 days
7        51 days
8        51 days
9        15 days
10       51 days
11       51 days
12       51 days
13       45 days
14       20 days
15       20 days
16       51 days
17       20 days
18       51 days
19       45 days
20        0 days
21       51 days
22       20 days
23       51 days
24       14 days
25       45 days
26       45 days
27       51 days
28       97 days
29       55 days
           ...  
178634    8 days
178635    4 days
178636   53 days
178637   53 days
178638   53 days
178639    8 days
178640    8 days
178641   25 days
178642   53 days
178643   53 days
178644   53 days
178645    8 days
178646   53 days
178647   53 days
178648   25 days
178649   53 days
178650   53 days
178651   53 days
178652   53 days
178653   53 days
178654   53 days
178655   53 days
178656   34 days
178657   53 days
178658   53 days
178659   53 days
178660   25 days
178661   53 da

In [11]:
result[result.checkCompare > 100]

TypeError: invalid type comparison

In [38]:
datesFrame['prev_date'] = holidays.index[datesFrame.myDates.apply(holidays.index) - 1]
result['checkBefore'] = datesFrame['prev_date'] - datesFrame['myDates']

TypeError: 'DatetimeIndex' object is not callable

In [40]:
indices = datesFrame.myDates.apply(holidays.index.searchsorted) #do this once
next_hols = holidays.index[indices] # next hols
prev_hols = holidays.index[indices - 1] # prev hols
after_next = holidays.index[indices + 1] # etc...

In [37]:
result

Unnamed: 0,myDates,daysBeforeHoliday,daysAfterHoliday,checkCompare,checkBefore
0,2014-09-01,17,55,55 days,-17 days
1,2014-03-01,54,51,51 days,-54 days
2,2014-03-01,54,51,51 days,-54 days
3,2014-01-18,12,93,93 days,-12 days
4,2014-01-14,8,97,97 days,-8 days
5,2014-01-23,17,88,88 days,-17 days
6,2014-12-01,30,7,7 days,-30 days
7,2014-03-01,54,51,51 days,-54 days
8,2014-03-01,54,51,51 days,-54 days
9,2014-04-06,90,15,15 days,-90 days


All dates already are a timestamp -> why does it not work to convert them?

In [16]:
print(datesFrame.myDates.dtype)
print(holidays.index.dtype)
# holidays.index.astype(np.int64)

datetime64[ns]
datetime64[ns]


In [26]:
# .astype(np.int64)
indices = holidays.index.searchsorted(datesFrame.myDates)
# array([1, 6, 9, 3])
next_nearest = holidays[indices]

TypeError: Cannot convert input to Timestamp

In [4]:
next_nearest

DatetimeIndex(['2016-01-18', '2016-10-10', '2016-12-26', '2016-05-30'], dtype='datetime64[ns]', freq=None)

In [5]:
next_nearest_diff = pd.to_timedelta(next_nearest.values - dates.values).days

In [6]:
next_nearest_diff

array([15, 31, 14, 88])

In [11]:
#from datetime import date
#from workalendar.europe import Germany
#cal = Germany()
#cal.holidays(2012)
#[(datetime.date(2012, 1, 1), 'New year'),
# (datetime.date(2012, 4, 9), 'Easter Monday'),
# (datetime.date(2012, 5, 1), 'Labour Day'),
# (datetime.date(2012, 5, 8), 'Victory in Europe Day'),
# (datetime.date(2012, 5, 17), 'Ascension Day'),
# (datetime.date(2012, 5, 28), 'Whit Monday'),
# (datetime.date(2012, 7, 14), 'Bastille Day'),
# (datetime.date(2012, 8, 15), 'Assumption of Mary to Heaven'),
# (datetime.date(2012, 11, 1), "All Saints' Day"),
# (datetime.date(2012, 11, 11), 'Armistice Day'),
# (datetime.date(2012, 12, 25), 'Christmas')]
#cal.is_working_day(date(2012, 12, 25))  # it's Christmas
#cal.is_working_day(date(2012, 12, 30))  # it's Sunday
#cal.is_working_day(date(2012, 12, 26))
#cal.add_working_days(date(2012, 12, 23), 5)  # 5 working days after Xmas
#datetime.date(2012, 12, 31)