In [1]:
# Uncomment and run once if the `holidays` package is not installed yet
# %pip install holidays

In [2]:
from datetime import date as date

import holidays
import numpy as np
import pandas as pd

In [69]:
# List every US holiday of 2001 in chronological order.
# The loop variables are deliberately not called `date`: that name is imported
# from `datetime` at the top of the notebook and must not be rebound here.
for holiday_date, holiday_name in sorted(holidays.US(years=2001).items()):
    print(holiday_date, holiday_name)

2001-01-01 New Year's Day
2001-01-15 Martin Luther King Jr. Day
2001-02-19 Washington's Birthday
2001-05-28 Memorial Day
2001-07-04 Independence Day
2001-09-03 Labor Day
2001-10-08 Columbus Day
2001-11-11 Veterans Day
2001-11-12 Veterans Day (Observed)
2001-11-22 Thanksgiving
2001-12-25 Christmas Day


In [84]:
# US holiday calendar created without an explicit `years` argument.
us_holidays = holidays.US()

'2014-01-01'

In [94]:
# Build the holiday calendar from a country code instead of the country class.
country_code = "US"
us_holidays = holidays.CountryHoliday(country_code)
# NOTE(review): `prov` is assigned a list after construction, and "MX" does not
# look like a US state code -- confirm these values are what the library expects.
us_holidays.prov = ["CA", "MX"]
# Look up a single date given as a string; no output was captured for this cell.
us_holidays.get("2020-1-2")

## Add a holiday flag to a dataframe

In [6]:
def add_holidays_to_dataframe(
    dataframe, date_field, country_code, province: list = None
):
    """
    Add a binary ``StateHoliday`` column marking the rows whose date is a holiday.

    params: dataframe: Pandas dataframe in which holidays are to be determined
            date_field: Name of the column holding the dates. A datetime column is
                        looked up as "YYYY-MM-DD" strings; any other column is looked
                        up with its raw values (e.g. date strings)
            country_code: Country code for which it is to be determined if its a holiday or not
            province: State codes (if necessary) for the country

    returns: None, Changes made directly to dataframe:
             * ``StateHoliday`` is added (or overwritten) with 1 for a holiday, else 0
             * ``date_field`` is converted with ``pd.to_datetime``
             Row order, the index, the order of the existing columns and the
             time-of-day of existing timestamps are left untouched.
    """
    country_holidays = holidays.CountryHoliday(country_code)
    # NOTE(review): `province` is a list but is handed to `prov` unchanged; whether
    # the installed `holidays` version honours a list here is not visible from
    # this notebook -- confirm.
    country_holidays.prov = province

    dates = dataframe[date_field]
    # Accept every datetime resolution / timezone, not only "datetime64[ns]".
    if pd.api.types.is_datetime64_any_dtype(dates):
        lookup_keys = dates.dt.strftime("%Y-%m-%d")
    else:
        lookup_keys = dates

    # A missing date can never be a holiday, so the calendar is not queried for it.
    flags = [
        int(pd.notna(key) and bool(country_holidays.get(key)))
        for key in lookup_keys
    ]

    # Assign positionally in one step: no set_index/reset_index round trip (which
    # reordered the columns and dropped the caller's index) and no chained
    # `.loc` assignment on a column.
    dataframe["StateHoliday"] = pd.Series(flags, dtype="int64").to_numpy()
    dataframe[date_field] = pd.to_datetime(dates)

In [86]:
# Sample frame: a single "date" column mixing holidays and ordinary days of 2001.
sample_days = ["20010101", "20010201", "20010704", "20010815", "20011122", "20011225"]
df = pd.to_datetime(pd.Series(sample_days), format="%Y%m%d").to_frame(name="date")
df

Unnamed: 0,date
0,2001-01-01
1,2001-02-01
2,2001-07-04
3,2001-08-15
4,2001-11-22
5,2001-12-25


In [87]:
# Flag the US holidays on the sample frame; the helper returns None and
# modifies `df` directly, so the frame is displayed afterwards.
add_holidays_to_dataframe(df, "date", "US", ["CA"])
df

Unnamed: 0,date,StateHoliday
0,2001-01-01,1
1,2001-02-01,0
2,2001-07-04,1
3,2001-08-15,0
4,2001-11-22,1
5,2001-12-25,1


## Get time elapsed after a holiday or before the next one

In [83]:
def get_elapsed_after(
    dataframe, date_field, calculated_field, group_field=None, prefix="After"
):
    """
    Add a column with the number of days between each row and the closest event.

    params: dataframe: Pandas dataframe in which elapsed time is to be calculated
            date_field: The field with the date (a datetime column)
            calculated_field: The field consisting of binaries which specify if it is to be counted as the event occurrence or not
            group_field: Optional field used to partition the rows; events of one
                         group are never used for the rows of another group
            prefix: ["Before", "After"]
                    After: Calculate days elapsed after an event
                    Before: Calculate days remaining before an event
                    Any other value leaves the current row order untouched and is
                    only used to name the new column.

    returns: None, Changes made directly to dataframe:
             * the rows are sorted in place (ascending dates for "After",
               descending dates for "Before", grouped first when group_field is set)
             * column ``prefix + calculated_field`` holds the absolute day count as
               floats; event rows and rows with no event seen yet in their group get 0.0
    """
    if prefix in ("After", "Before"):
        oldest_first = prefix == "After"
        if group_field:
            dataframe.sort_values(
                [group_field, date_field],
                ascending=[True, oldest_first],
                inplace=True,
            )
        else:
            dataframe.sort_values(
                [date_field], ascending=[oldest_first], inplace=True
            )

    groups = dataframe[group_field].values if group_field else None
    flags = dataframe[calculated_field].values
    days = dataframe[date_field].values

    elapsed_days = []
    last_event = None  # date of the most recent event in the current group
    for row, (is_event, day) in enumerate(zip(flags, days)):
        # A new group starts: forget the events of the previous one.
        if groups is not None and (row == 0 or groups[row] != groups[row - 1]):
            last_event = None

        if is_event:
            last_event = day

        # Without any event so far the row is measured against itself (0 days).
        reference = day if last_event is None else last_event
        gap = ((day - reference).astype("timedelta64[D]")) / np.timedelta64(1, "D")
        elapsed_days.append(abs(float(gap)))

    # An empty dataframe simply receives an empty column.
    dataframe[prefix + calculated_field] = elapsed_days

In [89]:
# Compute both directions explicitly so the displayed columns can be reproduced
# on a fresh kernel. "Before" runs first because it leaves `df` sorted
# newest-first; the "After" pass then restores ascending date order.
get_elapsed_after(df, "date", "StateHoliday", prefix="Before")
get_elapsed_after(df, "date", "StateHoliday")
df

Unnamed: 0,date,StateHoliday,BeforeStateHoliday,AfterStateHoliday
0,2001-01-01,1,0.0,0.0
1,2001-02-01,0,153.0,31.0
2,2001-07-04,1,0.0,0.0
3,2001-08-15,0,99.0,42.0
4,2001-11-22,1,0.0,0.0
5,2001-12-25,1,0.0,0.0


In [79]:
# TODO add dynamic holidays not present in dataframe already