# Init library

In [1]:
# Get the parent directory so that files outside the folder containing this notebook can be referenced easily

import os
import sys
from pathlib import Path

# Resolve the directory one level above the current working directory and
# append it to the import path, then echo it for a quick visual check.
working_dir = Path(os.getcwd()).resolve()
parent_dir = working_dir.parent
sys.path.append(str(parent_dir))

print(parent_dir)



O:\Projects\time_series_feature_engineering


In [2]:
#!/usr/bin/env python
# coding: utf-8

import numpy as np

import polars as pl

import holidays

# Read data

In [3]:
# Load the workbook that lives under the parent directory's data folder.
data_path = parent_dir / 'data/test_data_2020_202208.xlsx'
df = pl.read_excel(data_path)

# Report (rows, columns), then preview the first five rows.
print(df.shape)

df.head(5)

(988, 4)


date,x_1,x_2,label
date,i64,i64,i64
2022-09-14,125490,169835,108957173922
2022-09-13,107326,146494,97902777699
2022-09-12,93820,130855,95690875998
2022-09-11,64274,87798,58820280634
2022-09-10,66959,90096,69852043855


# Time-based features
- Holidays, plus the number of days until the next holiday and since the previous holiday
- Weekdays, weekends
- Seasons of year (quarters)
- Sales days (Black Friday, 11/11…)
- Day, month, quarter, year of date

In [5]:
# ====================================
# years in data
df_prep = df.with_columns(year=pl.col('date').dt.year())

# Series.unique() makes no promise about the order of its result, so sort
# explicitly: code that reads years[0] / years[-1] as the earliest / latest
# year must not depend on an implementation detail. Nulls are dropped so that
# a missing date cannot break the sort.
years = sorted(df_prep['year'].drop_nulls().unique().to_list())

# ====================================
# holidays in the selected years
vn_holidays = holidays.VN(years=years)

# Materialise the holiday dates as a plain list before handing them to is_in,
# rather than passing a dict keys view.
holiday_list = list(vn_holidays.keys())

df_prep = df_prep.with_columns(is_holiday=pl.col('date').is_in(holiday_list).cast(pl.Int8))


# ====================================
# weekday (1 = Monday ... 7 = Sunday), day, month and quarter of date
# These four expressions are independent of each other, so they are added in
# a single with_columns call; keyword order fixes the resulting column order.
df_prep = df_prep.with_columns(
    weekday=pl.col('date').dt.weekday().cast(pl.Int8),
    day=pl.col('date').dt.day().cast(pl.Int8),
    month=pl.col('date').dt.month().cast(pl.Int8),
    quarter=pl.col('date').dt.quarter().cast(pl.Int8),
)


# ====================================
# Sales days (Black Friday, 11/11…)
# The logic for sales days can be as elaborate as needed: either keep a fixed
# list of dates or derive them (e.g. the last Friday of a month). Here only
# "double dates" (1/1, 2/2, ..., 12/12) are flagged, i.e. day number == month number.
# This needs the `day` and `month` columns created above, hence the separate call.
df_prep = df_prep.with_columns(special_day=(pl.col('day') == pl.col('month')).cast(pl.Int8))


df_prep.head(14)

date,x_1,x_2,label,year,is_holiday,weekday,day,month,quarter,special_day
date,i64,i64,i64,i32,i8,i8,i8,i8,i8,i8
2022-09-14,125490,169835,108957173922,2022,0,3,14,9,3,0
2022-09-13,107326,146494,97902777699,2022,0,2,13,9,3,0
2022-09-12,93820,130855,95690875998,2022,0,1,12,9,3,0
2022-09-11,64274,87798,58820280634,2022,0,7,11,9,3,0
2022-09-10,66959,90096,69852043855,2022,0,6,10,9,3,0
…,…,…,…,…,…,…,…,…,…,…
2022-09-05,45267,56439,52145550109,2022,0,1,5,9,3,0
2022-09-04,21103,26208,22803785170,2022,0,7,4,9,3,0
2022-09-03,20778,26028,22707880783,2022,0,6,3,9,3,0
2022-09-02,18867,23122,19098035913,2022,1,5,2,9,3,0


In [6]:
# ====================================
# days to next holiday
# days since previous holiday

# Pad the year range by one year on each side:
#   - the following year supplies the "next holiday" for the last dates of the data,
#   - the preceding year supplies the "previous holiday" for the first dates.
# min()/max() are used instead of positional indexing so the padding is correct
# even when `years` is not sorted; the set removes any accidental duplicates.
years_adj = sorted({min(years) - 1, *years, max(years) + 1})

print(years_adj)

# Holiday calendar over the padded range, as a numpy array of holiday dates.
vn_holidays = holidays.VN(years=years_adj)
holidays_array = np.array(list(vn_holidays.keys()))

# Convert a single date to a plain number so it can be used in array arithmetic
def date_to_numeric(date):
    """Return the number of whole days between 1970-01-01 and `date` as a float.

    `date` may be anything `np.datetime64` accepts (ISO date string,
    `datetime.date`, `np.datetime64`, ...). Any sub-day part of the elapsed
    time is discarded by the cast to day resolution before the division.
    """
    epoch = np.datetime64('1970-01-01')
    elapsed = (np.datetime64(date) - epoch).astype('timedelta64[D]')
    one_day = np.timedelta64(1, 'D')
    return elapsed / one_day

# Function to compute nearest holidays
def _epoch_days(values):
    """Convert date-like values to whole days since 1970-01-01 as a float array."""
    as_days = np.array(values, dtype='datetime64[D]')
    return (as_days - np.datetime64('1970-01-01', 'D')) / np.timedelta64(1, 'D')


def _day_gap_series(name, day_gaps):
    """Build a nullable Int16 Series from day gaps; non-finite gaps become null."""
    # inf / NaN cannot be represented as Int16, so turn them into nulls first.
    finite_or_nan = np.where(np.isfinite(day_gaps), day_gaps, np.nan)
    return pl.Series(name=name, values=finite_or_nan).fill_nan(None).cast(pl.Int16)


def compute_nearest_holidays(df, holiday_dates=None):
    """Add the distance in days to the nearest past and future holiday.

    Parameters
    ----------
    df : polars DataFrame
        Must contain a `date` column.
    holiday_dates : iterable of date-like values, optional
        Holiday dates to measure against. When omitted, the module-level
        `holidays_array` is used, which keeps the original one-argument call
        working.

    Returns
    -------
    polars DataFrame
        `df` with two Int16 columns added:
        - `days_to_nearest_past`: days since the most recent holiday on or
          before the row's date (0 when the date itself is a holiday);
        - `days_to_nearest_future`: days until the first holiday strictly
          after the row's date.
        A value is null when no holiday exists on that side of the date.
    """
    if holiday_dates is None:
        holiday_dates = holidays_array

    # Convert both sides to day offsets as whole arrays (no per-row Python loop).
    dates_numeric = _epoch_days(df['date'].to_numpy())
    holidays_numeric = _epoch_days(list(holiday_dates))

    # diff_matrix[i, j] = date_i - holiday_j, in days
    # (positive: holiday is in the past, negative: holiday is in the future).
    diff_matrix = dates_numeric[:, None] - holidays_numeric[None, :]

    # Keep only the relevant side of each comparison; the other side is pushed
    # to +inf so it can never win the row-wise minimum.
    past_gaps = np.where(diff_matrix >= 0, diff_matrix, np.inf)
    future_gaps = np.where(diff_matrix < 0, -diff_matrix, np.inf)

    # `initial=np.inf` keeps the reduction well-defined when there are no
    # holidays at all (zero-length axis).
    nearest_past_days = past_gaps.min(axis=1, initial=np.inf)
    nearest_future_days = future_gaps.min(axis=1, initial=np.inf)

    # Create result DataFrame
    result_df = df.with_columns([
        _day_gap_series("days_to_nearest_past", nearest_past_days),
        _day_gap_series("days_to_nearest_future", nearest_future_days),
    ])

    return result_df

# Apply the function: df_prep is rebound to the frame returned by
# compute_nearest_holidays, which carries the additional
# `days_to_nearest_past` and `days_to_nearest_future` columns.
df_prep = compute_nearest_holidays(df_prep)

# Preview the first ten rows, including the new columns.
df_prep.head(10)

[2019, 2020, 2021, 2022, 2023]


date,x_1,x_2,label,year,is_holiday,weekday,day,month,quarter,special_day,days_to_nearest_past,days_to_nearest_future
date,i64,i64,i64,i32,i8,i8,i8,i8,i8,i8,i16,i16
2022-09-14,125490,169835,108957173922,2022,0,3,14,9,3,0,12,109
2022-09-13,107326,146494,97902777699,2022,0,2,13,9,3,0,11,110
2022-09-12,93820,130855,95690875998,2022,0,1,12,9,3,0,10,111
2022-09-11,64274,87798,58820280634,2022,0,7,11,9,3,0,9,112
2022-09-10,66959,90096,69852043855,2022,0,6,10,9,3,0,8,113
2022-09-09,72821,97114,79781916805,2022,0,5,9,9,3,1,7,114
2022-09-08,54406,71847,55782383425,2022,0,4,8,9,3,0,6,115
2022-09-07,59809,75899,56484560972,2022,0,3,7,9,3,0,5,116
2022-09-06,54644,69028,53346288660,2022,0,2,6,9,3,0,4,117
2022-09-05,45267,56439,52145550109,2022,0,1,5,9,3,0,3,118


In [7]:
# ====================================
# One-hot encode the calendar features: each distinct value of the listed
# columns becomes its own indicator column (weekday_1, day_14, month_9, ...).
calendar_columns = ['weekday', 'day', 'month', 'quarter']
df_prep = df_prep.to_dummies(columns=calendar_columns)

df_prep.head(10)

date,x_1,x_2,label,year,is_holiday,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,day_1,day_10,day_11,day_12,day_13,day_14,day_15,day_16,day_17,day_18,day_19,day_2,day_20,day_21,day_22,day_23,day_24,day_25,day_26,day_27,day_28,day_29,day_3,day_30,day_31,day_4,day_5,day_6,day_7,day_8,day_9,month_1,month_10,month_11,month_12,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,quarter_1,quarter_2,quarter_3,quarter_4,special_day,days_to_nearest_past,days_to_nearest_future
date,i64,i64,i64,i32,i8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,i8,i16,i16
2022-09-14,125490,169835,108957173922,2022,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,12,109
2022-09-13,107326,146494,97902777699,2022,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,11,110
2022-09-12,93820,130855,95690875998,2022,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,10,111
2022-09-11,64274,87798,58820280634,2022,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,9,112
2022-09-10,66959,90096,69852043855,2022,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,8,113
2022-09-09,72821,97114,79781916805,2022,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,7,114
2022-09-08,54406,71847,55782383425,2022,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,6,115
2022-09-07,59809,75899,56484560972,2022,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,5,116
2022-09-06,54644,69028,53346288660,2022,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,4,117
2022-09-05,45267,56439,52145550109,2022,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,3,118
