# Load dataset

In [1]:
import pandas as pd

# Read the cleaned export, discard the leftover positional index column,
# index the rows by calendar day, and shorten the per-channel cost column names.
df = (
    pd.read_csv('../../raw_data/df_clean.csv')
    .drop(columns='Unnamed: 0')
    .assign(Day=lambda frame: pd.to_datetime(frame['Day']))
    .set_index('Day')
    .rename(columns={"fb_costs": "facebook", "google_costs": "google", "tt_costs": "tiktok"})
)

# Create a simple dataframe

In [2]:
# Remove the per-channel impression and click counts so only the cost
# columns and the two outcome columns remain.
simple_df = df.drop(
    [
        'fb_impressions', 'fb_clicks',
        'google_impressions', 'google_clicks',
        'tt_impressions', 'tt_clicks',
    ],
    axis=1,
)

# Features: every remaining column except the outcomes (facebook, google, tiktok).
X_simple = simple_df.drop(['orders', 'total_sales'], axis=1)
# Target: total_sales only; orders is left out for now.
y_simple = simple_df.loc[:, 'total_sales']

# Split data

In [3]:
from sklearn.model_selection import TimeSeriesSplit

n_splits = 5  # Number of splits for cross-validation
tscv = TimeSeriesSplit(n_splits=n_splits)

# Only the last fold yielded by the splitter is kept; earlier folds are discarded.
train_index, test_index = list(tscv.split(simple_df))[-1]

# NOTE(review): X_train / X_test are row slices of simple_df, so they still
# contain the 'orders' and 'total_sales' columns alongside the channel costs.
X_train, X_test = simple_df.iloc[train_index], simple_df.iloc[test_index]
y_train, y_test = X_train['total_sales'], X_test['total_sales']

# Confirm chronological split (test data should have the most recent dates)

In [4]:
# Display the training target to inspect the date range covered by the train rows.
y_train

Day
2021-07-01     414.85
2021-07-02     229.95
2021-07-03     459.89
2021-07-04     508.36
2021-07-05     399.88
               ...   
2023-04-12    3718.12
2023-04-13    2913.63
2023-04-14    1735.46
2023-04-15    1320.67
2023-04-16    3647.40
Name: total_sales, Length: 655, dtype: float64

In [5]:
# Display the test target; its dates should all come after the training dates.
y_test

Day
2023-04-17     3121.57
2023-04-18     3851.15
2023-04-19     2858.61
2023-04-20     3623.44
2023-04-21     1804.11
                ...   
2023-08-20    13616.97
2023-08-21    15348.42
2023-08-22     6054.53
2023-08-23     4810.67
2023-08-24     4889.20
Name: total_sales, Length: 130, dtype: float64

# Feature Engineering

### Date Time Features
* Weekend or not
* Holiday or not

### Lag and Window Features
* Carryover effect
* Saturation


## Add holidays with `holidays`

In [6]:
from datetime import date
import holidays

### Create German holiday dataframe

In [7]:
# Collect (date, name) pairs for German holidays in 2021-2023 and turn them
# into a frame indexed by a datetime 'date' column.
de_holiday_list = list(holidays.Germany(years=[2021,2022,2023]).items())
de_holidays_df = (
    pd.DataFrame(de_holiday_list, columns=["date", "holiday"])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

### Create Austrian holiday dataframe

In [8]:
# Collect (date, name) pairs for Austrian holidays in 2021-2023 and turn them
# into a frame indexed by a datetime 'date' column.
at_holiday_list = list(holidays.Austria(years=[2021,2022,2023]).items())
at_holidays_df = (
    pd.DataFrame(at_holiday_list, columns=["date", "holiday"])
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

### Add DE holidays

In [9]:
# Start from an independent copy of simple_df and flag German holidays:
# de_holiday is 1 when the row's date appears in the German holiday index, else 0.
# The flag comes from index membership alone, so no merge with the holiday
# frame is needed (the merged-in 'holiday' name column was dropped again anyway).
merged_df = simple_df.copy()
merged_df['de_holiday'] = merged_df.index.isin(de_holidays_df.index).astype(int)
merged_df.head() # new column with 1 for german holiday, 0 for no german holiday

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok,de_holiday
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-01,10,414.85,257.01,1.17,0.0,0
2021-07-02,6,229.95,250.3,0.84,0.0,0
2021-07-03,10,459.89,248.59,0.72,0.0,0
2021-07-04,17,508.36,258.05,0.07,0.0,0
2021-07-05,13,399.88,257.47,0.38,0.0,0


### Check a random date

In [10]:
# Spot-check the German holiday flag for one date (edit the date to check another day).
specific_date = pd.Timestamp('2023-05-10')
is_holiday_value = merged_df['de_holiday'].loc[specific_date]
is_holiday_value

0

### Add AT holidays

In [11]:
# Flag Austrian holidays: at_holiday is 1 when the row's date appears in the
# Austrian holiday index, else 0 (the same index-membership test used for de_holiday).
#
# This replaces a left merge followed by a chained `fillna(0, inplace=True)`:
#   * re-running the merge produced at_holiday_x / at_holiday_y suffix columns
#     and then failed when looking up 'at_holiday';
#   * the chained in-place fillna acts on a temporary column object, which
#     pandas warns about and which has no effect once Copy-on-Write is enabled.
# Assigning the column directly makes the cell safe to re-run.
at_holidays_df['at_holiday'] = 1  # helper column retained in case later cells read it
merged_df['at_holiday'] = merged_df.index.isin(at_holidays_df.index).astype(int)
merged_df.head() # new column with 1 for austrian holiday, 0 for no austrian holiday

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok,de_holiday,at_holiday
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-07-01,10,414.85,257.01,1.17,0.0,0,0
2021-07-02,6,229.95,250.3,0.84,0.0,0,0
2021-07-03,10,459.89,248.59,0.72,0.0,0,0
2021-07-04,17,508.36,258.05,0.07,0.0,0,0
2021-07-05,13,399.88,257.47,0.38,0.0,0,0


### Check a random date

In [12]:
# Spot-check the Austrian holiday flag for one date (edit the date to check another day).
specific_date = pd.Timestamp('2023-05-10')
is_holiday_value = merged_df['at_holiday'].loc[specific_date]
is_holiday_value

0

### Combine DE and AT holiday columns
* 1: holiday in Germany or Austria
* 0: no holiday in either country

In [13]:
# Collapse the two 0/1 country flags into a single 'holiday' flag that is 1
# when either country has a holiday, then drop the individual DE and AT columns.
merged_df = (
    merged_df
    .assign(holiday=lambda frame: frame[['at_holiday', 'de_holiday']].any(axis=1).astype(int))
    .drop(columns=['de_holiday', 'at_holiday'])
)
merged_df.head()

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok,holiday
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-07-01,10,414.85,257.01,1.17,0.0,0
2021-07-02,6,229.95,250.3,0.84,0.0,0
2021-07-03,10,459.89,248.59,0.72,0.0,0
2021-07-04,17,508.36,258.05,0.07,0.0,0
2021-07-05,13,399.88,257.47,0.38,0.0,0


### Check a random holiday

In [14]:
# Good Friday 2022 - noted as a holiday in Germany only; the combined flag should be 1.
specific_date = pd.Timestamp('2022-04-15')
is_holiday_value = merged_df['holiday'].loc[specific_date]
is_holiday_value

1

In [15]:
# Corpus Christi 2022 - noted as a holiday in Austria only; the combined flag should be 1.
specific_date = pd.Timestamp('2022-06-16')
is_holiday_value = merged_df['holiday'].loc[specific_date]
is_holiday_value

1

In [16]:
# An ordinary date with no holiday in either country; the combined flag should be 0.
specific_date = pd.Timestamp('2022-06-02')
is_holiday_value = merged_df['holiday'].loc[specific_date]
is_holiday_value

0

## New column: Friday or Saturday = 1

In [17]:
# pandas numbers weekdays Monday=0 ... Sunday=6, so 4 is Friday and 5 is Saturday.
weekday_values = merged_df.index.weekday
merged_df['fri_sat'] = weekday_values.isin([4, 5]).astype(int)
merged_df.head()

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok,holiday,fri_sat
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-07-01,10,414.85,257.01,1.17,0.0,0,0
2021-07-02,6,229.95,250.3,0.84,0.0,0,1
2021-07-03,10,459.89,248.59,0.72,0.0,0,1
2021-07-04,17,508.36,258.05,0.07,0.0,0,0
2021-07-05,13,399.88,257.47,0.38,0.0,0,0
