# Load Dataset

In [1]:
import pandas as pd
import numpy as np

from mamimo.time_utils import add_time_features, PowerTrend
from mamimo.carryover import ExponentialCarryover
from mamimo.saturation import ExponentialSaturation

from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv('../../raw_data/df_clean.csv')
df = df.drop(columns='Unnamed: 0')
df['Day'] = pd.to_datetime(df['Day'])
df.set_index('Day', inplace=True)
df = df.rename(columns={"fb_costs": "facebook", "google_costs": "google", "tt_costs": "tiktok"})

# Create a simple dataframe

In [3]:
simple_df = df.drop(columns = ['fb_impressions', 'fb_clicks', 'google_impressions', 'google_clicks', 'tt_impressions', 'tt_clicks'])

#X_simple = simple_df.drop(columns = ['orders', 'total_sales']) # 3 channels: facebook, google, tiktok
#y_simple = simple_df['total_sales'] # for now lets just use total_sales and exclude orders

# Split data

In [4]:
n_splits = 5  # Number of splits for cross-validation
tscv = TimeSeriesSplit(n_splits=n_splits)

for train_index, test_index in tscv.split(simple_df):
    train_df = simple_df.iloc[train_index]
    test_df = simple_df.iloc[test_index]

y_train = train_df['total_sales']
X_train = train_df.drop(columns = ['total_sales', 'orders'])


y_test = test_df['total_sales']
X_test = test_df.drop(columns = ['total_sales', 'orders'])


# Baseline Model using Naive Forecast
* Naive forecast shifts the time series by one time step (aka one day); we make the naive forecast predict the previous value for each day
* MSE: Focuses on the average magnitude of prediction errors, which is important for understanding the absolute accuracy of your forecasts
  * measure of how well model captures actual sales values and prediction accuracy
* R-squared: Measures the proportion of variance explained by the model, indicating how well your model captures the underlying patterns and variability in the target variable.

In [5]:
baseline = train_df.shift(1) # baseline['total_sales'] is equivalent to y_pred

y_pred = baseline['total_sales']
y_true = train_df['total_sales']

# MSE
mse = ((y_true - y_pred) ** 2).mean()

# Calculate TSS (Total Sum of Squares)
mean_y_true = np.mean(y_true)
tss = np.sum((y_true - mean_y_true)**2)

# Calculate RSS (Residual Sum of Squares)
rss = np.sum((y_true - y_pred)**2)

# Calculate R-squared (Coefficient of Determination)
r_squared = 1 - (rss / tss)

f"{mse} is our MSE to beat; {r_squared} is our R squared to beat!"

'595453.442743578 is our MSE to beat; 0.6398209085770498 is our R squared to beat!'

# Feature Engineering

In [6]:
from statsmodels.tsa.stattools import adfuller

adfuller(train_df.total_sales)[1] # close enough to 0.05, so our data is stationary enough

0.062198848540433505

## Let's create functions for our FunctionTransformer in order to create our pipeline

### Prereqs:
* data must be a Pandas DataFrame
* date column must be the index
* index must be DateTime format
* Columns must be clearly labeled: total_sales, facebook, google, tiktok

### Time Features

#### Add a holiday column

In [7]:
def add_holidays(df: pd.DataFrame):
    """creates a new column with row value = 1 if the day is a German or Austrian holiday and 0 if not"""
    de_holiday_list = []
    for holiday in holidays.Germany(years=[2021,2022,2023]).items():
        de_holiday_list.append(holiday)
    de_holidays_df = pd.DataFrame(de_holiday_list, columns=["date", "holiday"])
    de_holidays_df['date'] = pd.to_datetime(de_holidays_df['date'])
    de_holidays_df.set_index('date', inplace=True)
    
    at_holiday_list = []
    for holiday in holidays.Austria(years=[2021,2022,2023]).items():
        at_holiday_list.append(holiday)
    at_holidays_df = pd.DataFrame(at_holiday_list, columns=["date", "holiday"])
    at_holidays_df['date'] = pd.to_datetime(at_holidays_df['date'])
    at_holidays_df.set_index('date', inplace=True)

    # add DE holidays to df
    merged_df = df.merge(de_holidays_df, how='left', left_index=True, right_index=True)
    merged_df['de_holiday'] = merged_df.index.isin(de_holidays_df.index).astype(int)
    merged_df.drop(columns=['holiday'], inplace=True)
    
    # add AT holidays to df
    at_holidays_df['at_holiday'] = 1 # add a 1 column to austrian holidays dataframe to help us merge with DE holidays
    merged_df = merged_df.merge(at_holidays_df[['at_holiday']], how='left', left_index=True, right_index=True)
    merged_df.head() # creates two columns (at_holiday_x, at_holiday_y), we only need one
    merged_df['at_holiday'].fillna(0, inplace=True) # replace NaN (no holiday) with 0
    merged_df['at_holiday'] = merged_df['at_holiday'].astype(int) # convert 1 and 0 to integers
    
    # combine columns
    merged_df['holiday'] = (merged_df['at_holiday'] | merged_df['de_holiday']).astype(int)
    merged_df = merged_df.drop(columns = ['de_holiday', 'at_holiday']) # drop individual DE and AT rows
    
    return merged_df

#### Add a weekend column

In [8]:
def add_weekend(df: pd.DataFrame):
    """creates a new column with row value = 1 if the day is a Friday or Saturday and 0 if not"""
    weekday_values = df.index.weekday
    df['fri_sat'] = ((weekday_values == 4) | (weekday_values == 5)).astype(int)
    
    return df

#### Add a month column

In [9]:
def add_month(df: pd.DataFrame):
    df = (df
     .pipe(add_time_features, month=True)
     .assign(trend=range(len(df))) # adds a linear trend which is only counting up from 0 to len(df). these #s will change during preprocessing
    )
    return df

### Cumulative Features

#### Add a cumulative spending column
* We can look at weekly, monthly, and quarterly spending
* For each time period, we have two options: sum previous data, or don't

In [19]:
def add_cum_spending(df: pd.DataFrame, period = 'weekly', carryover = False):
    """Add a cumulative spending column. Select period (weekly, monthly, or quarterly).
    Indicate whether you would like to carry over the previous period in subsequent sums with sum_prev_data."""
    w_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('W-Mon')  # Resample to weekly, starting on Mondays
                .sum()
            )
    w_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('W-Mon')
                .sum()
                .cumsum()
            )
    
    m_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('M')  # Resample to monthly
                .sum()
            )
    m_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('M')
                .sum()
                .cumsum()
            )

    q_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('Q')  # Resample to quarterly
                .sum()
            )
    q_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('Q')
                .sum()
                .cumsum()
            )
    if carryover == False:
        if period == 'weekly':
            df = df.join(w_cumulative_spending, rsuffix='_Cumulative')
        elif period == 'monthly':
            df = df.join(m_cumulative_spending, rsuffix='_Cumulative')
        elif period == 'quarterly':
            df = df.join(q_cumulative_spending, rsuffix='_Cumulative')
        else:
            raise ValueError('Invalid period option. Choose "weekly", "monthly", or "quarterly".')            

    elif carryover == True:
        if period == 'weekly':
            df = df.join(w_cumulative_spending_carryover, rsuffix='_Cumulative')
        elif period == 'monthly':
            df = df.join(m_cumulative_spending_carryover, rsuffix='_Cumulative')
        elif period == 'quarterly':
            df = df.join(q_cumulative_spending_carryover, rsuffix='_Cumulative')
        else:
            raise ValueError('Invalid period option. Choose "weekly", "monthly", or "quarterly".')
            
    else:
        raise ValueError('Invalid carryover option. Choose True or False.')
        
    return df

In [30]:
copy = train_df.copy()

In [31]:
(
copy[['google', 'tiktok', 'facebook']]
.resample('M')
.sum()
.cumsum()
)

Unnamed: 0_level_0,google,tiktok,facebook
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-31,57.15,0.0,18239.87
2021-08-31,128.25,0.0,52249.39
2021-09-30,195.92,0.0,62670.2
2021-10-31,243.06,0.0,95164.24
2021-11-30,277.35,0.0,109291.67
2021-12-31,287.34,0.0,109582.27
2022-01-31,424.89,0.0,131868.3
2022-02-28,683.64,0.0,151541.89
2022-03-31,1531.3,0.0,166579.93
2022-04-30,3294.32,0.0,175550.99


In [24]:
train_df

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-07-01,10,414.85,257.01,1.17,0.0
2021-07-02,6,229.95,250.30,0.84,0.0
2021-07-03,10,459.89,248.59,0.72,0.0
2021-07-04,17,508.36,258.05,0.07,0.0
2021-07-05,13,399.88,257.47,0.38,0.0
...,...,...,...,...,...
2023-04-12,83,3718.12,885.33,208.52,0.0
2023-04-13,75,2913.63,882.17,186.47,0.0
2023-04-14,53,1735.46,662.08,160.69,0.0
2023-04-15,41,1320.67,481.71,149.16,0.0


In [29]:
add_cum_spending(copy, period = 'weekly').head(32)

Unnamed: 0_level_0,orders,total_sales,facebook,google,tiktok,google_Cumulative,tiktok_Cumulative,facebook_Cumulative
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-01,10,414.85,257.01,1.17,0.0,,,
2021-07-02,6,229.95,250.3,0.84,0.0,,,
2021-07-03,10,459.89,248.59,0.72,0.0,,,
2021-07-04,17,508.36,258.05,0.07,0.0,,,
2021-07-05,13,399.88,257.47,0.38,0.0,3.18,0.0,1271.42
2021-07-06,13,492.87,256.4,3.48,0.0,,,
2021-07-07,8,344.92,255.83,0.9,0.0,,,
2021-07-08,15,685.33,256.67,1.18,0.0,,,
2021-07-09,15,699.27,253.04,0.34,0.0,,,
2021-07-10,8,349.92,247.54,0.49,0.0,,,


### Lag Features

#### Sales lag

In [13]:
def add_sales_lag(df: pd.DataFrame, period = 7):
    """Adds a sales lag column. Indicate the period of lag (in days - default is weekly/7 days)."""
    df['sales_lag'] = df['total_sales'].shift(period)
    df['sales_lag'] = df['sales_lag'].bfill()  # Backfill missing values
    return df

### Moving Average Features

#### Sales moving average

In [14]:
def add_sales_moving_avg(df: pd.DataFrame, window_size = 7):
    """Adds a sales moving average column. Indicate the window size (default = 7)"""
    df['sales_moving_avg'] = df['total_sales'].rolling(window=window_size, center=True).mean()
    df['sales_moving_avg'] = df['sales_moving_avg'].bfill()  # Backfill missing values at beginning of df
    df['sales_moving_avg'] = df['sales_moving_avg'].ffill()  # Forwardfill missing values at end of df
    return df

### Exponential Smoothing

#### Sales exponential smoothing

In [15]:
def add_sales_exp_smoothing(df: pd.DataFrame, alpha = 0.5):
    """Adds a sales (exponential smoothing) column. Indicate the alpha value (default = 0.5)"""
    smoothed_series = [df['total_sales'].iloc[0]]
    for i in range(1, len(df)):
        smoothed_value = alpha * df['total_sales'].iloc[i] + (1 - alpha) * smoothed_series[-1]
        smoothed_series.append(smoothed_value)
    df['smoothed_sales'] = smoothed_series
    return df

## Transform Features

#### Transform features to [advertising adstock](https://en.wikipedia.org/wiki/Advertising_adstock) features
* Advertising adstock: lagged effect of advertising on consumer purchase behavior.

In [16]:
adstock = ColumnTransformer(
    [
     ('facebook_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['facebook']),
     ('google_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['google']),
     ('tiktok_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['tiktok'])
])

In [17]:
# check taxifare notebook and add time series feature
# do i keep other time features after this?
# check how this affects modelling

## Pipeline Prep

### Create a 1. FunctionTransformer for stateless transformations and 2. FunctionTransformer + GridSearchCV for stateful transformation

In [18]:
holidays = FunctionTransformer(add_holidays)
weekends = FunctionTransformer(add_weekends)
months = FunctionTransformer(add_months)

###

cum_spending = FunctionTransformer(add_cum_spending, kw_args = {'period': 'weekly', 'carryover': False})

cum_param_grid = dict(
    cum_spending__kw_args = [
        {'period': 'weekly', 'carryover': False},
        {'period': 'monthly', 'carryover': False},
        {'period': 'quarterly', 'carryover': False},
        {'period': 'weekly', 'carryover': True},
        {'period': 'monthly', 'carryover': True},
        {'period': 'quarterly', 'carryover': True},
    ]
)

###

sales_lag = FunctionTransformer(add_sales_lag, kw_args = {'period': 7})

lag_param_grid = dict(
    sales_lag__kw_args = [
        {'period': [1]},
        {'period': [7]},
        {'period': [15]},
        {'period': [30]}
    ]
)

###

sales_moving_avg = FunctionTransformer(add_sales_moving_avg, kw_args = {'window_size': 7})

mov_avg_param_grid = dict(
    sales_moving_avg__kw_args = [
        {'window_size': [7]},
        {'window_size': [15]},
        {'window_size': [30]}
    ]
)

###

sales_exp_smoothing = FunctionTransformer(add_sales_exp_smoothing, kw_args = {'alpha': 0.5})

smoothing_param_grid = dict(
    sales_exp_smoothing__kw_args = [
        {'alpha': [0.2]},
        {'alpha': [0.5]},
        {'alpha': [0.7]}
    ]
)

NameError: name 'add_weekends' is not defined

## Pipeline

In [None]:
model = Pipeline([
    ('adstock', adstock),
    ('regression', LinearRegression(positive=True))
])