# Load Dataset

In [21]:

import pandas as pd
import numpy as np

from mamimo.time_utils import add_time_features, PowerTrend
from mamimo.carryover import ExponentialCarryover
from mamimo.saturation import ExponentialSaturation

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, OneToOneFeatureMixin

from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion

import holidays

In [22]:
df = pd.read_csv('../../raw_data/df_clean.csv')
df = df.drop(columns='Unnamed: 0')
df['Day'] = pd.to_datetime(df['Day'])
df.set_index('Day', inplace=True)
df = df.rename(columns={"fb_costs": "facebook", "google_costs": "google", "tt_costs": "tiktok"})

# Create a simple dataframe

In [23]:
simple_df = df.drop(columns = ['fb_impressions', 'fb_clicks', 'google_impressions', 'google_clicks', 'tt_impressions', 'tt_clicks'])

#X_simple = simple_df.drop(columns = ['orders', 'total_sales']) # 3 channels: facebook, google, tiktok
#y_simple = simple_df['total_sales'] # for now lets just use total_sales and exclude orders

# Split data

In [24]:
n_splits = 5  # Number of splits for cross-validation
tscv = TimeSeriesSplit(n_splits=n_splits)

for train_index, test_index in tscv.split(simple_df):
    train_df = simple_df.iloc[train_index]
    test_df = simple_df.iloc[test_index]

y_train = train_df['total_sales']
X_train = train_df.drop(columns = ['total_sales', 'orders'])


y_test = test_df['total_sales']
X_test = test_df.drop(columns = ['total_sales', 'orders'])


# Baseline Model using Naive Forecast
* Naive forecast shifts the time series by one time step (aka one day); we make the naive forecast predict the previous value for each day
* MSE: Focuses on the average magnitude of prediction errors, which is important for understanding the absolute accuracy of your forecasts
  * measure of how well model captures actual sales values and prediction accuracy
* R-squared: Measures the proportion of variance explained by the model, indicating how well your model captures the underlying patterns and variability in the target variable.

In [25]:
baseline = train_df.shift(1) # baseline['total_sales'] is equivalent to y_pred

y_pred = baseline['total_sales']
y_true = train_df['total_sales']

# MSE
mse = ((y_true - y_pred) ** 2).mean()

# Calculate TSS (Total Sum of Squares)
mean_y_true = np.mean(y_true)
tss = np.sum((y_true - mean_y_true)**2)

# Calculate RSS (Residual Sum of Squares)
rss = np.sum((y_true - y_pred)**2)

# Calculate R-squared (Coefficient of Determination)
r_squared = 1 - (rss / tss)

f"{mse} is our MSE to beat; {r_squared} is our R squared to beat!"

'595453.442743578 is our MSE to beat; 0.6398209085770498 is our R squared to beat!'

# Feature Engineering

In [26]:
from statsmodels.tsa.stattools import adfuller

adfuller(train_df.total_sales)[1] # close enough to 0.05, so our data is stationary enough

0.062198848540433505

## Let's create functions for our FunctionTransformer in order to create our pipeline

### Prereqs:
* data must be a Pandas DataFrame
* date column must be the index
* index must be DateTime format
* Columns must be clearly labeled: total_sales, facebook, google, tiktok

### Time Features

#### Add a holiday column

In [27]:
def add_holidays(df: pd.DataFrame):
    """creates a new column with row value = 1 if the day is a German or Austrian holiday and 0 if not"""
    de_holiday_list = []
    for holiday in holidays.Germany(years=[2021,2022,2023]).items():
        de_holiday_list.append(holiday)
    de_holidays_df = pd.DataFrame(de_holiday_list, columns=["date", "holiday"])
    de_holidays_df['date'] = pd.to_datetime(de_holidays_df['date'])
    de_holidays_df.set_index('date', inplace=True)
    
    at_holiday_list = []
    for holiday in holidays.Austria(years=[2021,2022,2023]).items():
        at_holiday_list.append(holiday)
    at_holidays_df = pd.DataFrame(at_holiday_list, columns=["date", "holiday"])
    at_holidays_df['date'] = pd.to_datetime(at_holidays_df['date'])
    at_holidays_df.set_index('date', inplace=True)

    # add DE holidays to df
    merged_df = df.merge(de_holidays_df, how='left', left_index=True, right_index=True)
    merged_df['de_holiday'] = merged_df.index.isin(de_holidays_df.index).astype(int)
    merged_df.drop(columns=['holiday'], inplace=True)
    
    # add AT holidays to df
    at_holidays_df['at_holiday'] = 1 # add a 1 column to austrian holidays dataframe to help us merge with DE holidays
    merged_df = merged_df.merge(at_holidays_df[['at_holiday']], how='left', left_index=True, right_index=True)
    merged_df.head() # creates two columns (at_holiday_x, at_holiday_y), we only need one
    merged_df['at_holiday'].fillna(0, inplace=True) # replace NaN (no holiday) with 0
    merged_df['at_holiday'] = merged_df['at_holiday'].astype(int) # convert 1 and 0 to integers
    
    # combine columns
    merged_df['holiday'] = (merged_df['at_holiday'] | merged_df['de_holiday']).astype(int)
    merged_df = merged_df.drop(columns = ['de_holiday', 'at_holiday']) # drop individual DE and AT rows
    
    return merged_df

#### Add a weekend column

In [28]:
def add_weekends(df: pd.DataFrame):
    """creates a new column with row value = 1 if the day is a Friday or Saturday and 0 if not"""
    weekday_values = df.index.weekday
    df['fri_sat'] = ((weekday_values == 4) | (weekday_values == 5)).astype(int)
    
    return df

#### Add a cyclical month column

In [29]:
def add_cyclical_months(df: pd.DataFrame):
    df = (df
     .pipe(add_time_features, month=True)
     .assign(trend=range(len(df))) # adds a linear trend which is only counting up from 0 to len(df). these #s will change during preprocessing
    )
    
    months_in_a_year = 12

    df['sin_MonthYear'] = np.sin(2*np.pi*(df['month'])/months_in_a_year)
    df['cos_MonthYear'] = np.cos(2*np.pi*(df['month'])/months_in_a_year)
    df.drop(columns=['month'], inplace=True)
    
    return df

### Cumulative Features

#### Add a cumulative spending column
* We can look at weekly, monthly, and quarterly spending
* For each time period, we have two options: sum previous data, or don't

In [30]:
def add_cum_spending(df: pd.DataFrame, period = 'weekly', carryover = False):
    """Add a cumulative spending column. Select period (weekly, monthly, or quarterly).
    Indicate whether you would like to carry over the previous period in subsequent sums with sum_prev_data."""
    w_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('W-Mon')  # Resample to weekly, starting on Mondays
                .sum()
            )
    w_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('W-Mon')
                .sum()
                .cumsum()
            )
    m_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('M')  # Resample to monthly
                .sum()
            )
    m_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('M')
                .sum()
                .cumsum()
            )

    q_cumulative_spending = (
                df[['google', 'tiktok', 'facebook']]
                .resample('Q')  # Resample to quarterly
                .sum()
            )
    q_cumulative_spending_carryover = (
                df[['google', 'tiktok', 'facebook']]
                .resample('Q')
                .sum()
                .cumsum()
            )

    week_values = {
        1: 6, # tuesday
        2: 5, # wednesday
        3: 4, # thursday
        4: 3, # friday
        5: 2, # saturday
        6: 1 # sunday
    }
    
    first_date = df.index[0]
    day_of_week = first_date.dayofweek
    cum_channels = ['facebook_cumulative','google_cumulative','tiktok_cumulative']
    cost_channels = ['facebook', 'google', 'tiktok']
    
    if carryover == False:
        if period == 'weekly':
            cumulative_spending_daily = w_cumulative_spending.reindex(df.index, method='ffill')                   
            # Merge the daily cumulative values back to the original DataFrame:
            df = df.join(cumulative_spending_daily, rsuffix='_cumulative')
            if day_of_week in week_values:
                idx = (week_values[day_of_week])
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'facebook'])
                df['facebook_cumulative'][0:idx] = selection['facebook'].sum()
                
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'google'])
                df['google_cumulative'][0:idx] = selection['google'].sum()
                
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'tiktok'])
                df['tiktok_cumulative'][0:idx] = selection['tiktok'].sum()
        elif period == 'monthly':
            cumulative_spending_monthly = m_cumulative_spending.reindex(df.index, method='bfill')
            df = df.join(cumulative_spending_monthly, rsuffix='_cumulative')
        elif period == 'quarterly':
            cumulative_spending_quarterly = q_cumulative_spending.reindex(df.index, method='bfill')
            df = df.join(cumulative_spending_quarterly, rsuffix='_cumulative')
        else:
            raise ValueError('Invalid period option. Choose "weekly", "monthly", or "quarterly".')            

    elif carryover == True:
        if period == 'weekly':
            cumulative_spending_filled = w_cumulative_spending_carryover.reindex(df.index, method='ffill')
            cumulative_spending_filled.columns = [f'{col}_cum_carryover' for col in cumulative_spending_filled.columns]
            df = df.join(cumulative_spending_filled)
            if day_of_week in week_values:
                idx = (week_values[day_of_week])
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'facebook'])
                df['facebook_cum_carryover'][0:idx] = selection['facebook'].sum()
                
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'google'])
                df['google_cum_carryover'][0:idx] = selection['google'].sum()
                
                selection = pd.DataFrame(df.loc[df.index[0:idx], 'tiktok'])
                df['tiktok_cum_carryover'][0:idx] = selection['tiktok'].sum()
        elif period == 'monthly':
            cumulative_spending_filled = m_cumulative_spending_carryover.reindex(df.index, method='bfill')
            cumulative_spending_filled.columns = [f'{col}_cum_carryover' for col in cumulative_spending_filled.columns]
            df = df.join(cumulative_spending_filled)
        elif period == 'quarterly':
            cumulative_spending_filled = q_cumulative_spending_carryover.reindex(df.index, method='bfill')
            cumulative_spending_filled.columns = [f'{col}_cum_carryover' for col in cumulative_spending_filled.columns]
            df = df.join(cumulative_spending_filled)
        else:
            raise ValueError('Invalid period option. Choose "weekly", "monthly", or "quarterly".')
            
    else:
        raise ValueError('Invalid carryover option. Choose True or False.')
        

    return df

### Lag Features

#### Sales lag

In [31]:
def add_sales_lag(df: pd.DataFrame, period = 7):
    """Adds a sales lag column. Indicate the period of lag (in days - default is weekly/7 days)."""
    df['sales_lag'] = df['total_sales'].shift(period)
    df['sales_lag'] = df['sales_lag'].bfill()  # Backfill missing values
    return df

### Moving Average Features

#### Sales moving average

In [32]:
def add_sales_moving_avg(df: pd.DataFrame, window_size = 7):
    """Adds a sales moving average column. Indicate the window size (default = 7)"""
    df['sales_moving_avg'] = df['total_sales'].rolling(window=window_size, center=True).mean()
    df['sales_moving_avg'] = df['sales_moving_avg'].bfill()  # Backfill missing values at beginning of df
    df['sales_moving_avg'] = df['sales_moving_avg'].ffill()  # Forwardfill missing values at end of df
    return df

### Exponential Smoothing

#### Sales exponential smoothing

In [33]:
def add_sales_exp_smoothing(df: pd.DataFrame, alpha = 0.5):
    """Adds a sales (exponential smoothing) column. Indicate the alpha value (default = 0.5)"""
    smoothed_series = [df['total_sales'].iloc[0]]
    for i in range(1, len(df)):
        smoothed_value = alpha * df['total_sales'].iloc[i] + (1 - alpha) * smoothed_series[-1]
        smoothed_series.append(smoothed_value)
    df['smoothed_sales'] = smoothed_series
    return df

### Functions to aid my pipeline

In [34]:
def drop_column(df: pd.DataFrame):
    df = df.drop(columns = 'total_sales')
    return df

In [35]:
copy = train_df.copy()
df = add_holidays(copy)
df = add_weekends(df)
df = add_cyclical_months(df)
df = add_cum_spending(df)
df = add_sales_lag(df)
df = add_sales_moving_avg(df)
df = add_sales_exp_smoothing(df)
df.columns

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['facebook_cumulative'][0:idx] = selection['facebook'].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['google_cumulative'][0:idx] = selection['google'].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tiktok_cumulative'][0:idx] = selection['tiktok'].sum()


Index(['orders', 'total_sales', 'facebook', 'google', 'tiktok', 'holiday',
       'fri_sat', 'trend', 'sin_MonthYear', 'cos_MonthYear',
       'google_cumulative', 'tiktok_cumulative', 'facebook_cumulative',
       'sales_lag', 'sales_moving_avg', 'smoothed_sales'],
      dtype='object')

In [36]:
def create_df(array: np.array):
    columns = ['orders', 'total_sales', 'facebook', 'google', 'tiktok', 'holiday',
       'fri_sat', 'trend', 'sin_MonthYear', 'cos_MonthYear',
       'google_cumulative', 'tiktok_cumulative', 'facebook_cumulative',
       'sales_lag', 'sales_moving_avg', 'smoothed_sales']
    columns_ = ['facebook','google','tiktok']
    df = pd.DataFrame(array, columns = columns_)
    datetime_series = pd.Series(
        pd.date_range("2021-07-01", periods=len(df), freq="D")
        )
    df = pd.concat([df,datetime_series],axis=1)
    df = df.set_index(0)
    df = df.rename(index={0: 'Day'})
    return df

In [37]:
def drop_col(df: pd.DataFrame):
    df = pd.DataFrame(df)
    df = df.drop(columns = 1)
    return df

## Transform Features

#### Transform features to [advertising adstock](https://en.wikipedia.org/wiki/Advertising_adstock) features
* Advertising adstock: lagged effect of advertising on consumer purchase behavior.

In [38]:
adstock = ColumnTransformer(
    [
     ('facebook_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['facebook']),
     ('google_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['google']),
     ('tiktok_pipe', Pipeline([
            ('carryover', ExponentialCarryover()),
            ('saturation', ExponentialSaturation())
     ]), ['tiktok'])
], passthrough)

## Pipeline Prep

### Create a 1. FunctionTransformer for stateless transformations and 2. FunctionTransformer + GridSearchCV for stateful transformation

In [39]:


holidays_ft = FunctionTransformer(add_holidays)
weekends = FunctionTransformer(add_weekends)
months = FunctionTransformer(add_cyclical_months)
drop_column_ft = FunctionTransformer(drop_column)
create_df_ft = FunctionTransformer(create_df)
drop_col_ft = FunctionTransformer(drop_col)

###

cum_spending = FunctionTransformer(add_cum_spending, kw_args = {'period': 'weekly', 'carryover': False})

cum_spending_param_grid = dict(
    union__preprocessing__cum_spending__kw_args = [
        {'period': 'weekly', 'carryover': False},
        {'period': 'monthly', 'carryover': False},
        {'period': 'quarterly', 'carryover': False},
        {'period': 'weekly', 'carryover': True},
        {'period': 'monthly', 'carryover': True},
        {'period': 'quarterly', 'carryover': True},
    ]
)

###

sales_lag = FunctionTransformer(add_sales_lag, kw_args = {'period': 7})

sales_lag_param_grid = dict(
    union__preprocessing__sales_lag__kw_args = [
        {'period': 1},
        {'period': 7},
        {'period': 15},
        {'period': 30}
    ]
)

###

sales_moving_avg = FunctionTransformer(add_sales_moving_avg, kw_args = {'window_size': 7})

sales_moving_avg_param_grid = dict(
    union__preprocessing__sales_moving_avg__kw_args = [
        {'window_size': 7},
        {'window_size': 15},
        {'window_size': 30}
    ]
)

###

sales_exp_smoothing = FunctionTransformer(add_sales_exp_smoothing, kw_args = {'alpha': 0.5})

sales_exp_smoothing_param_grid = dict(
    union__preprocessing__sales_exp_smoothing__kw_args = [
        {'alpha': 0.2},
        {'alpha': 0.5},
        {'alpha': 0.7}
    ]
)



### Pipeline

### Moment of Truth - Grid Search!

In [58]:
model = LinearRegression()

# add dummy target column to user-given DF  so it can be dropped in the pipeline

adstock_pipeline = Pipeline([
    ('adstock', adstock),
    ('create_df_ft', create_df_ft)
])

preprocessing = Pipeline([
    ('holidays_ft', holidays_ft),
    ('weekends', weekends),
    ('months', months),
    ('cum_spending', cum_spending),
    ('sales_lag', sales_lag),
    ('sales_moving_avg', sales_moving_avg),
    ('sales_exp_smoothing', sales_exp_smoothing)
])

union = FeatureUnion([
    #('preprocessing', preprocessing),
    ('adstock_pipeline', adstock_pipeline)
])

pipeline = Pipeline([
    ('union', union),
    ('drop_col_ft', drop_col_ft),
    ('model', model)
])

grid = {}
combined_param_grid = [
    {
        **cum_spending_param_grid,
        **sales_lag_param_grid,
        **sales_moving_avg_param_grid,
        **sales_exp_smoothing_param_grid
    }
]

grid_search = GridSearchCV(pipeline, combined_param_grid, cv=tscv, scoring='neg_mean_squared_error')
grid_search.get_params().keys()
grid_search.fit(train_df, y_train)

In [59]:
grid_search.best_params_

{}

In [60]:
grid_search.best_estimator_.score(test_df, y_test)

-1.1087291294489234

## Check adstock

In [42]:
adstock.fit_transform(train_df)

array([[1.        , 0.68963306, 0.        ],
       [1.        , 0.56828948, 0.        ],
       [1.        , 0.51324774, 0.        ],
       ...,
       [1.        , 1.        , 0.        ],
       [1.        , 1.        , 0.        ],
       [1.        , 1.        , 0.        ]])

## Check adstock pipeline

In [43]:
mydf = adstock_pipeline.fit_transform(train_df)

## Check union pipeline

In [44]:
ar = union.fit_transform(train_df)
ar = pd.DataFrame(ar)
ar

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['facebook_cumulative'][0:idx] = selection['facebook'].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['google_cumulative'][0:idx] = selection['google'].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tiktok_cumulative'][0:idx] = selection['tiktok'].sum()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,10.0,414.85,257.01,1.17,0.0,0.0,0.0,0.0,-0.500000,-0.866025,2.80,0.0,1013.95,414.85,407.245714,414.850000,1.0,0.689633,0.0
1,6.0,229.95,250.30,0.84,0.0,0.0,1.0,1.0,-0.500000,-0.866025,2.80,0.0,1013.95,414.85,407.245714,322.400000,1.0,0.568289,0.0
2,10.0,459.89,248.59,0.72,0.0,0.0,1.0,2.0,-0.500000,-0.866025,2.80,0.0,1013.95,414.85,407.245714,391.145000,1.0,0.513248,0.0
3,17.0,508.36,258.05,0.07,0.0,0.0,0.0,3.0,-0.500000,-0.866025,2.80,0.0,1013.95,414.85,407.245714,449.752500,1.0,0.067606,0.0
4,13.0,399.88,257.47,0.38,0.0,0.0,0.0,4.0,-0.500000,-0.866025,3.18,0.0,1271.42,414.85,445.885714,424.816250,1.0,0.316139,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,83.0,3718.12,885.33,208.52,0.0,0.0,0.0,650.0,0.866025,-0.500000,2185.29,0.0,4793.34,3173.95,3026.895714,3845.153512,1.0,1.000000,0.0
651,75.0,2913.63,882.17,186.47,0.0,0.0,0.0,651.0,0.866025,-0.500000,2185.29,0.0,4793.34,1865.47,3237.397143,3379.391756,1.0,1.000000,0.0
652,53.0,1735.46,662.08,160.69,0.0,0.0,1.0,652.0,0.866025,-0.500000,2185.29,0.0,4793.34,2133.27,3237.397143,2557.425878,1.0,1.000000,0.0
653,41.0,1320.67,481.71,149.16,0.0,0.0,1.0,653.0,0.866025,-0.500000,2185.29,0.0,4793.34,1767.01,3237.397143,1939.047939,1.0,1.000000,0.0
