In [11]:
import pandas as pd
import datetime

from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import MinMaxScaler

import pickle

In [2]:
# Load the raw daily data and reshape it in a single chain:
# parse 'Day' as datetimes, use it as the index, shorten the cost column
# names to the channel names, and drop the impression columns.
df = (
    pd.read_csv('../../raw_data/df.csv')
    .drop(columns='Unnamed: 0')  # presumably a stray index column from an earlier export
    .assign(Day=lambda frame: pd.to_datetime(frame['Day']))
    .set_index('Day')
    .rename(columns={"fb_costs": "facebook", "google_costs": "google", "tt_costs": "tiktok"})
    # Mirror the datetime index as a regular column so the column transformer
    # defined later can select it by name.
    .assign(day=lambda frame: frame.index)
    .drop(columns=['fb_impressions', 'google_impressions', 'tt_impressions'])
)

In [3]:
# Add a 'total_clicks' column: the per-day sum of the Facebook, TikTok and
# Google click counts.
df = df.assign(total_clicks=df['fb_clicks'] + df['tt_clicks'] + df['google_clicks'])
df

Unnamed: 0_level_0,orders,total_sales,facebook,fb_clicks,google,google_clicks,tiktok,tt_clicks,day,total_clicks
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-07-01,10,414.85,257.01,129.0,1.17,20.0,0.00,0.0,2021-07-01,149.0
2021-07-02,6,229.95,250.30,150.0,0.84,7.0,0.00,0.0,2021-07-02,157.0
2021-07-03,10,459.89,248.59,129.0,0.72,6.0,0.00,0.0,2021-07-03,135.0
2021-07-04,17,508.36,258.05,119.0,0.07,5.0,0.00,0.0,2021-07-04,124.0
2021-07-05,13,399.88,257.47,142.0,0.38,16.0,0.00,0.0,2021-07-05,158.0
...,...,...,...,...,...,...,...,...,...,...
2023-08-20,294,13616.97,2148.53,1366.0,466.56,801.0,368.86,644.0,2023-08-20,2811.0
2023-08-21,356,15348.42,2210.01,1284.0,400.58,782.0,391.14,494.0,2023-08-21,2560.0
2023-08-22,150,6054.53,1335.84,839.0,289.46,619.0,218.61,398.0,2023-08-22,1856.0
2023-08-23,130,4810.67,1315.81,841.0,275.86,537.0,90.21,133.0,2023-08-23,1511.0


In [4]:
n_splits = 5  # Number of splits for cross-validation
tscv = TimeSeriesSplit(n_splits=n_splits)

# Only the final fold produced by the splitter is used below: its train
# indices become the training set and its test indices the hold-out set.
# Earlier folds are generated but not kept.
train_index, test_index = list(tscv.split(df))[-1]
train_df, test_df = df.iloc[train_index], df.iloc[test_index]

# Columns removed from the feature matrices: the target itself, the
# per-channel click counts it is summed from, and the sales/order columns.
non_feature_cols = ['total_sales', 'orders', 'total_clicks', 'fb_clicks', 'google_clicks', 'tt_clicks']

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['total_clicks']

X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df['total_clicks']

In [5]:
class AddWeekendsTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that flags Fridays and Saturdays using the frame's index."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return a one-column frame ``fri_sat`` aligned with ``X``'s index.

        A row is 1 when its index date falls on a Friday or Saturday and 0
        otherwise. The weekday is read from ``X.index`` (not from any column),
        so the index must expose ``.weekday``. ``X`` itself is not modified.
        """
        # pandas numbers weekdays from Monday == 0, so 4 and 5 are Friday and Saturday.
        is_fri_or_sat = X.index.weekday.isin([4, 5])
        return pd.DataFrame({'fri_sat': is_fri_or_sat.astype(int)}, index=X.index)

In [6]:
# Calendar features: the 'day' selection is routed through a one-step pipeline
# that emits the Friday/Saturday flag.
time_features = ColumnTransformer(
    transformers=[
        (
            'weekends_pipe',
            Pipeline(steps=[('add_weekends', AddWeekendsTransformer())]),
            ['day'],
        ),
    ]
)

# Spend features: the three channel cost columns are rescaled with MinMaxScaler.
scaler = ColumnTransformer([
    ('minmax', MinMaxScaler(), ['facebook', 'google', 'tiktok']),
])

In [7]:
# Single-step pipeline wrapping the calendar-feature column transformer.
time_preprocessing = Pipeline(steps=[('time_features', time_features)])

# Concatenate the calendar features and the scaled spend columns side by side.
union = FeatureUnion(
    transformer_list=[
        ('time_preprocessing', time_preprocessing),
        ('scaler', scaler),
    ]
)

In [9]:
# Fit the combined preprocessing on the training features only.
# FeatureUnion.fit returns the fitted union itself, so `pipeline` and `union`
# refer to the same object afterwards.
pipeline = union.fit(X=X_train)

In [None]:
# Persist the fitted preprocessing pipeline under a timestamped file name.
# `datetime` is imported as a module in this notebook, hence datetime.datetime.
# The timestamp is formatted without spaces or colons so the resulting name is
# a valid file name on every operating system.
timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
with open(f'pipeline-{timestamp}.pkl', 'wb') as file:
    pickle.dump(pipeline, file)