In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, TimeSeriesSplit, cross_validate

In [2]:
sales = pd.read_csv('../raw_data/sales_data.csv')
sales['shipping_country'].unique()

array(['Germany', 'Austria', nan, 'Netherlands', 'Italy'], dtype=object)

In [46]:
df = pd.read_csv('../raw_data/df.csv')
df = df.drop(columns='Unnamed: 0')
df.shape

(880, 9)

In [47]:
df.info()
df.nunique()
df.duplicated().sum()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Day                 880 non-null    object 
 1   orders              880 non-null    float64
 2   total_sales         880 non-null    float64
 3   fb_costs            880 non-null    float64
 4   fb_impressions      880 non-null    int64  
 5   fb_clicks           880 non-null    int64  
 6   google_costs        880 non-null    float64
 7   google_impressions  880 non-null    int64  
 8   google_clicks       880 non-null    int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 62.0+ KB


Day                   0
orders                0
total_sales           0
fb_costs              0
fb_impressions        0
fb_clicks             0
google_costs          0
google_impressions    0
google_clicks         0
dtype: int64

In [48]:
df.head()

Unnamed: 0,Day,orders,total_sales,fb_costs,fb_impressions,fb_clicks,google_costs,google_impressions,google_clicks
0,2021-07-01,10.0,414.85,257.01,36200,129,1.17,39,20
1,2021-07-02,6.0,229.95,250.3,33040,150,0.84,14,7
2,2021-07-03,10.0,459.89,248.59,33031,129,0.72,12,6
3,2021-07-04,17.0,508.36,258.05,37463,119,0.07,13,5
4,2021-07-05,13.0,399.88,257.47,35963,142,0.38,34,16


In [6]:
df['Day'] = pd.to_datetime(df['Day'])
df.set_index('Day', inplace=True)

In [7]:
df.head()

Unnamed: 0_level_0,orders,total_sales,fb_costs,fb_impressions,fb_clicks,google_costs,google_impressions,google_clicks
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-01,10.0,414.85,257.01,36200,129,1.17,39,20
2021-07-02,6.0,229.95,250.3,33040,150,0.84,14,7
2021-07-03,10.0,459.89,248.59,33031,129,0.72,12,6
2021-07-04,17.0,508.36,258.05,37463,119,0.07,13,5
2021-07-05,13.0,399.88,257.47,35963,142,0.38,34,16


In [8]:
isinstance(df.index, pd.DatetimeIndex)
type(df.index)

pandas.core.indexes.datetimes.DatetimeIndex

# All data

In [9]:
google_data = ['google_impressions', 'google_clicks', 'google_costs']
google_targets = ['google_clicks', 'google_impressions']

fb_data = ['fb_impressions', 'fb_clicks', 'fb_costs']
fb_targets = ['fb_impressions', 'fb_clicks']

targets = ['total_sales', 'orders']

In [10]:
X = df.drop(columns = ['total_sales', 'orders', 'fb_impressions', 'fb_clicks', 'google_impressions', 'google_clicks'])
y = df[targets]
y_sales = df['total_sales']
y_orders = df['orders']

In [11]:
df.head()

Unnamed: 0_level_0,orders,total_sales,fb_costs,fb_impressions,fb_clicks,google_costs,google_impressions,google_clicks
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-01,10.0,414.85,257.01,36200,129,1.17,39,20
2021-07-02,6.0,229.95,250.3,33040,150,0.84,14,7
2021-07-03,10.0,459.89,248.59,33031,129,0.72,12,6
2021-07-04,17.0,508.36,258.05,37463,119,0.07,13,5
2021-07-05,13.0,399.88,257.47,35963,142,0.38,34,16


In [12]:
X.head()

Unnamed: 0_level_0,fb_costs,google_costs
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-07-01,257.01,1.17
2021-07-02,250.3,0.84
2021-07-03,248.59,0.72
2021-07-04,258.05,0.07
2021-07-05,257.47,0.38


In [13]:
lr = LinearRegression()

In [14]:
cross_validate(lr, X, y_sales, cv=TimeSeriesSplit())

{'fit_time': array([0.00851107, 0.00193286, 0.00188994, 0.00199485, 0.00166988]),
 'score_time': array([0.00185394, 0.00107431, 0.00106025, 0.00106812, 0.00102401]),
 'test_score': array([-1.83372775,  0.4297462 ,  0.37534817, -0.09666166,  0.75541467])}

In [15]:
lr.fit(X, y_sales) # refit the model with the complete dataset 
f"Coefficients: {lr.coef_}"

'Coefficients: [1.42769386 9.72364977]'

In [16]:
f"Intercept: {lr.intercept_}"

'Intercept: -55.4211134245254'

In [17]:
df.head()

Unnamed: 0_level_0,orders,total_sales,fb_costs,fb_impressions,fb_clicks,google_costs,google_impressions,google_clicks
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-01,10.0,414.85,257.01,36200,129,1.17,39,20
2021-07-02,6.0,229.95,250.3,33040,150,0.84,14,7
2021-07-03,10.0,459.89,248.59,33031,129,0.72,12,6
2021-07-04,17.0,508.36,258.05,37463,119,0.07,13,5
2021-07-05,13.0,399.88,257.47,35963,142,0.38,34,16


# FB data

In [18]:
df_fb = df.drop(columns = google_data)
X_fb = df_fb.drop(columns = fb_targets + targets)
y_fb = df[fb_targets + targets]

# Google data

In [19]:
df_google = df.drop(columns = fb_data)
X_google = df_google.drop(columns = google_targets + targets)
y_google = df[google_targets + targets]

In [20]:
y_fb

Unnamed: 0_level_0,fb_impressions,fb_clicks,total_sales,orders
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-07-01,36200,129,414.85,10.0
2021-07-02,33040,150,229.95,6.0
2021-07-03,33031,129,459.89,10.0
2021-07-04,37463,119,508.36,17.0
2021-07-05,35963,142,399.88,13.0
...,...,...,...,...
2020-07-29,64296,460,0.00,0.0
2020-07-28,59499,386,0.00,0.0
2020-07-27,112787,770,0.00,0.0
2020-07-26,145720,1003,0.00,0.0


In [21]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted, check_array

class ExponentialSaturation(BaseEstimator, TransformerMixin):
    def __init__(self, a=1.):
        self.a = a
        
    def fit(self, X, y=None):
        X = check_array(X)
        self._check_n_features(X, reset=True) # from BaseEstimator
        return self
    def transform(self, X):
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False) # from BaseEstimator
        return 1 - np.exp(-self.a*X)

In [22]:
from scipy.signal import convolve2d

class ExponentialCarryover(BaseEstimator, TransformerMixin):
    def __init__(self, strength=0.5, length=1):
        self.strength = strength
        self.length = length
    def fit(self, X, y=None):
        X = check_array(X)
        self._check_n_features(X, reset=True)
        self.sliding_window_ = (
            self.strength ** np.arange(self.length + 1)
        ).reshape(-1, 1)
        return self
    def transform(self, X: np.ndarray):
        check_is_fitted(self)
        X = check_array(X)
        self._check_n_features(X, reset=False)
        convolution = convolve2d(X, self.sliding_window_)
        if self.length > 0:
            convolution = convolution[: -self.length]
        return convolution

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
adstock = ColumnTransformer(
    [
     ('fb_pipe', Pipeline([
                           ('carryover', ExponentialCarryover()),
                           ('saturation', ExponentialSaturation())
     ]), ['fb_costs']),
     ('gg_pipe', Pipeline([
                           ('carryover', ExponentialCarryover()),
                           ('saturation', ExponentialSaturation())
     ]), ['google_costs'])
    ],
    remainder='passthrough'
    )

model = Pipeline([
                  ('adstock', adstock),
                  ('regression', LinearRegression())
])

In [39]:
df.head()

Unnamed: 0_level_0,orders,total_sales,fb_costs,fb_impressions,fb_clicks,google_costs,google_impressions,google_clicks
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-07-01,10.0,414.85,257.01,36200,129,1.17,39,20
2021-07-02,6.0,229.95,250.3,33040,150,0.84,14,7
2021-07-03,10.0,459.89,248.59,33031,129,0.72,12,6
2021-07-04,17.0,508.36,258.05,37463,119,0.07,13,5
2021-07-05,13.0,399.88,257.47,35963,142,0.38,34,16


In [40]:
X = df.drop(columns = ['total_sales', 'orders', 'fb_impressions', 'fb_clicks', 'google_impressions', 'google_clicks'])
y = df[['total_sales']]

In [45]:
X.head()

Unnamed: 0_level_0,fb_costs,google_costs
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-07-01,257.01,1.17
2021-07-02,250.3,0.84
2021-07-03,248.59,0.72
2021-07-04,258.05,0.07
2021-07-05,257.47,0.38


In [43]:
model.fit(X, y)
cross_val_score(model, X, y, cv=TimeSeriesSplit())#.mean()

array([ 0.35660108, -0.0883035 , -0.21349722, -1.3135077 ,  0.0991357 ])

In [51]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
tuned_model = RandomizedSearchCV(
    estimator=model,
    param_distributions={
        'adstock__fb_pipe__carryover__strength': uniform(0, 1),
        'adstock__fb_pipe__carryover__length': randint(0, 6),
        'adstock__fb_pipe__saturation__a': uniform(0, 0.01),
        'adstock__gg_pipe__carryover__strength': uniform(0, 1),
        'adstock__gg_pipe__carryover__length': randint(0, 6),
        'adstock__gg_pipe__saturation__a': uniform(0, 0.01)
    },
    n_iter=100,
    cv=TimeSeriesSplit(),
    random_state=0
)

In [52]:
tuned_model.fit(X, y)
cross_val_score(tuned_model, X, y, cv=TimeSeriesSplit())#.mean()

array([-4.52379983,  0.27669101,  0.51670609, -0.23440471,  0.54721974])