In [1]:

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import sys
import time
from datetime import datetime
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
# get_ipython().run_line_magic('matplotlib', 'inline')

# Plot styling for the whole notebook.
# `sns.color_palette(...)` only *returns* a palette object; called as a bare
# statement its result is discarded and nothing changes. `sns.set_palette`
# is the call that installs the palette as the default colour cycle.
sns.set_palette("husl")
sns.set_style('darkgrid')

In [3]:
# Data
# Four years' (209 weeks) records of sales, media impression and media spending at weekly level.
# NOTE: machine-specific absolute path -- point DATA_PATH at your local copy of MMM_data.csv.
DATA_PATH = 'C:/Users/AlbertodeTorres/OneDrive - Nektiu S.L/UOC/TFG/data/MMM_data.csv'
df = pd.read_csv(DATA_PATH)

# Column groups are selected by name *prefix*. `startswith` is used instead of
# a substring test so that e.g. 'me_' cannot accidentally pick up a column
# that merely contains that fragment somewhere in its name.

# 1. media variables
# media impression
mdip_cols = [col for col in df.columns if col.startswith('mdip_')]
# media spending
mdsp_cols = [col for col in df.columns if col.startswith('mdsp_')]

# 2. control variables
# macro economics variables
me_cols = [col for col in df.columns if col.startswith('me_')]
# store count variables
st_cols = ['st_ct']
# markdown/discount variables
mrkdn_cols = [col for col in df.columns if col.startswith('mrkdn_')]
# holiday variables
hldy_cols = [col for col in df.columns if col.startswith('hldy_')]
# seasonality variables
seas_cols = [col for col in df.columns if col.startswith('seas_')]
base_vars = me_cols + st_cols + mrkdn_cols + hldy_cols + seas_cols

# 3. sales variables
sales_cols = ['sales']

# Preview: week start date, media impressions and sales for the first rows.
df[['wk_strt_dt'] + mdip_cols + sales_cols].head()

# EDA - correlation, distribution plots
#plt.figure(figsize=(24,20))
#sns.heatmap(df[mdip_cols+['sales']].corr(), square=True, annot=True, vmax=1, vmin=-1, cmap='RdBu')

#plt.figure(figsize=(50,50))
#sns.pairplot(df[mdip_cols+['sales']], vars=mdip_cols+['sales'])


Unnamed: 0,wk_strt_dt,mdip_dm,mdip_inst,mdip_nsp,mdip_auddig,mdip_audtr,mdip_vidtr,mdip_viddig,mdip_so,mdip_on,mdip_em,mdip_sms,mdip_aff,mdip_sem,sales
0,2014-08-03,4863885,29087520,2421933,692315,37778097,10038746,2111112,0,3271007,1514755,27281,197828,83054,72051457.64
1,2014-08-10,20887502,8345120,3984494,475810,12063657,9847977,587184,0,4260715,2234569,27531,123688,83124,78794770.54
2,2014-08-17,11097724,17276800,1846832,784732,5770115,7235336,1015658,0,4405992,1616990,55267,186781,79768,70071185.56
3,2014-08-24,1023446,18468480,2394834,1032301,12174000,8625122,2149160,0,6638320,1897998,32470,122389,138936,68642464.59
4,2014-08-31,21109811,26659920,3312008,400456,31656134,19785657,2408661,0,4347752,2569158,55878,209969,87531,86190784.65


In [5]:
# 1.1 Adstock
def apply_adstock(x, L, P, D):
    '''
    Smooth a media series with a normalised weighted moving window.

    The weight given to the observation `lag` periods in the past is
    D**((lag-P)**2); each output value is the weighted sum over the last
    L periods divided by the sum of the weights.

    params:
    x: original media variable, array
    L: length, number of periods in the window (current period included)
    P: peak, the lag at which the weight is largest (delay in effect)
    D: decay, retain rate
    returns:
    array with one value per element of x, adstocked media variable
    '''
    # Left-pad with zeros so the first observations also get a full window.
    padded = np.append(np.zeros(L-1), x)

    # One weight per lag, reversed so the array lines up with a
    # chronological window (last entry multiplies the current period).
    lag_weights = np.array([D**((lag-P)**2) for lag in range(L)], dtype=float)[::-1]
    weight_total = sum(lag_weights)

    smoothed = [
        sum(padded[end-L:end] * lag_weights) / weight_total
        for end in range(L, len(padded)+1)
    ]
    return np.array(smoothed)

def adstock_transform(df, md_cols, adstock_params):
    '''
    Apply `apply_adstock` to every listed media column.

    params:
    df: original data
    md_cols: list, media variables to be transformed
    adstock_params: dict keyed by the part of the column name after its
        last underscore,
        e.g., {'sem': {'L': 8, 'P': 0, 'D': 0.1}, 'dm': {'L': 4, 'P': 1, 'D': 0.7}}
    returns:
    new df holding only the adstocked columns (same column names as md_cols)
    '''
    adstocked = pd.DataFrame()
    for column in md_cols:
        # 'mdip_dm' -> 'dm': parameters are looked up by channel suffix.
        channel = column.rsplit('_', 1)[-1]
        length, peak, decay = (adstock_params[channel][key] for key in ('L', 'P', 'D'))
        adstocked[column] = apply_adstock(df[column].values, length, peak, decay)
    return adstocked


# # plot adstock with varying decay
# fig, ax = plt.subplots(figsize=(15,5))
# psets = [
#     [8, 1, 0.1],
#     [8, 1, 0.9]
# ]
# xm = df['mdip_vidtr'].values
# sns.lineplot(x=range(52), y=xm[-52:], ax=ax, label='original')
# for i in range(len(psets)):
#     p = psets[i]
#     L, P, D = p[0], p[1], p[2]
#     xm_adstocked = apply_adstock(xm, L, P, D)
#     sns.lineplot(x=range(52), y=xm_adstocked[-52:], ax=ax,
#                  label='L=%d, P=%d, D=%.1f'%(L, P, D))
#     ax.lines[i+1].set_linestyle("--")
# ax.set_title('Adstock Parameter: Decay', fontsize=16)

# # plot adstock with varying length
# fig, ax = plt.subplots(figsize=(15,5))
# psets = [
#     [4, 1, 0.9],
#     [12, 1, 0.9]
# ]
# xm = df['mdip_vidtr'].values
# sns.lineplot(x=range(52), y=xm[-52:], ax=ax, label='original')
# for i in range(len(psets)):
#     p = psets[i]
#     L, P, D = p[0], p[1], p[2]
#     xm_adstocked = apply_adstock(xm, L, P, D)
#     sns.lineplot(x=range(52), y=xm_adstocked[-52:], ax=ax,
#                  label='L=%d, P=%d, D=%.1f'%(L, P, D))
#     ax.lines[i+1].set_linestyle("--")
# ax.set_title('Adstock Parameter: Length', fontsize=16)


# 1.2 Diminishing Return
def hill_transform(x, ec, slope):
    '''
    Hill (diminishing-return) curve: 1 / (1 + (x / ec)**(-slope)).

    params:
    x: media variable, scalar or array-like (presumably non-negative -- confirm against callers)
    ec: value of x at which the curve returns 0.5
    slope: exponent controlling the shape of the curve
    returns:
    transformed value(s); with a positive slope, x == 0 maps to 0.0
    '''
    # Going through numpy ufuncs makes plain Python scalars behave like
    # arrays: a bare `0.0 ** -slope` would raise ZeroDivisionError.
    ratio = np.true_divide(x, ec)
    # 0 ** (-slope) is a legitimate division by zero here: numpy yields inf,
    # which turns into the correct limit 1 / (1 + inf) == 0. Silence only
    # that warning, and only for this operation.
    with np.errstate(divide='ignore'):
        inverse_term = np.power(ratio, -slope)
    return 1 / (1 + inverse_term)

# # plot hill function with varying K and S
# fig, ax = plt.subplots(figsize=(9,6))
# psets = [
#     [0.5, 0.5],
#     [0.5, 1.0],
#     [0.95, 1.0],
#     [0.95, 3.0]
# ]
# xm = np.arange(0,2,0.05)
# for i in range(len(psets)):
#     p = psets[i]
#     ec, slope = p[0], p[1]
#     sns.lineplot(x=xm, y=hill_transform(xm, ec, slope), ax=ax,
#                  label='K=%.2f, S=%.1f'%(ec, slope))
#     #ax.lines[i+1].set_linestyle("--")
# ax.set_title('Hill Function', fontsize=16)


# 2. Model Implementation
# The model is built in a stacked way. Three models are trained:
# - Control Model
# - Marketing Mix Model
# - Diminishing Return Model

# 2.1 Control Model / Base Sales Model
import pystan
import os
#os.environ['CC'] = 'gcc-10'
#os.environ['CXX'] = 'g++-10'

# helper functions
from sklearn.metrics import mean_squared_error
def mean_absolute_percentage_error(y_true, y_pred):
    '''
    Mean absolute percentage error, expressed in percent.

    params:
    y_true: actual values, array-like
    y_pred: predicted values, array-like of the same shape
    returns:
    mean(|y_true - y_pred| / |y_true|) * 100
    '''
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_error = np.abs((actual - predicted) / actual)
    return np.mean(relative_error) * 100

def apply_mean_center(x):
    '''
    Scale x by its own mean.

    Note: despite the name, the mean is divided out, not subtracted.

    params:
    x: array
    returns:
    (x / mean(x), mean(x))
    '''
    mean_value = np.mean(x)
    return x / mean_value, mean_value

def mean_center_trandform(df, cols):
    '''
    Run `apply_mean_center` on each listed column.

    params:
    df: source data
    cols: list, columns to transform
    returns:
    mean-centered df (only the columns in cols)
    scaler, dict mapping column name -> mean returned by apply_mean_center
    '''
    centered = pd.DataFrame()
    scalers = {}
    for name in cols:
        values, mean_value = apply_mean_center(df[name].values)
        centered[name] = values
        scalers[name] = mean_value
    return centered, scalers

def mean_log1p_trandform(df, cols):
    '''
    Run `apply_mean_center` on each listed column, then take log(1 + value).

    params:
    df: source data
    cols: list, columns to transform
    returns:
    mean-centered, log1p transformed df (only the columns in cols)
    scaler, dict mapping column name -> mean returned by apply_mean_center
    '''
    transformed = pd.DataFrame()
    scalers = {}
    for name in cols:
        centered, mean_value = apply_mean_center(df[name].values)
        scalers[name] = mean_value
        transformed[name] = np.log1p(centered)
    return transformed, scalers

import json

def save_json(data, file_name):
    '''
    Write `data` to `file_name` as JSON (UTF-8).

    NumPy scalars and arrays appearing as *values* are converted to the
    equivalent Python numbers / lists, so they no longer make json.dump
    raise TypeError. Dictionary keys are not converted.

    params:
    data: JSON-serialisable object, may contain NumPy values
    file_name: path of the file to (over)write
    raises:
    TypeError for values that are neither JSON-native nor NumPy types
    '''
    def _to_builtin(obj):
        # Called by json only for objects it cannot serialise itself.
        if isinstance(obj, np.generic):
            return obj.item()
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        raise TypeError('Object of type %s is not JSON serializable' % type(obj).__name__)

    with open(file_name, 'w', encoding='utf-8') as fp:
        json.dump(data, fp, default=_to_builtin)

def load_json(file_name):
    '''
    Read a JSON document from `file_name`.

    The file is decoded as UTF-8 explicitly rather than with the platform
    default encoding, so the result does not depend on the machine's locale.

    params:
    file_name: path of the JSON file
    returns:
    the decoded Python object
    '''
    with open(file_name, 'r', encoding='utf-8') as fp:
        return json.load(fp)



ModuleNotFoundError: No module named 'pystan'