### Budget trends in RL data

The following code uses the `all_activity_normal.cs` dataset to create trends lines for average advertising budget for the 2009-2018 period. The dataset can be found at `G:\Market\for Matt & B2B\Data\ReachLocal\Platform Data\all_activities 2009-Nov. 2018.zip` The code can be modified to create these graphs based on a variety of categories, for example business categoty or advertiser subcategory. The logic is fairly straight forward and following the comments along should be easy. The code has been automated to some extend and will be updated in the future to a turn-key code.

*Author: Farshad Nasiri - B2B Data Science Team*

In [1]:
# Import necessay libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
#from datetime import datetime
#################################################################
## For including LateX interpreter
#################################################################
#from matplotlib import rc
#rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
#rc('text', usetex=True)

In [2]:
# Read the data
df=pd.read_csv('../data/all_activity_normal.csv')

In [3]:
# Quick look at the shape of the dataframe
#df.shape

In [4]:
# Quick look at the columns
#df.columns

In [5]:
# Summary of missing values
#df.isnull().sum()

In [6]:
# Convert the date columns to date-time format
df['Cycle_Started']=pd.to_datetime(df.Cycle_Started)
df['Cycle_Ended']=pd.to_datetime(df.Cycle_Ended)

In [7]:
# Remove the one row with campaign start date missing
# and reset the index
df=df[~df.Cycle_Started.isnull()]
df.reset_index(drop=True, inplace=True)

In [8]:
# Drop the columns that we won't be using for graph generation 
# in order to reduce memory load

df.drop(columns=['Unnamed: 0', 'Primary_BSC_ID', 'BusinessSpecialtyID', 'Advertiser_URL', 
                 'Country','Currency','BusinessSpecialty', 'CPC', 'CTR', 'CPL', 'CTL', 
                 'Paid_On'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [9]:
#temp=df.dropna(axis=0, subset=['BusinessCategory'])

#temp=temp[['BusinessCategory', 'campaign_budget','Cycle_Started','Cycle_Ended']]

#bc=df.BusinessCategory.unique()
#bc[-1]='Other'

In [10]:
def ma(array, window=1):  # Moving average
    return np.convolve(array[0], np.ones((1,window))[0]/window, mode='same').reshape(1,array.shape[1])

In [11]:
from sklearn.linear_model import LinearRegression
mdl=LinearRegression(n_jobs=4)

In [14]:
col='BusinessCategory'

df[col].fillna('Other', inplace=True)

temp=df[[col, 'campaign_budget','Cycle_Started','Cycle_Ended']]

names=df[col].unique()
row_size=len(names)

years=range(2009,2019)
col_names=[]
for year in years:
    for quarter in [1,2,3,4]:
        col_names.append(str(year)+'-Q'+str(quarter))


my_df=pd.DataFrame(data=np.zeros((row_size,4*len(years))), columns=col_names)

my_df.index=sorted(names.tolist())

for quarter in range(1,5):
    if quarter==1:
        start_str='-01-01'
        end_str='-03-31'
    if quarter==2:
        start_str='-04-01'
        end_str='-06-30'
    if quarter==3:
        start_str='-07-01'
        end_str='-09-30'
    if quarter==4:
        start_str='-10-01'
        end_str='-12-31'
    
    for year in years:
        start=pd.to_datetime(str(year)+start_str)
        end = pd.to_datetime(str(year)+end_str)

        temp2=temp[(temp.Cycle_Started >= start) & (temp.Cycle_Ended <= end)]
        temp2=temp2.groupby(by=[col]).agg({'campaign_budget':'mean'})
       # print(temp2.shape[0])
        temp2.sort_values(by=[col], inplace=True)
        
        my_col=str(year)+'-Q'+str(quarter)
        my_df.loc[temp2.index, my_col]=temp2.campaign_budget
#######################################################################################

#pd.set_option("display.max_columns",100)

#my_df

#from matplotlib import rc
# for i in range(0, len(my_df.index)):
#     location=i

#     plt.figure(figsize=(12,6))
#     plt.cla()
#     plt.plot(my_df.columns, my_df.iloc[location,:],'k-o')
#     plt.xticks(rotation='vertical');
#     plt.tick_params(direction='in', length=10, width=2.5, colors='k',
#                                grid_color='k', grid_alpha=1, labelsize=14)
#     plt.ylabel('Ave. Budget', fontsize=16)
#     plt.title(my_df.index[location], fontsize=16)
#     #ax = plt.gca()
    
#     save_to='../output/BC-2009-2018/'+my_df.index[location].replace(' ', '_').replace('/','')+".png"
#     plt.savefig(save_to, bbox_inches = 'tight')
#     plt.close()


In [None]:
#my_df

In [23]:
#for i in range(0, len(my_df.index)):

threshold = 0.001
ma_window=4

seasonal=[]
period=[]
coefficient=[]
for i in range(0, len(my_df.index)):
    location=i

    time_series=np.array(my_df.iloc[location,:]).reshape(1,-1)
    time_series=time_series-ma(time_series, window=ma_window) # Remove the trend

        # Next largest power of 2        
    nfft=1<<(time_series.shape[1]-1).bit_length()

    y=np.fft.fft(time_series, n=nfft)
    y=abs(y**2)
    y=(y-y.min())
    y=y/y.max()

    freq_ts=(np.linspace(0,nfft/2,int(nfft/2)) *1/float(nfft))
    freq_ts=1/freq_ts
    freq_ts=freq_ts[1:]
    freq_ts=freq_ts.reshape(-1,1)

    y=y[0][1:int(nfft/2)].reshape(1,-1).T

    if time_series.shape == (1,1):
        coeff=0.0
    else:
        mdl.fit(freq_ts, y)
        coeff=mdl.coef_
    
    coefficient.append(coeff[0][0])
    
    if (np.abs(coeff) > threshold):
        seasonal.append(1)
        period.append( (freq_ts[ np.argmax(y) ])[0] )

    else:
        seasonal.append(0)
        period.append(0)

In [27]:
np.array(period)

#period

array([ 3.44444444,  3.1       ,  3.44444444,  4.76923077,  6.2       ,
       12.4       , 12.4       ,  2.81818182, 12.4       ,  3.1       ,
        2.21428571,  0.        , 12.4       ,  4.13333333,  0.        ,
        3.44444444,  6.2       ,  4.76923077,  2.38461538, 12.4       ,
       12.4       ,  3.1       , 12.4       , 12.4       , 12.4       ,
        2.38461538,  2.        ,  3.64705882,  6.2       ,  3.1       ,
       12.4       ])

In [26]:
np.array(coefficient)

array([ 7.76910988e-03,  2.98716245e-03, -3.74605319e-03,  5.64487366e-03,
        7.69264717e-03,  3.52386051e-03,  7.30721202e-03, -6.98440359e-03,
        6.56270366e-03, -2.51816242e-03, -4.66374204e-03, -9.77980514e-05,
        7.58816223e-03,  1.12766900e-03, -6.01955354e-04, -1.77574761e-03,
        3.95486640e-03, -2.57428233e-03, -4.77024104e-03,  4.07884811e-03,
        5.65477760e-03,  2.53009366e-03,  5.27495546e-03,  5.32783575e-03,
        7.84285796e-03, -6.08822553e-03,  3.66528005e-03,  1.51994365e-03,
       -1.07636981e-03, -4.35303550e-03,  3.74462927e-03])

In [None]:
location=20

print('Seasonality detected:', seasonal[location])
print('Period detected:', period[location])

plt.figure(figsize=(12,6))
plt.cla()
plt.plot(my_df.columns, my_df.iloc[location,:],'k-o')
plt.xticks(rotation='vertical');
plt.tick_params(direction='in', length=10, width=2.5, colors='k',
                           grid_color='k', grid_alpha=1, labelsize=14)
plt.ylabel('Ave. Budget', fontsize=16)
plt.title(my_df.index[location], fontsize=16);

time_series=np.array(my_df.iloc[location,:]).reshape(1,-1)
mov_avg=ma(time_series, window=4)
time_series=time_series-ma(time_series, window=4)
nfft=1<<(time_series.shape[1]-1).bit_length()
freq_ts=(np.linspace(0,nfft/2,int(nfft/2)) *1/float(nfft))
freq_ts=freq_ts.reshape(1,-1).T


# Perform FFT
y=np.fft.fft(time_series, n=nfft)
y=abs(y**2) # Get the power spectrum

# Scale (optional)
y=(y-y.min())
y=y/y.max()

# Reshaping because Python is stupid and doesn't take care of this 
# automatically. Boy I miss Matlab.
y=y[0][0:int(nfft/2)].reshape(1,-1).T

plt.figure(figsize=(12,6))
plt.cla()
plt.plot(my_df.columns,time_series[0],'k-o')
plt.xticks(my_df.columns,rotation='vertical');
plt.tick_params(direction='in', length=10, width=2.5, colors='k',
                           grid_color='k', grid_alpha=1, labelsize=14)
plt.grid(b=True, linewidth=0.2)
plt.ylabel('Ave. Budget', fontsize=16)
plt.title('Time series with trend removed', fontsize=16)


plt.figure(figsize=(12,6))
plt.cla()
plt.plot(my_df.columns,mov_avg[0],'k-o')
plt.xticks(my_df.columns,rotation='vertical');
plt.tick_params(direction='in', length=10, width=2.5, colors='k',
                           grid_color='k', grid_alpha=1, labelsize=14)
plt.grid(b=True, linewidth=0.2)
plt.ylabel('Ave. Budget', fontsize=16)
plt.title('Moving average', fontsize=16)


plt.figure(figsize=(12,6))
plt.cla()
plt.bar(freq_ts.T[0],y.T[0], width=0.01)
plt.xticks(ticks=freq_ts.T[0],rotation='vertical');
#plt.yscale('log')
plt.tick_params(direction='in', length=10, width=2.5, colors='k',
                           grid_color='k', grid_alpha=1, labelsize=14)
plt.ylabel('Power spectrum', fontsize=16);
plt.xlabel('Frequency', fontsize=16);