### Budget trends in RL data

The following code uses the `all_activity_normal.csv` dataset to create trend lines for the average advertising budget over the 2009-2018 period. The dataset can be found at `G:\Market\for Matt & B2B\Data\ReachLocal\Platform Data\all_activities 2009-Nov. 2018.zip`. The code can be modified to create these graphs based on a variety of categories, for example business category or advertiser subcategory. The logic is fairly straightforward, and following the comments should make it easy to understand. The code has been automated to some extent and will be updated in the future into turn-key code.

*Author: Farshad Nasiri - B2B Data Science Team*

In [1]:
# Import necessary libraries

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

#################################################################
## For including LateX interpreter
#################################################################
#from matplotlib import rc
#rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']})
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
#rc('text', usetex=True)

In [34]:
# Read the data
# Loads the activity CSV from the relative ../data folder into `df`,
# the frame that the cells below work from.
df=pd.read_csv('../data/all_activity_normal.csv')

In [35]:
# Quick look at the shape of the dataframe: (number of rows, number of columns)
df.shape

(1957753, 55)

In [36]:
# Quick look at the columns: every column label in the loaded frame
df.columns

Index(['Unnamed: 0', 'idcampaign', 'idcampaign_master', 'idOffer',
       'Offer_Name', 'Finance_Product', 'idadvertiser', 'idadvertiser_master',
       'Advertiser_URL', 'idBusiness', 'Channel', 'Country', 'Currency',
       'campaign_budget', 'Retail_Cost', 'COGS', 'Overage', 'impressions',
       'clicks', 'CVTs', 'qualified_calls', 'calls', 'emails',
       'qualified_web_events', 'Leads', 'CPC', 'CTR', 'CPL', 'CTL', 'Paid_On',
       'Cycle_Started', 'Cycle_Ended', 'MCID_Started', 'Cycle_Number',
       'Advertiser_BC_ID', 'Advertiser_BusinessCategory', 'Advertiser_BSC_ID',
       'Advertiser_BusinessSubCategory', 'BC_ID', 'BusinessCategory',
       'Primary_BSC_ID', 'Primary_BusinessSubCategory', 'Secondary_BSC_Count',
       'Secondary_BSC_IDs', 'Seconardy_BSCs', 'BusinessSpecialtyID',
       'BusinessSpecialty', 'TargetType', 'Radius_Target_Address',
       'Radius_Target_Miles', 'Cities_Targetd', 'DMAs_Targeted',
       'Population_Target', 'ratio', 'ratio_cat'],
      dtype='

In [37]:
# Summary of missing values: number of null entries in each column
df.isnull().sum()

Unnamed: 0                              0
idcampaign                              0
idcampaign_master                       0
idOffer                                 0
Offer_Name                            275
Finance_Product                       275
idadvertiser                            0
idadvertiser_master                     0
Advertiser_URL                      35884
idBusiness                             11
Channel                                11
Country                                11
Currency                               11
campaign_budget                         0
Retail_Cost                             0
COGS                                    0
Overage                                 0
impressions                             0
clicks                                  0
CVTs                                    0
qualified_calls                         0
calls                                   0
emails                                  0
qualified_web_events              

In [41]:
# Parse both cycle-boundary columns into pandas datetime values
for date_col in ('Cycle_Started', 'Cycle_Ended'):
    df[date_col] = pd.to_datetime(df[date_col])

In [42]:
# Keep only the rows that have a campaign start date (the original note says
# a single row lacks one) and renumber the index from zero afterwards
df = df[df['Cycle_Started'].notna()].reset_index(drop=True)

In [59]:
# Columns that play no part in the graph generation are discarded
# so that the frame takes less memory
unused_columns = [
    'Unnamed: 0', 'Primary_BSC_ID', 'BusinessSpecialtyID', 'Advertiser_URL',
    'Country', 'Currency', 'BusinessSpecialty', 'CPC', 'CTR', 'CPL', 'CTL',
    'Paid_On',
]
df = df.drop(columns=unused_columns).reset_index(drop=True)

In [60]:
# Working subset for the BusinessCategory trend: rows that have a category,
# restricted to the four columns the quarterly aggregation reads.
has_category = df['BusinessCategory'].notna()
temp = df.loc[has_category,
              ['BusinessCategory', 'campaign_budget', 'Cycle_Started', 'Cycle_Ended']]

# Distinct category labels, taken from the unfiltered frame
# (so a missing-value entry can be part of this array).
bc = df['BusinessCategory'].unique()

In [29]:
# Seasonal summary per business category: for each calendar quarter (q1..q4),
# add up the per-year mean campaign budget over the years 2009-2018.
col = 'BusinessCategory'

# Take the labels from `temp` (rows without a category were already removed
# there) rather than from `df`: a NaN label cannot be sorted together with
# strings and would only add a row that never receives any data.
names = temp[col].dropna().unique()
row_size = len(names)

# Month-day suffixes delimiting each calendar quarter.
quarter_bounds = {
    1: ('-01-01', '-03-31'),
    2: ('-04-01', '-06-30'),
    3: ('-07-01', '-09-30'),
    4: ('-10-01', '-12-31'),
}
years = range(2009, 2019)

my_df = pd.DataFrame(data=np.zeros((row_size, 4)),
                     index=sorted(names.tolist()),
                     columns=['q1', 'q2', 'q3', 'q4'])

for quarter, (start_str, end_str) in quarter_bounds.items():
    my_col = 'q' + str(quarter)

    for year in years:
        start = pd.to_datetime(str(year) + start_str)
        end = pd.to_datetime(str(year) + end_str)

        # A cycle counts only if it starts on/after the first day of the
        # quarter and ends on/before the last day; cycles that straddle a
        # quarter boundary are left out.
        in_quarter = temp[(temp.Cycle_Started >= start) & (temp.Cycle_Ended <= end)]

        # groupby returns the categories already sorted, so no extra sort is needed.
        quarter_mean = in_quarter.groupby(col)['campaign_budget'].mean()

        # NOTE(review): this accumulates a SUM of yearly means (it is never
        # divided by the number of years) -- confirm that is the intended metric.
        my_df.loc[quarter_mean.index, my_col] = (
            my_df.loc[quarter_mean.index, my_col] + quarter_mean
        )

In [30]:
# Display the per-category q1..q4 table built in the previous cell
my_df

Unnamed: 0,q1,q2,q3,q4
Animals & Pets,7808.288451,7708.191971,7872.684065,7903.125647
Apparel / Fashion & Jewelry,11273.697075,11185.842412,10912.746549,11226.098921
Arts & Entertainment,12883.625808,12720.295831,13174.45663,13469.917689
Attorneys & Legal Services,15860.201891,16111.650225,15969.693421,15719.323484
Automotive -- For Sale,18437.01218,18532.436695,18342.515884,18303.444691
"Automotive -- Repair, Service & Parts",9331.245651,9360.649124,9369.656556,9277.417455
Beauty & Personal Care,11434.344637,11214.54083,11100.282426,11048.925732
Business Opportunities,17868.025623,15684.919854,17025.955651,15968.289928
Business Services,9886.767801,10155.350165,10241.741682,10174.967425
Career & Employment,11331.608365,11172.250897,10352.497631,10951.765443


In [31]:
# Mean campaign budget per business category for every quarter of 2009-2018,
# written out as one trend-line PNG per category.
col = 'BusinessCategory'
out_dir = '../output/BC-2009-2018/'

# Take the labels from `temp` (rows without a category were already removed
# there) rather than from `df`: a NaN label cannot be sorted together with
# strings and has no `.replace` method when the file name is built below.
names = temp[col].dropna().unique()

# Month-day suffixes delimiting each calendar quarter.
quarter_bounds = {
    1: ('-01-01', '-03-31'),
    2: ('-04-01', '-06-30'),
    3: ('-07-01', '-09-30'),
    4: ('-10-01', '-12-31'),
}
years = range(2009, 2019)
col_names = [str(year) + '-Q' + str(quarter)
             for year in years for quarter in quarter_bounds]

# Start from NaN instead of 0: a quarter with no qualifying cycle has no
# average, and a 0 would be drawn as a genuine budget of zero on the trend line.
my_df = pd.DataFrame(np.nan, index=sorted(names.tolist()), columns=col_names)

for year in years:
    for quarter, (start_str, end_str) in quarter_bounds.items():
        start = pd.to_datetime(str(year) + start_str)
        end = pd.to_datetime(str(year) + end_str)

        # A cycle counts only if it starts on/after the first day of the
        # quarter and ends on/before the last day; cycles that straddle a
        # quarter boundary are left out.
        in_quarter = temp[(temp.Cycle_Started >= start) & (temp.Cycle_Ended <= end)]
        quarter_mean = in_quarter.groupby(col)['campaign_budget'].mean()

        my_col = str(year) + '-Q' + str(quarter)
        my_df.loc[quarter_mean.index, my_col] = quarter_mean

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for category, budgets in my_df.iterrows():
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(my_df.columns, budgets, 'k-o')
    ax.tick_params(axis='x', labelrotation=90)
    ax.tick_params(direction='in', length=10, width=2.5, colors='k',
                   grid_color='k', grid_alpha=1, labelsize=14)
    ax.set_ylabel('Ave. Budget', fontsize=16)
    ax.set_title(category, fontsize=16)

    # Spaces become underscores and slashes are removed to form the file name.
    save_to = out_dir + category.replace(' ', '_').replace('/', '') + ".png"
    fig.savefig(save_to, bbox_inches='tight')
    # Close each figure so that the loop does not pile up open figures.
    plt.close(fig)


In [60]:
# Mean campaign budget per advertiser business category for every quarter of
# 2009-2018, written out as one trend-line PNG per category.
col = 'Advertiser_BusinessCategory'
out_dir = '../output/AdvBC-2009-2018/'

# Rows without a category are grouped under 'Other'. The result is assigned
# back to the column: calling fillna(inplace=True) on `df[col]` works on an
# intermediate object and does not reliably update `df` itself.
df[col] = df[col].fillna('Other')

temp = df[[col, 'campaign_budget', 'Cycle_Started', 'Cycle_Ended']]
names = df[col].unique()

# Month-day suffixes delimiting each calendar quarter.
quarter_bounds = {
    1: ('-01-01', '-03-31'),
    2: ('-04-01', '-06-30'),
    3: ('-07-01', '-09-30'),
    4: ('-10-01', '-12-31'),
}
years = range(2009, 2019)
col_names = [str(year) + '-Q' + str(quarter)
             for year in years for quarter in quarter_bounds]

# Start from NaN instead of 0: a quarter with no qualifying cycle has no
# average, and a 0 would be drawn as a genuine budget of zero on the trend line.
my_df = pd.DataFrame(np.nan, index=sorted(names.tolist()), columns=col_names)

for year in years:
    for quarter, (start_str, end_str) in quarter_bounds.items():
        start = pd.to_datetime(str(year) + start_str)
        end = pd.to_datetime(str(year) + end_str)

        # A cycle counts only if it starts on/after the first day of the
        # quarter and ends on/before the last day; cycles that straddle a
        # quarter boundary are left out.
        in_quarter = temp[(temp.Cycle_Started >= start) & (temp.Cycle_Ended <= end)]
        quarter_mean = in_quarter.groupby(col)['campaign_budget'].mean()

        my_col = str(year) + '-Q' + str(quarter)
        my_df.loc[quarter_mean.index, my_col] = quarter_mean

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for category, budgets in my_df.iterrows():
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(my_df.columns, budgets, 'k-o')
    ax.tick_params(axis='x', labelrotation=90)
    ax.tick_params(direction='in', length=10, width=2.5, colors='k',
                   grid_color='k', grid_alpha=1, labelsize=14)
    ax.set_ylabel('Ave. Budget', fontsize=16)
    ax.set_title(category, fontsize=16)

    # Spaces become underscores and slashes are removed to form the file name.
    save_to = out_dir + category.replace(' ', '_').replace('/', '') + ".png"
    fig.savefig(save_to, bbox_inches='tight')
    # Close each figure so that the loop does not pile up open figures.
    plt.close(fig)

In [64]:
# Number of distinct primary business subcategories (missing values are not counted)
df['Primary_BusinessSubCategory'].nunique()

951

In [65]:
# Mean campaign budget per primary business subcategory for every quarter of
# 2009-2018, written out as one trend-line PNG per subcategory.
col = 'Primary_BusinessSubCategory'
out_dir = '../output/Primary_BSC-2009-2018/'

# Rows without a subcategory are grouped under 'Other'. The result is assigned
# back to the column: calling fillna(inplace=True) on `df[col]` works on an
# intermediate object and does not reliably update `df` itself.
df[col] = df[col].fillna('Other')

temp = df[[col, 'campaign_budget', 'Cycle_Started', 'Cycle_Ended']]
names = df[col].unique()

# Month-day suffixes delimiting each calendar quarter.
quarter_bounds = {
    1: ('-01-01', '-03-31'),
    2: ('-04-01', '-06-30'),
    3: ('-07-01', '-09-30'),
    4: ('-10-01', '-12-31'),
}
years = range(2009, 2019)
col_names = [str(year) + '-Q' + str(quarter)
             for year in years for quarter in quarter_bounds]

# Start from NaN instead of 0: a quarter with no qualifying cycle has no
# average, and a 0 would be drawn as a genuine budget of zero on the trend line.
my_df = pd.DataFrame(np.nan, index=sorted(names.tolist()), columns=col_names)

for year in years:
    for quarter, (start_str, end_str) in quarter_bounds.items():
        start = pd.to_datetime(str(year) + start_str)
        end = pd.to_datetime(str(year) + end_str)

        # A cycle counts only if it starts on/after the first day of the
        # quarter and ends on/before the last day; cycles that straddle a
        # quarter boundary are left out.
        in_quarter = temp[(temp.Cycle_Started >= start) & (temp.Cycle_Ended <= end)]
        quarter_mean = in_quarter.groupby(col)['campaign_budget'].mean()

        my_col = str(year) + '-Q' + str(quarter)
        my_df.loc[quarter_mean.index, my_col] = quarter_mean

# savefig does not create missing folders, so make sure the target exists.
os.makedirs(out_dir, exist_ok=True)

for category, budgets in my_df.iterrows():
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(my_df.columns, budgets, 'k-o')
    ax.tick_params(axis='x', labelrotation=90)
    ax.tick_params(direction='in', length=10, width=2.5, colors='k',
                   grid_color='k', grid_alpha=1, labelsize=14)
    ax.set_ylabel('Ave. Budget', fontsize=16)
    ax.set_title(category, fontsize=16)

    # Spaces become underscores and slashes are removed to form the file name.
    save_to = out_dir + category.replace(' ', '_').replace('/', '') + ".png"
    fig.savefig(save_to, bbox_inches='tight')
    # Close each figure so that the loop does not pile up open figures.
    plt.close(fig)