In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import beta

import warnings
# Filter out all warnings
warnings.filterwarnings('ignore', category=Warning)

In [15]:
#Ingest historical data
df = pd.read_csv('/Users/peter/Documents/Data Science/Waterfall/Chris Copy Impressions + Conversions - Sheet1.csv')
#Convert the 'TOTAL_AD_SPEND' and 'REVENUE_PER_BRAND_IMPRESSION' columns from dollar format to float
df['TOTAL_AD_SPEND'] = df['TOTAL_AD_SPEND'].replace('[\$,]', '', regex=True).astype(float)
df['REVENUE_PER_BRAND_IMPRESSION'] = df['REVENUE_PER_BRAND_IMPRESSION'].replace('[\$,]', '', regex=True).astype(float)
#Convert the 'CVR' column from percentage format to float
df['CVR'] = df['CVR'].replace('[\%,]', '', regex=True).astype(float)/100
#Remove the row where 'BRAND_IMPRESSIONS' is 'd'
df = df[df.BRAND_IMPRESSIONS != 'd']
#Convert the 'BRAND_IMPRESSIONS' column from string to int
df['BRAND_IMPRESSIONS'] = df['BRAND_IMPRESSIONS'].astype(int)
df.head()

Unnamed: 0,PUBLISHER,BRAND_NAME,DISPLAY_CUSTOMER_TYPE,DISTINCT_MERCHANTS_AT_LOAD_TIME,ML_MODEL,BRAND_IMPRESSIONS,CONV_COUNT,TOTAL_AD_SPEND,REVENUE_PER_BRAND_IMPRESSION,CVR
0,Laura Geller,Dr. Squatch,returning_customer,4,bert4recs1,156,7,7.0,0.0449,0.044872
1,Laura Geller,Caden Lane,returning_customer,4,bert4recs1,156,6,120.0,0.7692,0.038462
2,Laura Geller,Sol de Janeiro,returning_customer,1,boost5,368,14,140.0,0.3804,0.038043
3,Laura Geller,Kopari,returning_customer,3,boost5,128,4,120.0,0.9375,0.03125
4,Laura Geller,Dollar Shave Club,returning_customer,4,bert4recs1,112,3,15.0,0.1339,0.026786


In [17]:
#Create filter variables
model_filter = 'bert4recs1'
customer_type_filter = 'new_customer'
#Apply these filters based on the appropriate columns
df_filter = df[(df['ML_MODEL'] == model_filter) & (df['DISPLAY_CUSTOMER_TYPE'] == customer_type_filter)]
df_filter.drop(['ML_MODEL', 'DISPLAY_CUSTOMER_TYPE', 'DISTINCT_MERCHANTS_AT_LOAD_TIME'], axis=1, inplace=True)

#Aggregate the data 
df_filter_agg = df_filter.groupby(['PUBLISHER', 'BRAND_NAME'], as_index=False).sum()
df_filter_agg['CVR'] = df_filter_agg['CONV_COUNT'] / df_filter_agg['BRAND_IMPRESSIONS']
df_filter_agg['REVENUE_PER_BRAND_IMPRESSION'] = df_filter_agg['TOTAL_AD_SPEND'] / df_filter_agg['BRAND_IMPRESSIONS']
df_filter_agg.head()

Unnamed: 0,PUBLISHER,BRAND_NAME,BRAND_IMPRESSIONS,CONV_COUNT,TOTAL_AD_SPEND,REVENUE_PER_BRAND_IMPRESSION,CVR
0,120/Life,Dr. Squatch,101,1,25.0,0.247525,0.009901
1,120/Life,Laura Geller,217,1,15.0,0.069124,0.004608
2,120/Life,True Classic,201,0,0.0,0.0,0.0
3,8Greens,BlendJet,132,0,0.0,0.0,0.0
4,8Greens,Laura Geller,175,0,0.0,0.0,0.0


In [21]:
#Ingest proposed rankings
rankings = pd.read_csv('Proposed Rankings.csv')
rankings.head()

Unnamed: 0,Publisher,Advertiser,Rank
0,Laura Geller,Mario Badescu Skin Care,1
1,Laura Geller,LIVELY,2
2,Laura Geller,Love in Faith,3
3,Laura Geller,Ancient Nutrition,4
4,Laura Geller,Sol de Janeiro,5


In [22]:
#Filter for rows where 'PUBLISHER' in df is in 'publisher' in rankings
output = df_filter_agg[df_filter_agg['PUBLISHER'].isin(rankings['Publisher'])]
output = output[output['BRAND_NAME'].isin(rankings['Advertiser'])]
#Change the column names of rankings to match df
rankings.columns = ['PUBLISHER', 'BRAND_NAME', 'PROPOSED_RANK']
#Merge the two dataframes
output = pd.merge(output, rankings, on=['PUBLISHER', 'BRAND_NAME'], how='left')
#Filter for rows where 'PROPOSED_RANK' is not null
output = output[output['PROPOSED_RANK'].notnull()]
output.sample(5)
print(len(output))
print(len(df))
output.drop(columns='PROPOSED_RANK', inplace=True)
output.sample(5)

11
8894


Unnamed: 0,PUBLISHER,BRAND_NAME,BRAND_IMPRESSIONS,CONV_COUNT,TOTAL_AD_SPEND,REVENUE_PER_BRAND_IMPRESSION,CVR
3,Laura Geller,Dagne Dover,1508,0,0.0,0.0,0.0
13,Sol de Janeiro,Dagne Dover,867,0,0.0,0.0,0.0
18,Sol de Janeiro,Liquid I.V.,7998,4,140.0,0.017504,0.0005
2,Dr. Squatch,Twillory,233,0,0.0,0.0,0.0
1,Dr. Squatch,True Classic,18047,21,945.0,0.052363,0.001164


## Analysis

In [23]:
output.head(11)

Unnamed: 0,PUBLISHER,BRAND_NAME,BRAND_IMPRESSIONS,CONV_COUNT,TOTAL_AD_SPEND,REVENUE_PER_BRAND_IMPRESSION,CVR
1,Dr. Squatch,True Classic,18047,21,945.0,0.052363,0.001164
2,Dr. Squatch,Twillory,233,0,0.0,0.0,0.0
3,Laura Geller,Dagne Dover,1508,0,0.0,0.0,0.0
6,Laura Geller,Kindra,459,0,0.0,0.0,0.0
8,Laura Geller,Mario Badescu Skin Care,560,0,0.0,0.0,0.0
9,Laura Geller,Sol de Janeiro,943,0,0.0,0.0,0.0
13,Sol de Janeiro,Dagne Dover,867,0,0.0,0.0,0.0
15,Sol de Janeiro,Glamnetic,3002,1,40.0,0.013324,0.000333
16,Sol de Janeiro,LIVELY,2079,0,0.0,0.0,0.0
17,Sol de Janeiro,Laura Geller,14534,19,285.0,0.019609,0.001307


In [25]:
def bayesian_variables(df):
    df_top_pub = df.copy()
    prior_alpha = 1
    prior_beta = 1  
    df_top_pub['Posterior CVR'] = (df_top_pub['CONV_COUNT'] + prior_alpha) / \
                               (df_top_pub['BRAND_IMPRESSIONS'] + prior_alpha + prior_beta)
    
    # Calculate Beta distribution parameters
    df_top_pub['alpha'] = df_top_pub['CONV_COUNT'] + prior_alpha
    df_top_pub['beta'] = df_top_pub['BRAND_IMPRESSIONS'] - df_top_pub['CONV_COUNT'] + prior_beta
    return df_top_pub

In [28]:
def bayesian_comparison(df):
    """
    Calculate the probability of being better for each pair of brands in the DataFrame.

    Args:
    df (pd.DataFrame): DataFrame containing the brands and their Beta distribution parameters.

    Returns:
    pd.DataFrame: A matrix of probabilities where each element (i, j) represents the probability
                  of brand i being better than brand j.
    """
    n_brands = len(df)
    prob_matrix = np.zeros((n_brands, n_brands))

    for i in range(n_brands):
        for j in range(n_brands):
            if i == j:
                # Probability of a brand being better than itself is not defined
                prob_matrix[i, j] = np.nan
            else:
                alpha_i, beta_i = df.iloc[i]['alpha'], df.iloc[i]['beta']
                alpha_j, beta_j = df.iloc[j]['alpha'], df.iloc[j]['beta']
                
                # Define a range of conversion rates for evaluation
                x = np.linspace(0, 1, 10000)
                
                # Calculate the PDFs for both distributions
                pdf_i = beta.pdf(x, alpha_i, beta_i)
                pdf_j = beta.pdf(x, alpha_j, beta_j)
                
                # Estimate "probability of being better" for brand i over brand j
                prob_being_better = np.trapz(np.where(pdf_i > pdf_j, pdf_i, 0), x) / np.trapz(pdf_i, x)
                prob_matrix[i, j] = prob_being_better

    # Create a DataFrame from the matrix for better readability
    prob_df = pd.DataFrame(prob_matrix, index=df['BRAND_NAME'], columns=df['BRAND_NAME'])
    return prob_df

bayesian_comparison(sample)

BRAND_NAME,Dagne Dover,Glamnetic,LIVELY,Laura Geller,Liquid I.V.
BRAND_NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Dagne Dover,,0.490873,0.520872,0.690713,0.59722
Glamnetic,0.791243,,0.719153,0.782008,0.45593
LIVELY,0.79115,0.523807,,0.83645,0.593556
Laura Geller,0.950026,0.930704,0.971675,,0.932374
Liquid I.V.,0.871633,0.771328,0.8206,0.876052,


In [None]:
sample = output[output['PUBLISHER'] == 'facebook']
bayesian_comparison(sample)

In [29]:
def rank_probabilities(df):
    """
    Rank brands based on their aggregate "best" score calculated from the probability matrix.

    Args:
    prob_matrix_df (pd.DataFrame): Probability matrix where each element (i, j) represents
                                   the probability of brand i being better than brand j.

    Returns:
    pd.DataFrame: Ranked brands based on their aggregate "best" scores, from highest to lowest.
    """
    # Calculate the aggregate "best" score for each brand
    # Ignoring NaN values in the mean calculation
    aggregate_best_scores = prob_matrix_df.mean(axis=1)

    # Create a DataFrame for the scores
    aggregate_scores_df = pd.DataFrame({'Brand': aggregate_best_scores.index, 'Aggregate Best Score': aggregate_best_scores.values})

    # Rank the brands based on their aggregate "best" score
    ranked_brands = aggregate_scores_df.sort_values(by='Aggregate Best Score', ascending=False).reset_index(drop=True)

    return ranked_brands

prob_matrix_df = bayesian_comparison(sample)
rank_probabilities(prob_matrix_df)

Unnamed: 0,Brand,Aggregate Best Score
0,Laura Geller,0.946195
1,Liquid I.V.,0.834903
2,Glamnetic,0.687084
3,LIVELY,0.686241
4,Dagne Dover,0.57492


In [36]:
def calculate_rankings(df):
    bayesian_output = []
    for publisher in df['PUBLISHER'].unique():
        sample = bayesian_variables(df[df['PUBLISHER'] == publisher])
        prob_matrix_df = bayesian_comparison(sample)
        ranked_brands = rank_probabilities(prob_matrix_df)
        ranked_brands['PUBLISHER'] = publisher
        ranked_brands = ranked_brands[['PUBLISHER', 'Brand', 'Aggregate Best Score']]
        ranked_brands.columns = ['PUBLISHER', 'BRAND_NAME', 'AGGREGATE_BEST_SCORE']
        bayesian_output.append(ranked_brands)
    
    agg_scores = pd.concat(bayesian_output)
    #merge with the original dataframe
    return pd.merge(df, agg_scores, on=['PUBLISHER', 'BRAND_NAME'], how='left').sort_values(by=['PUBLISHER', 'AGGREGATE_BEST_SCORE'], ascending=False)
    #return output[['PUBLISHER', 'Brand', 'Aggregate Best Score']]

bayesian_output = calculate_rankings(output)
bayesian_output

Unnamed: 0,PUBLISHER,BRAND_NAME,BRAND_IMPRESSIONS,CONV_COUNT,TOTAL_AD_SPEND,REVENUE_PER_BRAND_IMPRESSION,CVR,AGGREGATE_BEST_SCORE
9,Sol de Janeiro,Laura Geller,14534,19,285.0,0.019609,0.001307,0.946195
10,Sol de Janeiro,Liquid I.V.,7998,4,140.0,0.017504,0.0005,0.834903
7,Sol de Janeiro,Glamnetic,3002,1,40.0,0.013324,0.000333,0.687084
8,Sol de Janeiro,LIVELY,2079,0,0.0,0.0,0.0,0.686241
6,Sol de Janeiro,Dagne Dover,867,0,0.0,0.0,0.0,0.57492
2,Laura Geller,Dagne Dover,1508,0,0.0,0.0,0.0,0.57492
3,Laura Geller,Kindra,459,0,0.0,0.0,0.0,
4,Laura Geller,Mario Badescu Skin Care,560,0,0.0,0.0,0.0,
5,Laura Geller,Sol de Janeiro,943,0,0.0,0.0,0.0,
0,Dr. Squatch,True Classic,18047,21,945.0,0.052363,0.001164,


In [37]:
bayesian_output.to_csv('bayesian_output.csv', index=False)

In [40]:
rankings.merge(bayesian_output, on=['PUBLISHER', 'BRAND_NAME'], how='left').drop(columns='PROPOSED_RANK').to_csv('bayesian_output.csv', index=False)