In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from scipy.stats import beta

In [2]:
df = pd.read_csv('/Users/peter/Documents/Data Science/Nurture/offers_publisher_brand_3.7.24.csv')
df.sample(5)

Unnamed: 0,SESSION_ID,PUBLISHER_NAME,BRAND_NAME,SEQUENCE_NUMBER,CLAIMED_OR_BROWSED
42786,18411923-7d3f-4d4b-8225-b2c16d0c8091,Matador Meggings,Twillory,0,browsed
109438,8afb311c-4135-4db0-b3cd-8b74ecac6e10,Laura Geller,Caden Lane,0,browsed
63629,dd16c6b4-58cb-4455-8ff5-7c49440a0b1e,Afloral,Laura Geller,0,browsed
82353,dd446045-a1b8-40b5-8d51-9be886a67852,Blueland,Buffbunny Collection,1,browsed
117544,6e4e8fbc-267b-474f-9cf0-5110d93712ba,Branch,ettitude,0,browsed


In [3]:
#Find the total number of offers claimed
len(df.loc[df['CLAIMED_OR_BROWSED'] == 'claimed'])

6107

## Aggregate Metrics

In [4]:
def calculate_agg_performance(df):
    # Total Unique Sessions
    total_unique_sessions = df['SESSION_ID'].nunique()
    
    # Browsed Past the First Offer
    browsed_past_first_offer = df[df['SEQUENCE_NUMBER'] > 0]['SESSION_ID'].nunique()
    
    # Browsed All Offers
    sessions_browsed_all_offers = df[df['CLAIMED_OR_BROWSED'] == 'browsed'].groupby('SESSION_ID')['SEQUENCE_NUMBER'].nunique()
    browsed_all_offers = sessions_browsed_all_offers[sessions_browsed_all_offers == 3].size
    
    # Claimed At Least One Offer
    claimed_at_least_one_offer = df[df['CLAIMED_OR_BROWSED'] == 'claimed']['SESSION_ID'].nunique()
    
    # Engaged: Claimed any offer OR browsed all offers
    # Find sessions that claimed any offer
    sessions_claimed_any_offer = df[df['CLAIMED_OR_BROWSED'] == 'claimed']['SESSION_ID'].unique()
    
    # Find sessions that browsed all offers
    sessions_browsed_all = sessions_browsed_all_offers[sessions_browsed_all_offers == 3].index
    
    # Combine and find unique sessions to define "Engaged"
    engaged_sessions = len(pd.unique(np.concatenate((sessions_claimed_any_offer, sessions_browsed_all))))
    
    # Prepare the output DataFrame
    metrics_data = {
        'Metric': [
            'Total Unique Sessions',
            'Browsed Past the First Offer',
            'Browsed All Offers',
            'Claimed At Least One Offer',
            'Engaged'
        ],
        'unique_sessions': [
            total_unique_sessions,
            browsed_past_first_offer,
            browsed_all_offers,
            claimed_at_least_one_offer,
            engaged_sessions
        ]
    }
    metrics_df = pd.DataFrame(metrics_data)
    
    # Calculate percent column as decimal to 5 places
    metrics_df['percent'] = (metrics_df['unique_sessions'] / total_unique_sessions).round(5)
    
    return metrics_df

agg_df = calculate_agg_performance(df)
agg_df

Unnamed: 0,Metric,unique_sessions,percent
0,Total Unique Sessions,218352,1.0
1,Browsed Past the First Offer,15807,0.07239
2,Browsed All Offers,11323,0.05186
3,Claimed At Least One Offer,5283,0.02419
4,Engaged,16606,0.07605


## Publisher Breakdown

In [5]:
def calculate_publisher_performance(df):
    # Total Unique Sessions per Publisher
    total_sessions_per_publisher = df.groupby('PUBLISHER_NAME')['SESSION_ID'].nunique()
    
    # Browsed Past the First Offer Rate per Publisher
    browsed_past_first_offer = df[df['SEQUENCE_NUMBER'] > 0].groupby('PUBLISHER_NAME')['SESSION_ID'].nunique()
    
    # Browsed All Offers Rate per Publisher
    browsed_all_offers = df[df['CLAIMED_OR_BROWSED'] == 'browsed'].groupby(['PUBLISHER_NAME', 'SESSION_ID'])['SEQUENCE_NUMBER'].nunique()
    browsed_all_offers = browsed_all_offers.reset_index()
    browsed_all_offers = browsed_all_offers[browsed_all_offers['SEQUENCE_NUMBER'] == 3].groupby('PUBLISHER_NAME')['SESSION_ID'].nunique()
    
    # Claimed At Least One Offer Rate per Publisher
    claimed_at_least_one_offer = df[df['CLAIMED_OR_BROWSED'] == 'claimed'].groupby('PUBLISHER_NAME')['SESSION_ID'].nunique()
    
    # Engaged Rate per Publisher
    # Find sessions that claimed any offer or browsed all offers per publisher
    claimed_sessions = df[df['CLAIMED_OR_BROWSED'] == 'claimed'].groupby('PUBLISHER_NAME')['SESSION_ID'].unique()
    engaged_sessions = (claimed_sessions.apply(len) + browsed_all_offers.reindex(claimed_sessions.index, fill_value=0)).reindex(total_sessions_per_publisher.index, fill_value=0)

    # Prepare the DataFrame
    metrics_df = pd.DataFrame({
        'total_unique_sessions': total_sessions_per_publisher,
        'browsed_past_first_offer_rate': (browsed_past_first_offer / total_sessions_per_publisher).round(5),
        'browsed_all_offers_rate': (browsed_all_offers / total_sessions_per_publisher).round(5),
        'claimed_at_least_one_offer_rate': (claimed_at_least_one_offer / total_sessions_per_publisher).round(5),
        'engaged_rate': (engaged_sessions / total_sessions_per_publisher).round(5)
    })

    # Remove rows where all rate values are NaN
    metrics_df.dropna(how='all', subset=['browsed_past_first_offer_rate', 'browsed_all_offers_rate', 'claimed_at_least_one_offer_rate', 'engaged_rate'], inplace=True)

    # Sort DataFrame by 'total_unique_sessions' in descending order
    metrics_df.sort_values(by='total_unique_sessions', ascending=False, inplace=True)
    
    return metrics_df.reset_index().rename(columns={'PUBLISHER_NAME': 'publisher'})

publisher_df = calculate_publisher_performance(df)
publisher_df

Unnamed: 0,publisher,total_unique_sessions,browsed_past_first_offer_rate,browsed_all_offers_rate,claimed_at_least_one_offer_rate,engaged_rate
0,Laura Geller,24617,0.05577,0.04184,0.01276,0.05460
1,BlendJet,14194,0.04206,0.02825,0.01282,0.04107
2,True Classic,8832,0.04529,0.03419,0.01313,0.04733
3,Dr. Squatch,7748,0.10003,0.07808,0.02013,0.09822
4,Sol de Janeiro,7346,0.10700,0.06834,0.04792,0.11625
...,...,...,...,...,...,...
439,Favour,1,,,,0.00000
440,Skidders,1,,,,0.00000
441,Anjie + Ash,1,,,,0.00000
442,Goldmine,1,,,,0.00000


## Brand Breakdown

In [6]:
def calculate_brand_performance(df):
    # Ensure 'BRAND_NAME' and other relevant columns are of type string to avoid unexpected behavior
    df['BRAND_NAME'] = df['BRAND_NAME'].astype(str)
    
    # Calculate total unique sessions per brand
    total_sessions_per_brand = df.groupby('BRAND_NAME')['SESSION_ID'].nunique()

    # Initialize the output DataFrame
    brand_performance = pd.DataFrame({
        'Brand': total_sessions_per_brand.index,
        'Unique_Sessions': total_sessions_per_brand.values
    })

    # Helper function to calculate browse and claim rates
    def calculate_rate(action, sequence):
        filtered = df[(df['CLAIMED_OR_BROWSED'] == action) & (df['SEQUENCE_NUMBER'] == sequence)]
        counts = filtered.groupby('BRAND_NAME')['SESSION_ID'].nunique()
        rates = counts.div(total_sessions_per_brand, fill_value=0)
        return rates

    # Calculate metrics for each position and action
    for position in range(3):
        browse_rate = calculate_rate('browsed', position).round(5)
        claim_rate = calculate_rate('claimed', position).round(5)

        brand_performance[f'Offer{position+1}_ViewRate'] = brand_performance['Brand'].map(browse_rate).fillna(0)
        brand_performance[f'Offer{position+1}_ClaimRate'] = brand_performance['Brand'].map(claim_rate).fillna(0)

    return brand_performance.sort_values('Unique_Sessions', ascending=False).reset_index(drop=True)

brand_df = calculate_brand_performance(df)
brand_df

Unnamed: 0,Brand,Unique_Sessions,Offer1_ViewRate,Offer1_ClaimRate,Offer2_ViewRate,Offer2_ClaimRate,Offer3_ViewRate,Offer3_ClaimRate
0,Laura Geller,73500,0.96167,0.01999,0.01227,0.00087,0.00491,0.00029
1,Caden Lane,63851,0.94169,0.01685,0.03237,0.00116,0.00767,0.00025
2,Little Words Project,16481,0.75347,0.01814,0.08052,0.00322,0.13646,0.00819
3,Canopy,14679,0.75182,0.01587,0.08543,0.00599,0.13271,0.00817
4,Buffbunny Collection,13464,0.80622,0.01248,0.13622,0.0104,0.03335,0.00134
5,Lalo,11777,0.85947,0.01656,0.06912,0.00484,0.04789,0.00212
6,D.S. & Durga,8295,0.82725,0.02447,0.08041,0.00374,0.05992,0.00422
7,ettitude,7119,0.79239,0.01517,0.10254,0.01025,0.07473,0.00492
8,LIVELY,6426,0.87286,0.00654,0.06038,0.00093,0.0554,0.00389
9,Lollipetals,5870,0.52249,0.01448,0.25741,0.01618,0.17649,0.01295


In [9]:
from datetime import datetime

# Get the current date
current_date = datetime.now()
date = current_date.strftime('%m_%d_%y')

#output all of the df to excel
with pd.ExcelWriter(f'/Users/peter/Documents/Data Science/Nurture/brand_publisher_performance{date}.xlsx') as writer:
    agg_df.to_excel(writer, sheet_name='Aggregate Performance', index=False)
    publisher_df.to_excel(writer, sheet_name='Publisher Performance', index=False)
    brand_df.to_excel(writer, sheet_name='Brand Performance', index=False)