In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import beta
import plotly.express as px
from datetime import datetime
from dotenv import load_dotenv
from snowflake import connector

import warnings
# Suppress every warning: `Warning` is the base class of all warning
# categories, so this filter also hides deprecation and runtime warnings.
# NOTE(review): presumably added to quiet pandas' notice about passing a raw
# DBAPI connection to `pd.read_sql` — confirm, and consider narrowing the filter.
warnings.filterwarnings('ignore', category=Warning)

In [9]:
# Load the Snowflake credentials from a dotenv file into the process environment.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine where that file exists.
load_dotenv('/Users/peter/.env')

# Read each connection setting exactly once so the values inspected here are
# the same ones handed to the connector below.
username = os.getenv("SNOWFLAKE_USERNAME")
password = os.getenv("SNOWFLAKE_PASSWORD")
account = os.getenv("SNOWFLAKE_ACCOUNT")
# No trailing comma: `role = value,` would bind a 1-tuple instead of a string.
role = os.getenv("SNOWFLAKE_ROLE")
warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")

# establish Snowflake connection
connection = connector.connect(
    user=username,
    password=password,
    account=account,
    role=role,
    warehouse=warehouse,
    database='DISCO_CORE',
)

# # Set OpenAI API key
# openai.api_key = os.getenv("OPENAI_API_KEY")

### Ingest Data

In [25]:
# Daily brand-display volume per load type.
# The query counts distinct `widget_brand_display` events between 2024-03-08
# and 2024-04-30, skips rows where the brand and the publisher are the same,
# and buckets each event into a load_type via the CASE expression.
raw_daily_display_query = """
select 
    to_date(event_created_at) as event_date,
    -- widget_type,
    -- ml_model,
    case 
        when widget_type = 'DISCOFEED' and ml_model not ilike '%waterfall%' and e.ml_model != 'contextual' then 'bert_boost_classic'
        when ml_model ilike '%waterfall%' then 'waterfall'
        when widget_type = 'LEAD_GEN' then 'nurture'
        when ml_model = 'contextual' and widget_type != 'LEAD_GEN' then 'contextual_classic'
        else 'other' 
    end as load_type,
    count(distinct event_id) as brand_displays
from event.event e 
where
    event_name = 'widget_brand_display'
    and to_date(event_created_at) between '2024-03-08' and '2024-04-30'
    -- and ml_model is null
    and brand_name != publisher_name
group by all 
order by event_date asc
"""

# Execute the query and lower-case the column labels in one chain; later cells
# refer to the columns by lower-case names such as 'event_date'.
raw_daily_display = (
    pd.read_sql(raw_daily_display_query, connection)
    .rename(columns=str.lower)
)

In [26]:
# Same-day ("zero day") ad spend per load type.
# `conv_table` stitches together two conversion sources: the ordermap-fix table
# for orders created 2024-02-01..2024-03-31 and `derived.conversion` for orders
# created in April 2024. The outer query keeps cross-sell conversions with
# days_to_attribution = 0 and sums billable_amount (nulls counted as 0) by
# event date and load_type.
raw_zero_day_adspend_query = """
with conv_table as (
select *
from playground_analytics.derived__conversion_ordermap_fix 
where to_date(order_created_at) between '2024-02-01' and '2024-03-31'
union all 
select *
from derived.conversion 
where to_date(order_created_at) between '2024-04-01' and '2024-04-30'
)
select 
    to_date(c.event_created_at) as event_date,
    case 
        when e.widget_type = 'DISCOFEED' and e.ml_model not ilike '%waterfall%' and e.ml_model != 'contextual' then 'bert_boost_classic'
        when e.ml_model ilike '%waterfall%' then 'waterfall'
        when e.widget_type = 'LEAD_GEN' then 'nurture'
        when e.ml_model = 'contextual' and e.widget_type != 'LEAD_GEN' then 'contextual_classic'
        else 'other' 
    end as load_type,
    sum(case when billable_amount is null then 0 else billable_amount end) as zeroday_ad_spend
from conv_table c
left join curated.ad_spend_revenue asr using (order_id)
left join event.event e on e.event_id = c.event_id
where 
    c.conversion_type = 'cross-sell'
    and to_date(c.event_created_at) between '2024-03-08' and '2024-04-30'
    and c.days_to_attribution = 0
group by all 
order by event_date asc
;
"""

# Execute the query and lower-case the column labels in one chain; the pivot
# cell below addresses the columns by their lower-case names.
raw_zero_day_adspend = (
    pd.read_sql(raw_zero_day_adspend_query, connection)
    .rename(columns=str.lower)
)

In [42]:
# Final (all attribution windows) cross-sell conversions and ad spend per day.
# Uses the same two-source `conv_table` as the zero-day query, but the
# days_to_attribution filter and the load_type split are commented out, so the
# result has one row per event date with a distinct-order count and the summed
# billable_amount (nulls counted as 0).
final_conv_query = """
with conv_table as (
select *
from playground_analytics.derived__conversion_ordermap_fix 
where to_date(order_created_at) between '2024-02-01' and '2024-03-31'
union all 
select *
from derived.conversion 
where to_date(order_created_at) between '2024-04-01' and '2024-04-30'
)
select 
    to_date(c.event_created_at) as event_date,
        -- case 
    --     when e.widget_type = 'DISCOFEED' and e.ml_model not ilike '%waterfall%' and e.ml_model != 'contextual' then 'bert_boost_classic'
    --     when e.ml_model ilike '%waterfall%' then 'waterfall'
    --     when e.widget_type = 'LEAD_GEN' then 'nurture'
    --     when e.ml_model = 'contextual' and e.widget_type != 'LEAD_GEN' then 'contextual_classic'
    --     else 'other' 
    -- end as load_type,
    count(distinct c.order_id) as final_conv_count,
    sum(case when billable_amount is null then 0 else billable_amount end) as final_ad_spend,
    -- count(case when c.customer_type = 'new' then 1 else null end) / count(distinct c.order_id) as new_cust_rate,
    -- count(case when c.event_classification = 'click' then 1 else null end) / count(distinct c.order_id) as click_rate,
from event.event e 
left join conv_table c using (event_id)
left join curated.ad_spend_revenue asr on c.order_id = asr.order_id
where 
    c.conversion_type = 'cross-sell'
    and to_date(c.event_created_at) between '2024-03-08' and '2024-04-30'
    -- and c.days_to_attribution = 0
group by all 
order by event_date asc
;
"""

# Execute the query and lower-case the column labels in one chain.
final_conv = (
    pd.read_sql(final_conv_query, connection)
    .rename(columns=str.lower)
)

### Organize Data

In [36]:
# Reshape the long daily display counts into one row per event_date with one
# column per load_type; date/load_type combinations with no rows become 0.
display_pivot = (
    raw_daily_display
    .pivot(index='event_date', columns='load_type', values='brand_displays')
    .reset_index()
    .fillna(0)
)
# Append the "_displays" suffix to every load-type column, leaving 'event_date'
# untouched. Assigning a plain list also leaves the columns index unnamed, so
# the 'load_type' label introduced by the pivot is dropped.
display_pivot.columns = ['event_date'] + [
    load_type + '_displays' for load_type in display_pivot.columns[1:]
]
# Preview five random rows.
display_pivot.sample(5)

Unnamed: 0,event_date,bert_boost_classic_displays,contextual_classic_displays,nurture_displays,other_displays,waterfall_displays
17,2024-03-25,230868.0,17805.0,24720.0,28.0,34803.0
19,2024-03-27,224306.0,10779.0,23128.0,1314.0,58037.0
18,2024-03-26,239359.0,10064.0,26365.0,404.0,51054.0
37,2024-04-14,210455.0,14254.0,25874.0,7767.0,1.0
32,2024-04-09,219055.0,11876.0,22216.0,6003.0,58123.0


In [38]:
# Reshape the long zero-day ad spend into one row per event_date with one
# column per load_type; date/load_type combinations with no rows become 0.
zero_day_adspend_pivot = (
    raw_zero_day_adspend
    .pivot(index='event_date', columns='load_type', values='zeroday_ad_spend')
    .reset_index()
    .fillna(0)
)
# Append the "_0day_adspend" suffix to every load-type column, leaving
# 'event_date' untouched. Assigning a plain list also leaves the columns index
# unnamed, so the 'load_type' label introduced by the pivot is dropped.
zero_day_adspend_pivot.columns = ['event_date'] + [
    load_type + '_0day_adspend' for load_type in zero_day_adspend_pivot.columns[1:]
]
# Preview five random rows.
zero_day_adspend_pivot.sample(5)

Unnamed: 0,event_date,bert_boost_classic_0day_adspend,contextual_classic_0day_adspend,nurture_0day_adspend,other_0day_adspend,waterfall_0day_adspend
29,2024-04-06,1693.98,140.0,92.0,20.0,165.0
45,2024-04-22,1843.7,20.0,78.0,75.0,281.0
36,2024-04-13,1641.81,85.0,287.0,10.0,0.0
46,2024-04-23,1465.0,65.0,105.0,75.0,106.0
51,2024-04-28,1927.5,425.0,0.0,75.0,402.98


In [43]:
# Disabled: would add an 'rpl' column equal to final_ad_spend divided by
# final_conv_count, rounded to 4 decimal places.
# final_conv['rpl'] = round(final_conv['final_ad_spend'] / final_conv['final_conv_count'],4)
# Preview five random rows of the daily conversion totals.
final_conv.sample(5)

Unnamed: 0,event_date,final_conv_count,final_ad_spend
35,2024-04-12,486,7053.05
27,2024-04-04,442,6775.17
9,2024-03-17,412,6126.8
51,2024-04-28,225,3840.96
3,2024-03-11,415,6795.97
