In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import beta
import plotly.express as px
from datetime import datetime

import warnings
# Filter out all warnings: `Warning` is the base class, so every category is ignored.
# NOTE(review): this also hides pandas warnings (e.g. chained-assignment warnings) for the
# whole notebook; consider narrowing the filter to the specific categories that are noisy.
warnings.filterwarnings('ignore', category=Warning)

In [87]:
from snowflake import connector
import os
from dotenv import load_dotenv

# Load the Snowflake credentials from the local .env file into the process environment.
# NOTE(review): this is an absolute, user-specific path; other users must adjust it.
load_dotenv('/Users/peter/.env')
username = os.getenv("SNOWFLAKE_USERNAME")
password = os.getenv("SNOWFLAKE_PASSWORD")
account = os.getenv("SNOWFLAKE_ACCOUNT")
# No trailing comma here: the previous `role=os.getenv(...),` bound a 1-tuple, not a string.
role = os.getenv("SNOWFLAKE_ROLE")
warehouse = os.getenv("SNOWFLAKE_WAREHOUSE")

# establish Snowflake connection (reuses the values read above instead of re-reading the env)
connection = connector.connect(
    user=username,
    password=password,
    account=account,
    role=role,
    warehouse=warehouse,
    database='DISCO_CORE',
)

In [3]:
# Read the session/brand combinations export and normalise every header to lowercase.
df = pd.read_csv('data/session_combos.csv').rename(columns=str.lower)
# Show a random handful of rows as a sanity check.
df.sample(5)

Unnamed: 0,session_id,widget_type,ml_model,publisher_name,brand_name
77440,f9f98380-ed62-4fbe-9a68-afdc24944c0d,DISCOFEED,,Lalo,Lalo
2458904,4152cef4-495a-42f2-b228-771a54a2ce22,DISCOFEED,,Love in Faith,Sol de Janeiro
1574299,86519570-63b3-4c98-90c6-530e41362cee,DISCOFEED,bert4recs1,Tubby Todd Bath Co.,Dreamland Baby
2161242,5887bb15-7b96-4264-a958-0c8c7c63ec2b,DISCOFEED,bert4recs1,Beginning Boutique,Vegamour
2645733,32a5e190-9223-4871-b384-b39e9da0f868,DISCOFEED,bert4recs1,Love in Faith,QALO


In [5]:
# Read the max-budget brand list (one row per brand, with the timestamp at which it
# reached its max budget) and normalise every header to lowercase.
brands = pd.read_csv('data/max_budget_brands.csv').rename(columns=str.lower)
brands.head()

Unnamed: 0,order_brand_name,max_budget,reach_max_timestamp
0,Blueland,6000.0,2024-03-09T18:45:53Z
1,Dreamland Baby,2500.0,2024-03-17T16:29:27Z
2,HATCH,2000.0,2024-03-30T03:44:35Z
3,Kopari,5000.0,2024-03-15T20:37:07Z
4,LIVELY,4000.0,2024-03-29T15:43:37Z


### Identify Top Publishers and Brands

In [14]:
def find_top_publishers(df, brands, top_n=10):
    """Rank, for every brand in ``brands``, the publishers with the most distinct sessions.

    Parameters
    ----------
    df : pandas.DataFrame
        Session rows with at least ``session_id``, ``publisher_name`` and ``brand_name``.
    brands : pandas.DataFrame
        Frame whose ``order_brand_name`` column lists the brands to analyse.
    top_n : int, default 10
        Number of publishers to keep per brand.

    Returns
    -------
    pandas.DataFrame
        Columns ``brand_name``, ``publisher``, ``publisher_rank`` (1 = most sessions) and
        ``publisher_session_count``; at most ``top_n`` rows per brand. Empty (but with these
        columns) when ``brands`` lists no brand.
    """
    output_columns = ['brand_name', 'publisher', 'publisher_rank', 'publisher_session_count']
    per_brand = []

    for brand in brands['order_brand_name'].unique():
        # Distinct sessions per publisher for this brand, largest first.
        publisher_counts = (
            df[df['brand_name'] == brand]
            .groupby('publisher_name')['session_id']
            .nunique()
            .reset_index(name='publisher_session_count')
            .sort_values(by='publisher_session_count', ascending=False)
        )

        # `assign` builds a new frame, so nothing is written onto a slice of another frame.
        ranked = publisher_counts.assign(
            publisher_rank=range(1, len(publisher_counts) + 1),
            brand_name=brand,
        ).head(top_n)

        per_brand.append(
            ranked.rename(columns={'publisher_name': 'publisher'})[output_columns]
        )

    # pd.concat raises on an empty list; return an empty frame with the expected schema.
    if not per_brand:
        return pd.DataFrame(columns=output_columns)

    return pd.concat(per_brand, ignore_index=True)

# Rank the top publishers for every max-budget brand and preview a random sample.
# NOTE(review): `df_subset` is not defined in any cell visible above; presumably it is a
# filtered copy of `df` created in a cell that was removed. Confirm, otherwise this cell
# fails on Restart & Run All.
top_publishers = find_top_publishers(df_subset, brands)
top_publishers.sample(10)

Unnamed: 0,brand_name,publisher,publisher_rank,publisher_session_count
16,Dreamland Baby,Beginning Boutique,7,1908
57,Lunya,Lunya,8,351
54,Lunya,Bala,5,385
47,LIVELY,Love in Faith,8,1696
56,Lunya,Foria,7,356
90,True Botanicals,True Botanicals,1,10089
10,Dreamland Baby,Tubby Todd Bath Co.,1,34503
7,Blueland,Ora Organic,8,233
75,Sol de Janeiro,Liquid I.V.,6,4207
35,Kopari,Tubby Todd Bath Co.,6,199


In [19]:
def find_top_brands(df, brands_df, top_n=10):
    """Find, for every brand in ``brands_df``, the brands that most often share its sessions.

    Parameters
    ----------
    df : pandas.DataFrame
        Session rows with at least ``session_id`` and ``brand_name``.
    brands_df : pandas.DataFrame
        Frame whose ``order_brand_name`` column lists the brands to analyse.
    top_n : int, default 10
        Number of co-occurring brands to keep per brand.

    Returns
    -------
    pandas.DataFrame
        Columns ``maxed_budget_brand``, ``other_brand``, ``other_brand_rank`` (float,
        ``method='min'`` so ties share a rank) and ``other_brand_count`` (distinct shared
        sessions). Empty (but with these columns) when ``brands_df`` lists no brand.
    """
    output_columns = ['maxed_budget_brand', 'other_brand', 'other_brand_rank', 'other_brand_count']
    per_brand = []

    for brand in brands_df['order_brand_name'].unique():
        # Sessions in which the current brand appears.
        sessions_with_brand = df.loc[df['brand_name'] == brand, 'session_id'].unique()

        # Rows for every other brand that appears in those same sessions.
        co_shown = df[df['session_id'].isin(sessions_with_brand) & (df['brand_name'] != brand)]

        # Distinct shared sessions per other brand, keeping the largest `top_n`.
        counts = (
            co_shown.groupby('brand_name')['session_id']
            .nunique()
            .reset_index(name='other_brand_count')
            .sort_values(by='other_brand_count', ascending=False)
            .head(top_n)
            .rename(columns={'brand_name': 'other_brand'})
        )

        # Rank is computed on the kept rows only, exactly as before.
        counts = counts.assign(
            other_brand_rank=counts['other_brand_count'].rank(method='min', ascending=False),
            maxed_budget_brand=brand,
        )
        per_brand.append(counts[output_columns])

    # Keep the schema stable even when there is nothing to concatenate.
    if not per_brand:
        return pd.DataFrame(columns=output_columns)

    # One concat at the end instead of growing a frame inside the loop.
    return pd.concat(per_brand, ignore_index=True)

# Find the top co-occurring brands for every max-budget brand and preview a random sample.
# NOTE(review): `df_subset` is not defined in any cell visible above; confirm where it
# comes from, otherwise this cell fails on Restart & Run All.
top_brands = find_top_brands(df_subset, brands)
top_brands.sample(10)

Unnamed: 0,maxed_budget_brand,other_brand,other_brand_rank,other_brand_count
73,Sol de Janeiro,LIVELY,4.0,20473
58,Lunya,Dollar Shave Club,9.0,490
97,True Botanicals,Manduka,8.0,123
118,vitruvi,Branch,9.0,48
59,Lunya,Twillory,10.0,469
12,Dreamland Baby,Tubby Todd Bath Co.,3.0,12002
99,True Botanicals,Solly Baby,10.0,102
96,True Botanicals,Nood,7.0,129
39,Kopari,Beekman 1802,10.0,1038
107,Tubby Todd Bath Co.,Tushbaby,8.0,4332


In [20]:
# Persist both ranking tables as CSV (without the pandas index column).
top_brands.to_csv('data/top_brands_output.csv', index=False)
top_publishers.to_csv('data/top_publishers_output.csv', index=False)

## Analyze Post Performance

### Publisher Performance

In [58]:
def find_publisher_performance(brand_name, brand_df, publisher_df):
    """Aggregate March-2024 widget loads, conversions and ad spend on a brand's top publishers,
    split into the period before ('Pre-Max') and after ('Post-Max') the brand hit its max budget.

    Parameters
    ----------
    brand_name : str
        Brand to analyse; looked up in ``brand_df['order_brand_name']``.
    brand_df : pandas.DataFrame
        Max-budget brands with ``order_brand_name`` and ``reach_max_timestamp``
        (``YYYY-MM-DDTHH:MM:SSZ`` strings).
    publisher_df : pandas.DataFrame
        Output of ``find_top_publishers`` (``brand_name``, ``publisher``,
        ``publisher_session_count``).

    Returns
    -------
    pandas.DataFrame
        One row per (``brand``, ``period``) with summed ``widget_loads``, ``conv_count`` and
        ``ad_spend``. A bare empty frame when the brand has no timestamp; an empty frame with
        the output columns when no publisher has at least 100 sessions.

    Notes
    -----
    Runs one query per publisher on the notebook-level ``connection``. Values are passed as
    bind parameters (pyformat), so names containing quotes cannot break the SQL.
    """
    output_columns = ['brand', 'period', 'widget_loads', 'conv_count', 'ad_spend']

    brand_filter = brand_df[brand_df['order_brand_name'] == brand_name]
    if brand_filter.empty:
        print(f"No max budget timestamp found for {brand_name}. Skipping.")
        return pd.DataFrame()

    # Convert the ISO timestamp into 'YYYY-MM-DD HH:MM:SS.mmm' for the SQL comparison.
    max_time = datetime.strptime(brand_filter.iloc[0]['reach_max_timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    time_cutoff = max_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    # Top publishers for the brand, restricted to those with at least 100 sessions.
    top_publishers = publisher_df[publisher_df['brand_name'] == brand_name]
    top_publishers = top_publishers[top_publishers['publisher_session_count'] >= 100]

    # The query text is loop-invariant; only the bound values change per publisher.
    # NOTE(review): the inner join drops a period that has loads but no conversions (or the
    # reverse) for a publisher — confirm that is intended.
    publisher_query = """
        with loads as (
                select 
                    case when dvce_created_tstamp <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
                    count(distinct event_id) as widget_loads
                from event.event e 
                where
                    publisher_name = %(publisher_name)s
                    and to_date(dvce_created_tstamp) < '2024-04-01'
                    and to_date(dvce_created_tstamp) >= '2024-03-01'
                    and event_name = 'widget_display'
                group by all),
                conv as (
                select 
                    case when event_created_at <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
                    count(distinct order_id) as conv_count,
                    sum(billable_amount) as ad_spend
                from curated.ad_spend_revenue
                where 
                    event_brand_name = %(publisher_name)s
                    and to_date(event_created_at) < '2024-04-01'
                    and to_date(event_created_at) >= '2024-03-01'
                group by all
                )
                select *
                from loads
                inner join conv using (period)
                """

    frames = []
    for publisher_name in top_publishers['publisher'].unique():
        df = pd.read_sql(
            publisher_query,
            connection,
            params={'time_cutoff': time_cutoff, 'publisher_name': publisher_name},
        )
        #Print completion message
        print(f'Completed {publisher_name} query')
        df['publisher'] = publisher_name
        df['brand'] = brand_name
        frames.append(df)

    # No qualifying publisher: return an empty result instead of failing in the groupby.
    if not frames:
        return pd.DataFrame(columns=output_columns)

    final_results = pd.concat(frames, ignore_index=True)

    # Snowflake returns upper-case column names; normalise before aggregating.
    final_results.columns = final_results.columns.str.lower()
    output = final_results.groupby(['brand', 'period']).agg({'widget_loads': 'sum', 'conv_count': 'sum', 'ad_spend': 'sum'}).reset_index()
    return output

# blueland = find_publisher_performance('Blueland', brands, top_publishers)

In [None]:
# Run the publisher-performance analysis for every max-budget brand (one Snowflake query
# per brand/publisher pair) and stack the per-brand results into a single frame.
df_output = []
for brand in brands['order_brand_name'].unique():
    df_output.append(find_publisher_performance(brand, brands, top_publishers))
    #Print completion message
    print(f'Completed {brand} - brand query')

final_output = pd.concat(df_output, ignore_index=True)

In [61]:
#compute cvr as conv_count/widget_loads, round to 4 decimal places
final_output['cvr'] = round(final_output['conv_count'] / final_output['widget_loads'], 4)
#compute cpa as ad_spend/conv_count, round to 2 decimal places
final_output['cpa'] = round(final_output['ad_spend'] / final_output['conv_count'], 2)
#compute rpl as ad_spend/widget_loads, round to 3 decimal places
final_output['rpl'] = round(final_output['ad_spend'] / final_output['widget_loads'], 3)
final_output.sample(5)

Unnamed: 0,brand,period,widget_loads,conv_count,ad_spend,cvr,cpa,rpl
4,HATCH,Post-Max,14863,77,1219.0,0.0052,15.83,0.082
0,Blueland,Post-Max,678599,1715,23701.01,0.0025,13.82,0.035
16,Surely Wines,Post-Max,3550,16,261.0,0.0045,16.31,0.074
18,True Botanicals,Post-Max,589489,1127,13355.77,0.0019,11.85,0.023
12,Snuggle Me Organic,Post-Max,173218,937,14932.96,0.0054,15.94,0.086


In [64]:
# Persist the publisher performance table, including the derived metrics, as CSV.
final_output.to_csv('data/publisher_performance_output.csv', index=False)

### Co-Advertiser Performance

In [84]:
def find_coadvertiser_performance(brand_name, brand_df, top_brand_df):
    """Aggregate March-2024 impressions, conversions and ad spend of a brand's co-advertisers,
    split into the period before ('Pre-Max') and after ('Post-Max') the brand hit its max budget.

    Parameters
    ----------
    brand_name : str
        Brand to analyse; looked up in ``brand_df['order_brand_name']``.
    brand_df : pandas.DataFrame
        Max-budget brands with ``order_brand_name`` and ``reach_max_timestamp``
        (``YYYY-MM-DDTHH:MM:SSZ`` strings).
    top_brand_df : pandas.DataFrame
        Output of ``find_top_brands`` (``maxed_budget_brand``, ``other_brand``).

    Returns
    -------
    pandas.DataFrame
        One row per (``brand``, ``period``) with mean ``days`` and summed ``brand_impr``,
        ``conv_count`` and ``ad_spend`` across co-advertisers. A bare empty frame when the
        brand has no timestamp; an empty frame with the output columns when no co-advertiser
        qualifies.

    Notes
    -----
    Runs one query per co-advertiser on the notebook-level ``connection``. Values are passed
    as bind parameters (pyformat), so names containing quotes cannot break the SQL.
    """
    output_columns = ['brand', 'period', 'days', 'brand_impr', 'conv_count', 'ad_spend']

    brand_filter = brand_df[brand_df['order_brand_name'] == brand_name]
    if brand_filter.empty:
        print(f"No max budget timestamp found for {brand_name}. Skipping.")
        return pd.DataFrame()

    # Convert the ISO timestamp into 'YYYY-MM-DD HH:MM:SS.mmm' for the SQL comparison.
    max_time = datetime.strptime(brand_filter.iloc[0]['reach_max_timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    time_cutoff = max_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    # Top co-occurring brands for this brand ...
    top_brands = top_brand_df[top_brand_df['maxed_budget_brand'] == brand_name]
    # ... excluding co-advertisers that are themselves max-budget brands.
    top_brands = top_brands[~top_brands['other_brand'].isin(brand_df['order_brand_name'])]

    # The query text is loop-invariant; only the bound values change per co-advertiser.
    query = """
        with loads as (
        select 
            case when dvce_created_tstamp <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
            count(distinct to_date(dvce_created_tstamp)) as days,
            count(distinct event_id) as brand_impr
        from event.event e 
        where
            brand_name = %(co_advertiser)s
            and to_date(dvce_created_tstamp) < '2024-04-01'
            and to_date(dvce_created_tstamp) >= '2024-03-01'
            and event_name = 'widget_brand_display'
        group by all),
        conv as (
        select 
            case when event_created_at <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
            count(distinct order_id) as conv_count,
            sum(billable_amount) as ad_spend
        from curated.ad_spend_revenue
        where 
            order_brand_name = %(co_advertiser)s
            and to_date(event_created_at) < '2024-04-01'
            and to_date(event_created_at) >= '2024-03-01'
        group by all
        )
        select *
        from loads
        inner join conv using (period)
        ;
        """

    frames = []
    for co_advertiser in top_brands['other_brand'].unique():
        df = pd.read_sql(
            query,
            connection,
            params={'time_cutoff': time_cutoff, 'co_advertiser': co_advertiser},
        )
        df['brand'] = brand_name
        df['co_advertiser'] = co_advertiser
        frames.append(df)

    # No qualifying co-advertiser: return an empty result instead of failing in the groupby.
    if not frames:
        return pd.DataFrame(columns=output_columns)

    final_results = pd.concat(frames, ignore_index=True)

    # Snowflake returns upper-case column names; normalise before aggregating.
    final_results.columns = final_results.columns.str.lower()
    output = final_results.groupby(['brand', 'period']).agg({'days':'mean', 'brand_impr': 'sum', 'conv_count': 'sum', 'ad_spend': 'sum'}).reset_index()
    return output

# blueland = find_publisher_performance('Blueland', brands, top_publishers)

In [85]:
# Spot-check the co-advertiser analysis on a single brand before running it for all brands.
find_coadvertiser_performance('Blueland', brands, top_brands)

Unnamed: 0,brand,period,days,brand_impr,conv_count,ad_spend
0,Blueland,Post-Max,22.0,1403974,3668,52500.0
1,Blueland,Pre-Max,9.0,855646,2083,26396.0


In [87]:
# Run the co-advertiser analysis for every max-budget brand and stack the per-brand
# results into a single frame.
coadvertisers = []
for brand in brands['order_brand_name'].unique():
    coadvertisers.append(find_coadvertiser_performance(brand, brands, top_brands))
    #Print completion message
    print(f'Completed {brand} - brand query')

coadvertiser_output = pd.concat(coadvertisers, ignore_index=True)

Completed Blueland - brand query
Completed Dreamland Baby - brand query
Completed HATCH - brand query
Completed Kopari - brand query
Completed LIVELY - brand query
Completed Lunya - brand query
Completed Snuggle Me Organic - brand query
Completed Sol de Janeiro - brand query
Completed Surely Wines - brand query
Completed True Botanicals - brand query
Completed Tubby Todd Bath Co. - brand query
Completed vitruvi - brand query


In [89]:
# Average impressions per active day (rounded to a whole number).
coadvertiser_output['daily_impr'] = coadvertiser_output['brand_impr'].div(coadvertiser_output['days']).round(0)
# Conversion rate: conversions per brand impression (4 decimals).
coadvertiser_output['cvr'] = coadvertiser_output['conv_count'].div(coadvertiser_output['brand_impr']).round(4)
# Cost per acquisition: ad spend per conversion (2 decimals).
coadvertiser_output['cpa'] = coadvertiser_output['ad_spend'].div(coadvertiser_output['conv_count']).round(2)
# Revenue per impression: ad spend per brand impression (3 decimals).
coadvertiser_output['rpl'] = coadvertiser_output['ad_spend'].div(coadvertiser_output['brand_impr']).round(3)
coadvertiser_output.sample(5)

Unnamed: 0,brand,period,days,brand_impr,conv_count,ad_spend,daily_impr,cvr,cpa,rpl
3,Dreamland Baby,Pre-Max,16.666667,836383,1200,21995.0,50182.98,0.001435,18.329167,0.026298
7,Kopari,Pre-Max,14.875,1470492,2960,35707.0,98856.605042,0.002013,12.063176,0.024282
21,Tubby Todd Bath Co.,Pre-Max,16.0,1172940,1487,27008.0,73308.75,0.001268,18.162744,0.023026
11,Lunya,Pre-Max,21.0,708117,1213,28026.0,33719.857143,0.001713,23.104699,0.039578
8,LIVELY,Post-Max,2.75,74870,74,1698.0,27225.454545,0.000988,22.945946,0.022679


In [90]:
# Persist the co-advertiser performance table, including the derived metrics, as CSV.
coadvertiser_output.to_csv('data/coadvertiser_performance_output.csv', index=False)

## Identify Emergent brands
- For each max-budget brand, go through its top publishers and look for the brands whose daily impressions grew the most (in percentage terms) after the brand reached its max budget.

### Version 1

In [107]:
def find_emergent_brands(brand_name, brand_df, publisher_df):
    """Find brands whose daily impressions on a brand's top publishers grew by at least 50%
    after that brand reached its max budget (March 2024 data).

    Parameters
    ----------
    brand_name : str
        Max-budget brand to analyse; looked up in ``brand_df['order_brand_name']``.
    brand_df : pandas.DataFrame
        Max-budget brands with ``order_brand_name`` and ``reach_max_timestamp``
        (``YYYY-MM-DDTHH:MM:SSZ`` strings).
    publisher_df : pandas.DataFrame
        Output of ``find_top_publishers`` (``brand_name``, ``publisher``,
        ``publisher_session_count``).

    Returns
    -------
    pandas.DataFrame
        One row per (``brand``, ``brand_name``) where ``brand`` is the max-budget brand and
        ``brand_name`` the emergent brand, with pre/post daily impressions, conversions and
        revenue summed over publishers. Other max-budget brands are excluded. A bare empty
        frame when the brand has no timestamp; an empty frame with the output columns when
        no publisher has at least 100 sessions.

    Notes
    -----
    The growth filter (pre > 0, post > 100, growth >= 50%) is applied per publisher inside
    the SQL, before the per-publisher results are summed. Runs one query per publisher on
    the notebook-level ``connection`` using bind parameters (pyformat).
    """
    metric_columns = ['pre_daily_impr', 'post_daily_impr', 'pre_daily_conv',
                      'post_daily_conv', 'pre_daily_rev', 'post_daily_rev']

    brand_filter = brand_df[brand_df['order_brand_name'] == brand_name]
    if brand_filter.empty:
        print(f"No max budget timestamp found for {brand_name}. Skipping.")
        return pd.DataFrame()

    # Convert the ISO timestamp into 'YYYY-MM-DD HH:MM:SS.mmm' for the SQL comparison.
    max_time = datetime.strptime(brand_filter.iloc[0]['reach_max_timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    time_cutoff = max_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    # Top publishers for the brand, restricted to those with at least 100 sessions.
    top_publishers = publisher_df[publisher_df['brand_name'] == brand_name]
    top_publishers = top_publishers[top_publishers['publisher_session_count'] >= 100]

    # The query text is loop-invariant; only the bound values change per publisher.
    query = """
        with loads as (
            select 
                brand_name,
                case when dvce_created_tstamp <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
                count(distinct to_date(dvce_created_tstamp)) as days,
                count(distinct event_id) as brand_impr
            from event.event e 
            where
                publisher_name = %(publisher_name)s
                and to_date(dvce_created_tstamp) < '2024-04-01'
                and to_date(dvce_created_tstamp) >= '2024-03-01'
                and event_name = 'widget_brand_display'
            group by all),
        conv as (
            select 
                order_brand_name as brand_name,
                case when event_created_at <= %(time_cutoff)s then 'Pre-Max' else 'Post-Max' end as period,
                count(distinct order_id) as conv,
                sum(billable_amount) as ad_spend,
            from curated.ad_spend_revenue
            where
                event_brand_name = %(publisher_name)s
                and to_date(event_created_at) < '2024-04-01'
                and to_date(event_created_at) >= '2024-03-01'
                -- and event_name = 'widget_brand_display'
            group by all),
        granular as (
        select 
            brand_name,
            period,
            days,
            brand_impr,
            case when conv is null then 0 else conv end as conv,
            case when ad_spend is null then 0 else ad_spend end as ad_spend 
        from loads l 
        left join conv c using (brand_name, period)
        ),
        pivot as (
        select 
            brand_name,
            -- max(days) as days,
            max(case when period = 'Pre-Max' then round(brand_impr/days,1) else 0 end) as Pre_Daily_Impr,
            max(case when period = 'Post-Max' then round(brand_impr/days,1) else 0 end) as Post_Daily_Impr,
            max(case when period = 'Pre-Max' then round(conv/days,1) else 0 end) as Pre_Daily_Conv,
            max(case when period = 'Post-Max' then round(conv/days,1) else 0 end) as Post_Daily_Conv,
            max(case when period = 'Pre-Max' then round(ad_spend/days,1) else 0 end) as Pre_Daily_Rev,
            max(case when period = 'Post-Max' then round(ad_spend/days,1) else 0 end) as Post_Daily_Rev,
        from granular
        group by brand_name)
        select 
            *
        from pivot p
        where
            pre_daily_impr > 0
            and p.post_daily_impr > 100
            and (post_daily_impr - pre_daily_impr)/nullif(pre_daily_impr,0) >= 0.5
        ;
        """

    frames = []
    for publisher_name in top_publishers['publisher'].unique():
        df = pd.read_sql(
            query,
            connection,
            params={'time_cutoff': time_cutoff, 'publisher_name': publisher_name},
        )
        df['publisher'] = publisher_name
        df['brand'] = brand_name
        frames.append(df)

    # No qualifying publisher: return an empty result instead of failing in the groupby.
    if not frames:
        return pd.DataFrame(columns=['brand', 'brand_name'] + metric_columns)

    final_results = pd.concat(frames, ignore_index=True)

    # Snowflake returns upper-case column names; normalise before aggregating.
    final_results.columns = final_results.columns.str.lower()
    output = (
        final_results.groupby(['brand', 'brand_name'])
        .agg({column: 'sum' for column in metric_columns})
        .reset_index()
    )

    # Drop emergent candidates that are themselves max-budget brands.
    output = output[~output['brand_name'].isin(brand_df['order_brand_name'])]
    return output

In [108]:
# Run the emergent-brand search (version 1) for every max-budget brand, stack the
# per-brand results and persist them as CSV.
# Note: `df_output` is rebound here; it no longer holds the publisher-performance frames.
df_output = []
for brand in brands['order_brand_name'].unique():
    df_output.append(find_emergent_brands(brand, brands, top_publishers))
    #Print completion message
    print(f'Completed {brand} - brand query')

emergent_brands = pd.concat(df_output, ignore_index=True)
emergent_brands.to_csv('data/emergent_brands_output.csv', index=False)

Completed Blueland - brand query
Completed Dreamland Baby - brand query
Completed HATCH - brand query
Completed Kopari - brand query
Completed LIVELY - brand query
Completed Lunya - brand query
Completed Snuggle Me Organic - brand query
Completed Sol de Janeiro - brand query
Completed Surely Wines - brand query
Completed True Botanicals - brand query
Completed Tubby Todd Bath Co. - brand query
Completed vitruvi - brand query


In [111]:
# Preview a random sample of the emergent-brand results.
emergent_brands.sample(5)

Unnamed: 0,brand,brand_name,pre_daily_impr,post_daily_impr,pre_daily_conv,post_daily_conv,pre_daily_rev,post_daily_rev
196,True Botanicals,Beekman 1802,204.5,2502.9,1.5,2.0,15.0,28.3
190,Sol de Janeiro,UNTUCKit,729.7,1719.9,0.5,0.0,16.7,0.9
91,Kopari,Tushbaby,243.4,692.4,0.7,1.1,11.5,19.1
209,Tubby Todd Bath Co.,Bodily,101.1,226.1,0.3,0.6,6.6,14.5
152,Snuggle Me Organic,Monica & Andy Inc.,83.6,136.2,0.3,0.5,7.1,4.5


In [34]:
# List the distinct category strings.
# NOTE(review): `brand_category` is not defined in any cell visible above; confirm where
# it is loaded, otherwise this cell fails on Restart & Run All.
print(brand_category.full_category.unique())

['Food & Beverage, Health & Wellness' 'Baby, Health & Wellness'
 'Fashion & Apparel (Non-Binary/Unisex), Fashion & Apparel (Mens), Fashion & Apparel (Womens)']


### Version 2

#### Find Top Coadvertisers

In [8]:
def find_cobrand_sessions(brand_name, brand_df, publisher_df):
    """Fetch the pre-max-budget sessions of a brand on each of its top publishers, together
    with the first three distinct brands shown in each session.

    Parameters
    ----------
    brand_name : str
        Brand to analyse; looked up in ``brand_df['order_brand_name']``.
    brand_df : pandas.DataFrame
        Max-budget brands with ``order_brand_name`` and ``reach_max_timestamp``
        (``YYYY-MM-DDTHH:MM:SSZ`` strings).
    publisher_df : pandas.DataFrame
        Output of ``find_top_publishers`` (``brand_name``, ``publisher``,
        ``publisher_session_count``).

    Returns
    -------
    pandas.DataFrame
        Columns ``brand``, ``session_id``, ``publisher_name``, ``session_time``,
        ``first_brand``, ``second_brand``, ``third_brand`` (lower-cased), one row per
        session/publisher with at least three distinct brands in display positions <= 2.
        A bare empty frame when the brand has no timestamp or no publisher has at least
        100 sessions.

    Notes
    -----
    Runs one query per publisher on the notebook-level ``connection`` using bind parameters
    (pyformat). The publisher filter is bound to the loop's ``publisher_name``; the previous
    f-string referenced ``{publisher}``, which is not a local of this function.
    """
    brand_filter = brand_df[brand_df['order_brand_name'] == brand_name]
    if brand_filter.empty:
        print(f"No max budget timestamp found for {brand_name}. Skipping.")
        return pd.DataFrame()

    # Convert the ISO timestamp into 'YYYY-MM-DD HH:MM:SS.mmm' for the SQL comparison.
    max_time = datetime.strptime(brand_filter.iloc[0]['reach_max_timestamp'], '%Y-%m-%dT%H:%M:%SZ')
    time_cutoff = max_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    # Top publishers for the brand, restricted to those with at least 100 sessions.
    top_publishers = publisher_df[publisher_df['brand_name'] == brand_name]
    top_publishers = top_publishers[top_publishers['publisher_session_count'] >= 100]

    # The query text is loop-invariant; only the bound values change per publisher.
    query = """
        with 
        relevant_sessions as (
        select 
            distinct session_id
        from event.event 
        where 
            brand_name = %(brand_name)s ---Brand Variable
            and to_date(dvce_created_tstamp) >= '2024-03-01' --- Variable (at some point)
            and dvce_created_tstamp <= %(time_cutoff)s --- Variable
            and publisher_name = %(publisher_name)s --- Variable
        ),
        sessions as (
        select 
            distinct session_id,
            -- widget_type,
            -- ml_model,
            publisher_name,
            dvce_created_tstamp as timestamp,
            brand_name,
        from event.event e 
        where 
            session_id in (select * from relevant_sessions)
            and brand_name is not null
            and vertical_display_position <= 2
        order by session_id, timestamp asc),
        distinct_brands AS (
            SELECT
                session_id,
                publisher_name,
                brand_name,
                MIN(timestamp) AS earliest_timestamp
            FROM sessions
            GROUP BY ALL
        ),
        numbered_brands AS (
            SELECT 
                session_id,
                publisher_name,
                brand_name,
                earliest_timestamp,
                ROW_NUMBER() OVER (PARTITION BY session_id ORDER BY earliest_timestamp) AS brand_order
            FROM distinct_brands
        ),
        sessions_with_min_three_brands AS (
            SELECT 
                session_id
            FROM numbered_brands
            GROUP BY ALL
            HAVING COUNT(DISTINCT brand_name) >= 3
        ),
        aggregated_sessions AS (
            SELECT 
                nb.session_id,
                nb.publisher_name,
                MIN(nb.earliest_timestamp) AS session_time,
                MAX(CASE WHEN nb.brand_order = 1 THEN nb.brand_name END) AS first_brand,
                MAX(CASE WHEN nb.brand_order = 2 THEN nb.brand_name END) AS second_brand,
                MAX(CASE WHEN nb.brand_order = 3 THEN nb.brand_name END) AS third_brand
            FROM numbered_brands nb
            JOIN sessions_with_min_three_brands ON nb.session_id = sessions_with_min_three_brands.session_id
            GROUP BY ALL
        ),
        clean_sessions as (
        SELECT 
            session_id,
            publisher_name,
            session_time,
            first_brand,
            second_brand,
            third_brand
        FROM aggregated_sessions
        ORDER BY session_time)
        select *
        from clean_sessions
        ;
        """

    frames = []
    for publisher_name in top_publishers['publisher'].unique():
        df = pd.read_sql(
            query,
            connection,
            params={
                'brand_name': brand_name,
                'time_cutoff': time_cutoff,
                'publisher_name': publisher_name,
            },
        )
        df['brand'] = brand_name
        frames.append(df)

    # No qualifying publisher: nothing to report for this brand.
    if not frames:
        return pd.DataFrame()

    final_results = pd.concat(frames, ignore_index=True)

    # Move 'brand' (the only column appended above) to the front.
    cols = ['brand'] + [col for col in final_results.columns if col != 'brand']
    final_results = final_results[cols]

    # Snowflake returns upper-case column names; normalise to lowercase.
    final_results.columns = final_results.columns.str.lower()

    return final_results

In [9]:
def find_top_brand_combos(df, target_brand):
    """Count which two brands most often appear alongside ``target_brand``.

    Each row of ``df`` carries ``first_brand``, ``second_brand`` and ``third_brand``. For rows
    that contain ``target_brand``, the two remaining brands form an order-independent pair.

    Returns the five most frequent pairs as a frame with columns ``brand``,
    ``pair_brand_1``, ``pair_brand_2`` and ``event_count``.
    """
    slot_names = ('first_brand', 'second_brand', 'third_brand')
    companion_pairs = []

    for record in df.to_dict('records'):
        shown = [record[slot] for slot in slot_names]

        # Rows without the target brand contribute nothing.
        if target_brand not in shown:
            continue

        # Drop one occurrence of the target; only a clean two-brand remainder counts.
        shown.remove(target_brand)
        if len(shown) != 2:
            continue

        # Sorting makes (A, B) and (B, A) the same pair.
        companion_pairs.append(tuple(sorted(shown)))

    # Tally identical pairs, most frequent first.
    tally = (
        pd.DataFrame(companion_pairs, columns=['pair_brand_1', 'pair_brand_2'])
        .value_counts()
        .reset_index(name='event_count')
    )

    # Label every row with the target brand and put that label first.
    tally['brand'] = target_brand
    leading = ['brand'] + [name for name in tally.columns if name != 'brand']

    return tally[leading].head()

# find_top_brand_combos(df_test, 'Blueland')

In [None]:
# For every max-budget brand: fetch its pre-max sessions per top publisher, count the
# most frequent companion brand pairs, then stack and persist the results.
# NOTE(review): `coadvertiser_output` was a DataFrame in the co-advertiser performance
# section; it is rebound to a list here, so that earlier frame is lost after this cell.
coadvertiser_output = []
for brand in brands['order_brand_name'].unique():
    #Find cobrand sessions
    sessions = find_cobrand_sessions(brand, brands, top_publishers)
    #Find top combos
    coadvertiser_output.append(find_top_brand_combos(sessions, brand))
    #Print completion message
    print(f'Completed {brand} - brand query')

top_coadvertisers = pd.concat(coadvertiser_output, ignore_index=True)
top_coadvertisers.to_csv('data/top_coadvertisers_output.csv', index=False)
top_coadvertisers.sample(5)

In [10]:
# Reload the saved companion-pair table from disk instead of re-running the queries.
top_coadvertisers = pd.read_csv('data/top_coadvertisers_output.csv')

In [98]:
# Display the companion-pair table.
top_coadvertisers

Unnamed: 0,brand,pair_brand_1,pair_brand_2,event_count
0,Blueland,Sol de Janeiro,Viori Beauty,430
1,Blueland,Dropps,Viori Beauty,340
2,Blueland,Beachwaver,Sol de Janeiro,300
3,Blueland,Love in Faith,Viori Beauty,270
4,Blueland,Dropps,Tula Skincare,270
5,HATCH,Dagne Dover,Newton Baby,560
6,HATCH,Dagne Dover,Posh Peanut,220
7,HATCH,Newton Baby,Posh Peanut,210
8,HATCH,Dagne Dover,LIVELY,170
9,HATCH,Newton Baby,Willow Pumps,140


#### Find Emergent Brands

In [91]:
def process_session_brands(df, brands_df=None):
    """Count, per (brand, companion pair), the first other brand listed in each session.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per session with ``session_id``, ``session_brands`` (brand names joined
        by ``"-- "``), ``brand``, ``pair_brand_1`` and ``pair_brand_2``.
    brands_df : pandas.DataFrame, optional
        Max-budget brands (``order_brand_name``) to exclude from the candidates. Defaults
        to the notebook-level ``brands`` frame, as before.

    Returns
    -------
    pandas.DataFrame
        The five largest rows by ``session_count`` with columns ``brand``,
        ``pair_brand_1``, ``pair_brand_2``, ``emergent_brand``, ``session_count``.
        Empty (with these columns) when ``df`` has no rows.

    Notes
    -----
    NOTE(review): only the first brand left after removing the pair is kept per session,
    and the target ``brand`` itself is filtered out only after aggregation. A session whose
    first remaining brand is the target or a max-budget brand therefore contributes no
    candidate. Confirm this is intended.
    """
    output_columns = ['brand', 'pair_brand_1', 'pair_brand_2', 'emergent_brand', 'session_count']

    # A row-wise apply on an empty frame returns a DataFrame, not a Series; short-circuit.
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    if brands_df is None:
        brands_df = brands  # notebook-level max-budget brand list

    def first_remaining_brand(session_brands, pair):
        # Missing/non-string brand lists yield no candidate instead of raising on .split.
        if not isinstance(session_brands, str):
            return None
        # Split on the "-- " delimiter and keep the first brand outside the pair.
        for candidate in session_brands.split("-- "):
            if candidate not in pair:
                return candidate
        return None

    df_copy = df.copy()
    df_copy['remaining_brand_1'] = [
        first_remaining_brand(session_brands, (pair_1, pair_2))
        for session_brands, pair_1, pair_2 in zip(
            df_copy['session_brands'], df_copy['pair_brand_1'], df_copy['pair_brand_2']
        )
    ]

    # Sessions per (brand, pair, candidate); rows without a candidate drop out of the groupby.
    agg = df_copy.groupby(['brand', 'pair_brand_1', 'pair_brand_2', 'remaining_brand_1']).agg({'session_id': 'count'}).reset_index()

    # Exclude max-budget brands, null/blank candidates and the target brand itself.
    keep = (
        ~agg['remaining_brand_1'].isin(brands_df['order_brand_name'])
        & agg['remaining_brand_1'].notnull()
        & (agg['remaining_brand_1'] != '')
        & (agg['remaining_brand_1'] != agg['brand'])
    )
    agg = agg[keep]

    # Rename columns
    agg.columns = output_columns

    return agg.sort_values(by='session_count', ascending=False).head()

In [90]:
def process_session_brands_no_pairs(df, brands_df=None):
    """Count, per brand, the first other brand listed in each session, ignoring which
    companion pair the session belongs to.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per session with ``session_id``, ``session_brands`` (brand names joined
        by ``"-- "``), ``brand``, ``pair_brand_1`` and ``pair_brand_2``.
    brands_df : pandas.DataFrame, optional
        Max-budget brands (``order_brand_name``) to exclude from the candidates. Defaults
        to the notebook-level ``brands`` frame, as before.

    Returns
    -------
    pandas.DataFrame
        The five largest rows by ``session_count`` with columns ``brand``,
        ``emergent_brand``, ``session_count``. Empty (with these columns) when ``df``
        has no rows.

    Notes
    -----
    NOTE(review): only the first brand left after removing the pair is kept per session,
    and the target ``brand`` itself is filtered out only after aggregation. Confirm this
    is intended.
    """
    output_columns = ['brand', 'emergent_brand', 'session_count']

    # A row-wise apply on an empty frame returns a DataFrame, not a Series; short-circuit.
    if df.empty:
        return pd.DataFrame(columns=output_columns)

    if brands_df is None:
        brands_df = brands  # notebook-level max-budget brand list

    def first_remaining_brand(session_brands, pair):
        # Missing/non-string brand lists yield no candidate instead of raising on .split.
        if not isinstance(session_brands, str):
            return None
        # Split on the "-- " delimiter and keep the first brand outside the pair.
        for candidate in session_brands.split("-- "):
            if candidate not in pair:
                return candidate
        return None

    df_copy = df.copy()
    df_copy['remaining_brand_1'] = [
        first_remaining_brand(session_brands, (pair_1, pair_2))
        for session_brands, pair_1, pair_2 in zip(
            df_copy['session_brands'], df_copy['pair_brand_1'], df_copy['pair_brand_2']
        )
    ]

    # Sessions per (brand, candidate); rows without a candidate drop out of the groupby.
    agg = df_copy.groupby(['brand', 'remaining_brand_1']).agg({'session_id': 'count'}).reset_index()

    # Exclude max-budget brands, null/blank candidates and the target brand itself.
    keep = (
        ~agg['remaining_brand_1'].isin(brands_df['order_brand_name'])
        & agg['remaining_brand_1'].notnull()
        & (agg['remaining_brand_1'] != '')
        & (agg['remaining_brand_1'] != agg['brand'])
    )
    agg = agg[keep]

    # Rename columns
    agg.columns = output_columns

    return agg.sort_values(by='session_count', ascending=False).head()

In [86]:
def find_emergent_brands_v2(brand_name, brand_df, cobrands, end_date='2024-04-01'):
    """Find emergent brands seen alongside each co-brand pair of ``brand_name``.

    For every ``(pair_brand_1, pair_brand_2)`` row listed for ``brand_name`` in
    ``cobrands`` (skipping pairs where either member appears in
    ``brand_df['order_brand_name']``), sessions containing both pair brands are
    queried from ``event.event`` and handed to ``process_session_brands``.

    Parameters
    ----------
    brand_name : str
        Brand whose co-brand pairs are examined.
    brand_df : pandas.DataFrame
        Needs ``order_brand_name`` and ``reach_max_timestamp`` (formatted as
        ``%Y-%m-%dT%H:%M:%SZ``). The timestamp of ``brand_name`` is the lower
        bound of the query window.
    cobrands : pandas.DataFrame
        Needs ``brand``, ``pair_brand_1`` and ``pair_brand_2``.
    end_date : str, optional
        Inclusive upper bound applied to ``to_date(dvce_created_tstamp)``.
        Defaults to the previously hard-coded ``'2024-04-01'``.

    Returns
    -------
    pandas.DataFrame
        Concatenated ``process_session_brands`` outputs, one chunk per pair.
        Empty when ``brand_name`` has no timestamp row or no eligible pairs.

    Notes
    -----
    Uses the notebook-level ``connection``. All values are bound as query
    parameters instead of being formatted into the SQL text, so a brand name
    containing an apostrophe can no longer break or alter the statement.
    Assumes the connector's default ``pyformat`` parameter style -- confirm if
    ``paramstyle`` is changed elsewhere.
    """
    # Without a max-budget timestamp there is no query window: bail out early.
    brand_filter = brand_df[brand_df['order_brand_name'] == brand_name]
    if brand_filter.empty:
        print(f"No max budget timestamp found for {brand_name}. Skipping.")
        return pd.DataFrame()

    max_time = datetime.strptime(
        brand_filter.iloc[0]['reach_max_timestamp'], '%Y-%m-%dT%H:%M:%SZ'
    )
    # Trim microseconds to milliseconds: 'YYYY-MM-DD HH:MM:SS.mmm'.
    time_cutoff = max_time.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

    # Pairs for this brand, minus any pair containing a brand listed in brand_df
    # (meaning it also has a max budget).
    cobrand_combos = cobrands.loc[
        cobrands['brand'] == brand_name, ['pair_brand_1', 'pair_brand_2']
    ]
    listed_brands = brand_df['order_brand_name']
    cobrand_combos = cobrand_combos[
        ~cobrand_combos['pair_brand_1'].isin(listed_brands)
        & ~cobrand_combos['pair_brand_2'].isin(listed_brands)
    ]

    # The statement is identical for every pair; only the bound values change.
    query = """
        select
            session_id,
            min(dvce_created_tstamp) as time_stamp,
            LISTAGG(distinct brand_name, '-- ') within group (order by brand_name) as session_brands
        from event.event
        where 
            vertical_display_position <= 2
            and brand_name is not null 
            and dvce_created_tstamp >= %(time_cutoff)s
            and to_date(dvce_created_tstamp) <= %(end_date)s
        group by all 
        having
            count(case when brand_name in (%(brand_1)s) then 1 end) > 0
            and count(case when brand_name in (%(brand_2)s) then 1 end) > 0
        ;
        """

    # Collect per-pair results and concatenate once, rather than growing a
    # DataFrame inside the loop.
    outputs = []
    for _, row in cobrand_combos.iterrows():
        brand_1 = row['pair_brand_1']
        brand_2 = row['pair_brand_2']

        df = pd.read_sql(
            query,
            connection,
            params={
                'time_cutoff': time_cutoff,
                'end_date': end_date,
                'brand_1': brand_1,
                'brand_2': brand_2,
            },
        )
        df['brand'] = brand_name
        df['pair_brand_1'] = brand_1
        df['pair_brand_2'] = brand_2
        # change all column names to lowercase
        df.columns = df.columns.str.lower()
        outputs.append(process_session_brands(df))

    if not outputs:
        # pd.concat raises on an empty list; keep the original empty-frame result.
        return pd.DataFrame()
    return pd.concat(outputs, ignore_index=True)

In [92]:
# Run the emergent-brand search for every brand in `brands` except
# 'Sol de Janeiro', reporting progress as each brand finishes.
newbrand_output = []
for brand in brands['order_brand_name'].unique():
    if brand == 'Sol de Janeiro':
        print(f'Skipping {brand}')
        continue
    newbrand_output.append(find_emergent_brands_v2(brand, brands, top_coadvertisers))
    print(f'Completed {brand} - brand query')

# Stack the per-brand results into a single frame.
new_brands = pd.concat(newbrand_output, ignore_index=True)
# new_brands.to_csv('data/new_brands_output.csv', index=False)
new_brands.sample(5)

Completed Blueland - brand query
Completed Dreamland Baby - brand query
Completed HATCH - brand query
Completed Kopari - brand query
Completed LIVELY - brand query
Completed Lunya - brand query
Completed Snuggle Me Organic - brand query
Skipping Sol de Janeiro
Completed Surely Wines - brand query
Completed True Botanicals - brand query
Completed Tubby Todd Bath Co. - brand query
Completed vitruvi - brand query


Unnamed: 0,brand,pair_brand_1,pair_brand_2,emergent_brand,session_count
59,LIVELY,SolaWave,Vegamour,Canopy,25
26,HATCH,Newton Baby,Posh Peanut,Caden Lane,5
77,Tubby Todd Bath Co.,Love Your Melon,Posh Peanut,Dollar Shave Club,30
30,HATCH,Newton Baby,Willow Pumps,Caden Lane,23
13,Blueland,Dropps,Tula Skincare,ILIA,17


In [93]:
# Collapse duplicate (brand, pair, emergent brand) rows by summing the
# remaining columns.
new_brands_pair = (
    new_brands
    .groupby(['brand', 'pair_brand_1', 'pair_brand_2', 'emergent_brand'], as_index=False)
    .sum()
)
# Create an emergent brand rank column based on session count desc for each brand
# new_brands['emergent_brand_rank'] = new_brands.groupby('brand')['session_count'].rank(method='first', ascending=False)
new_brands_pair.sample(5)

Unnamed: 0,brand,pair_brand_1,pair_brand_2,emergent_brand,session_count
5,Blueland,Dropps,Viori Beauty,For Days,260
47,Kopari,For Days,Gibsonlook,Carve Designs,13
15,HATCH,Dagne Dover,Newton Baby,Caden Lane,4
90,vitruvi,Dagne Dover,UNTUCKit,Caden Lane,11
49,Kopari,For Days,Gibsonlook,Oradina,3866


## Assess Brand Category

### Find Brand Category and Sub Category

In [42]:
# Reload the previously saved outputs from disk, in this order.
# Note that `new_brands` is replaced by the CSV contents here.
new_brands, top_brands, emergent_brands, top_publishers = map(
    pd.read_csv,
    [
        'data/new_brands_output.csv',
        'data/top_brands_output.csv',
        'data/emergent_brands_output.csv',
        'data/top_publishers_output.csv',
    ],
)

In [94]:
# Gather every brand/publisher name referenced by the result tables, in the
# same source order as before, then de-duplicate.
brand_names = [
    name
    for names in (
        brands['order_brand_name'],
        top_brands['other_brand'],
        emergent_brands['brand_name'],
        top_publishers['publisher'],
        new_brands['emergent_brand'],
        new_brands_pair['emergent_brand'],
        new_brands_pair['pair_brand_1'],
        new_brands_pair['pair_brand_2'],
    )
    for name in names.unique()
]

brand_names = list(set(brand_names))
len(brand_names)

122

In [95]:
# Convert brand_names to a string that can be inserted as a list into a SQL
# query's 'in' clause. Backslashes and single quotes are escaped so that a
# brand name containing either cannot end its string literal early and break
# (or change the meaning of) the query.
brand_list = ', '.join(
    "'" + str(brand).replace("\\", "\\\\").replace("'", "''") + "'"
    for brand in brand_names
)
# print(brand_list)

query = f"""
select 
    cb.name as brand,
    cc.name as category,
from postgres.core_brand cb 
left join postgres.core_brand_categories cbc 
    on cb.core_brand_id = cbc.brand_id
left join postgres.core_category cc
    on cc.core_category_id = cbc.category_id
where 
    active = True 
    and cb.name in ({brand_list})
"""
brand_categories = pd.read_sql(query, connection)
brand_categories.columns = brand_categories.columns.str.lower()

# One row per brand: the first category returned for it, plus all of its
# categories joined into a single comma-separated string.
brand_category = brand_categories.groupby('brand').agg(
    primary_category=('category', 'first'),
    full_category=('category', lambda x: ', '.join(map(str, x)))
).reset_index()

#### Create Embeddings

In [96]:
import os
from openai import OpenAI

# client = openai.OpenAI()
# Load environment variables from the .env file, then build the OpenAI client
# from OPENAI_KEY. NOTE(review): the .env path is an absolute, user-specific
# path, so this cell only works on that machine as written.
load_dotenv('/Users/peter/.env')
client = OpenAI(api_key=os.environ.get("OPENAI_KEY"))

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the embeddings endpoint.

    Newlines in ``text`` are replaced with spaces before the request is made
    through the notebook-level ``client``. ``model`` names the embedding model.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# Embed both category strings for every brand: the primary category and the
# full comma-separated category list. One embeddings request is made per row
# and per column.
for target_col, source_col in (
    ('category_embed', 'primary_category'),
    ('full_category_embed', 'full_category'),
):
    brand_category[target_col] = brand_category[source_col].apply(
        lambda text: get_embedding(text, model='text-embedding-ada-002')
    )

# brand_category.to_csv('data/brand_category_embedding_map.csv', index=False)

In [97]:
# Preview five random rows of the brand -> category embedding table.
# NOTE(review): `brand_category_embeddings` is not assigned in the surrounding
# cells (the cell above builds `brand_category`); presumably it is loaded from
# the saved embedding map elsewhere -- confirm so the notebook runs top to bottom.
brand_category_embeddings.sample(5)

Unnamed: 0,brand,primary_category,full_category,category_embed,full_category_embed
92,Reel Paper,Home & Kitchen,Home & Kitchen,"[0.009597312659025192, 0.012385941110551357, -...","[0.009597312659025192, 0.012385941110551357, -..."
100,SPONGELLE,Body & Beauty (Womens),"Body & Beauty (Womens), Body & Beauty (Mens), ...","[-0.01728791370987892, -0.0016343615716323256,...","[-0.009983417578041553, -0.010922878049314022,..."
74,MANSCAPED,Body & Beauty (Mens),Body & Beauty (Mens),"[-0.008707517758011818, -0.008376936428248882,...","[-0.008707517758011818, -0.008376936428248882,..."
82,Once Upon a Farm,Kid,"Kid, Baby","[0.023553989827632904, -0.011848241090774536, ...","[-0.00619884068146348, -0.002152020111680031, ..."
103,Surely Wines,Food & Beverage,Food & Beverage,"[0.015231032855808735, -0.005232362076640129, ...","[0.015231032855808735, -0.005232362076640129, ..."


### Find Distance of Emergent Brands' Categories

In [60]:
# Attach category embeddings to new_brands twice: first for the row's 'brand',
# then for its 'emergent_brand' (those columns get the '_emergent' suffix).
# The second merge also brings in the lookup's own 'brand' key as
# 'brand_emergent', which is dropped again.
newbrand_embed = (
    new_brands
    .merge(
        brand_category_embeddings[['brand', 'category_embed', 'full_category_embed']],
        left_on='brand',
        right_on='brand',
        how='left',
    )
    .merge(
        brand_category_embeddings[['brand', 'category_embed', 'full_category_embed']],
        left_on='emergent_brand',
        right_on='brand',
        how='left',
        suffixes=('', '_emergent'),
    )
    .drop(columns=['brand_emergent'])
)
newbrand_embed.sample(5)

Unnamed: 0,brand,emergent_brand,session_count,category_embed,full_category_embed,category_embed_emergent,full_category_embed_emergent
81,Tubby Todd Bath Co.,Ancient Nutrition,11,"[0.023553989827632904, -0.011848241090774536, ...","[-0.00619884068146348, -0.002152020111680031, ...","[0.015231032855808735, -0.005232362076640129, ...","[0.02081373892724514, -0.00243279617279768, -0..."
31,HATCH,Nanit,17,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.005679960362613201, 0.002076472155749798, ...","[0.005692871753126383, -0.018332626670598984, ...","[-0.0008204608457162976, -0.00582684762775898,..."
29,HATCH,Tushbaby,2,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.005679960362613201, 0.002076472155749798, ...","[0.005692871753126383, -0.018332626670598984, ...","[0.005692871753126383, -0.018332626670598984, ..."
23,HATCH,Lalo,47,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.005679960362613201, 0.002076472155749798, ...","[0.005692871753126383, -0.018332626670598984, ...","[0.005740483291447163, -0.018355796113610268, ..."
66,Snuggle Me Organic,UNTUCKit,5,"[0.005692871753126383, -0.018332626670598984, ...","[0.005692871753126383, -0.018332626670598984, ...","[-0.00217951787635684, -0.008492489345371723, ...","[-0.00217951787635684, -0.008492489345371723, ..."


In [64]:
from scipy.spatial.distance import cosine
import ast  # For safely evaluating strings that represent lists

def parse_embedding(embedding_str):
    """Return an embedding as a Python object, parsing it if given as a string.

    Parameters
    ----------
    embedding_str : str or any
        Either the string form of a list (e.g. as read back from a CSV) or an
        already-parsed value.

    Returns
    -------
    The literal-evaluated value for string input, ``[]`` when the string cannot
    be parsed, or the input unchanged when it is not a string.
    """
    if not isinstance(embedding_str, str):
        return embedding_str
    try:
        return ast.literal_eval(embedding_str)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError (not ValueError) for truncated or
        # otherwise malformed text such as "[0.1, 0.2"; treat both the same.
        return []

# Cosine similarity (1 - cosine distance) between the brand's embedding and the
# emergent brand's embedding, computed row by row for both embedding flavours.
for similarity_col, own_col, emergent_col in (
    ('category_similarity', 'category_embed', 'category_embed_emergent'),
    ('full_category_similarity', 'full_category_embed', 'full_category_embed_emergent'),
):
    newbrand_embed[similarity_col] = newbrand_embed.apply(
        lambda row: 1 - cosine(
            parse_embedding(row[own_col]),
            parse_embedding(row[emergent_col]),
        ),
        axis=1,
    )

newbrand_embed.sample(5)

Unnamed: 0,brand,emergent_brand,session_count,category_embed,full_category_embed,category_embed_emergent,full_category_embed_emergent,category_similarity,full_category_similarity
59,LIVELY,Canopy,25,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.01223513763397932, -0.0004307856725063175,...","[0.010733497329056263, 0.01849176734685898, 0....","[0.02026437036693096, 0.003061452182009816, -0...",0.812589,0.821888
30,HATCH,Caden Lane,23,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.005679960362613201, 0.002076472155749798, ...","[0.023553989827632904, -0.011848241090774536, ...","[0.007945980876684189, 1.2648578376683872e-05,...",0.740109,0.76206
88,vitruvi,SPONGELLE,11,"[0.009597312659025192, 0.012385941110551357, -...","[0.009597312659025192, 0.012385941110551357, -...","[-0.01728791370987892, -0.0016343615716323256,...","[-0.009983417578041553, -0.010922878049314022,...",0.81779,0.804946
43,Kopari,Beekman 1802,2771,"[-0.014363016933202744, -0.012803830206394196,...","[-0.01167981419712305, -0.019516976550221443, ...","[-0.00954959075897932, -0.0013523079687729478,...","[-0.009724291041493416, -0.00884086824953556, ...",0.892614,0.942933
56,LIVELY,Hismile,257,"[-0.01223513763397932, -0.0004307856725063175,...","[-0.01223513763397932, -0.0004307856725063175,...","[-0.00954959075897932, -0.0013523079687729478,...","[-0.01236685924232006, -0.008194033056497574, ...",0.819253,0.831618


In [80]:
# Bring the two similarity scores back onto new_brands, matching rows on the
# (brand, emergent_brand) pair.
new_brands = new_brands.merge(
    newbrand_embed[
        ['brand', 'emergent_brand', 'category_similarity', 'full_category_similarity']
    ],
    on=['brand', 'emergent_brand'],
    how='left',
)
# new_brands.to_csv('data/new_brands_output.csv', index=False)
new_brands.sample(5)

### Assess By Brand

In [82]:
# Load the saved brand / emergent-brand similarity table from CSV and preview
# five random rows.
brands_df = pd.read_csv('data/new_brands_output_sheets.csv')
brands_df.sample(5)

Unnamed: 0,brand,emergent_brand,session_count,category_similarity,full_category_similarity
3,Blueland,Glamnetic,259,0.82,0.82
10,Blueland,Beachwaver,55,0.82,0.81
20,Kopari,Caraway,2833,0.81,0.8
18,Kopari,Gibsonlook,2833,0.86,0.86
32,LIVELY,D.S. & Durga,52,0.9,0.87


In [84]:
def _weighted_mean(values, weights):
    """Weighted mean of ``values``, using only rows where value and weight are both present."""
    usable = values.notna() & weights.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        # No usable weight: the average is undefined rather than inf or an error.
        return float('nan')
    return (values[usable] * weights[usable]).sum() / total_weight


def calculate_weighted_averages_and_correlations(brands_df):
    """Summarise emergent-brand similarity per brand.

    Parameters
    ----------
    brands_df : pandas.DataFrame
        Needs ``brand``, ``emergent_brand``, ``session_count``,
        ``category_similarity`` and ``full_category_similarity``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``brand`` (in order of first appearance) with:
        the number of distinct emergent brands, the total session count, the
        session-count-weighted average of each similarity, and the correlation
        between session count and each similarity. Averages and correlations
        are rounded to two decimals.

    Notes
    -----
    Rows with a missing similarity are left out of both the numerator and the
    denominator of that similarity's weighted average. Previously their
    session counts still inflated the denominator, pulling the average toward
    zero. ``total_session_count`` still includes every row.
    """
    results = []

    for brand in brands_df['brand'].unique():
        df_filtered = brands_df[brands_df['brand'] == brand]
        session_counts = df_filtered['session_count']

        results.append({
            'brand': brand,
            'num_emergent_brands': df_filtered['emergent_brand'].nunique(),
            'total_session_count': session_counts.sum(),
            'weighted_avg_category_similarity': round(
                _weighted_mean(df_filtered['category_similarity'], session_counts), 2
            ),
            'weighted_avg_full_category_similarity': round(
                _weighted_mean(df_filtered['full_category_similarity'], session_counts), 2
            ),
            # .corr() returns a 2x2 matrix; [0, 1] is the off-diagonal value.
            'category_similarity_correlation': round(
                df_filtered[['session_count', 'category_similarity']].corr().iloc[0, 1], 2
            ),
            'full_category_similarity_correlation': round(
                df_filtered[['session_count', 'full_category_similarity']].corr().iloc[0, 1], 2
            ),
        })

    return pd.DataFrame(results)

# Per-brand summary of similarity vs. session count.
# NOTE(review): this passes `new_brands`, not the `brands_df` frame loaded from
# 'data/new_brands_output_sheets.csv' two cells earlier -- confirm which input is intended.
calculate_weighted_averages_and_correlations(new_brands)

Unnamed: 0,brand,num_emergent_brands,total_session_count,weighted_avg_category_similarity,weighted_avg_full_category_similarity,category_similarity_correlation,full_category_similarity_correlation
0,Blueland,10,20878,0.82,0.81,-0.03,-0.18
1,HATCH,9,17783,0.76,0.8,0.65,0.37
2,Kopari,9,775103,0.85,0.87,-0.13,-0.04
3,LIVELY,9,10501,0.83,0.84,-0.68,-0.67
4,Lunya,5,17922,0.87,0.87,-0.91,-0.94
5,Snuggle Me Organic,9,1050,0.9,0.84,0.08,0.59
6,Tubby Todd Bath Co.,8,16288,0.78,0.8,-0.15,-0.17
7,vitruvi,8,1508,0.78,0.8,-0.27,-0.37


In [100]:
# URLs to look up, one per line.
urls = """
https://www.dropps.com/2684219/checkouts/0df78119612874a18f8677527835e6cc/thank_you
https://www.crownaffair.com/checkouts/cn/247d4d5aa36c3a3566b5d933d5cd5a11/thank_you
https://www.ettitude.com/checkouts/cn/1e7417f7bad3b615a5ab6f8ab09f4e73/thank_you
https://www.morphe.com/checkouts/co/3a992fa3d91cd8bae0252697298bdef8/thank_you
https://shop.hedleyandbennett.com/1870608/orders/ca7fed88277b5084517c4b658914ff3c
https://www.brightcellars.com/checkouts/cn/Z2NwLXVzLWVhc3QxOjAxSFQ1VlpaVjc4SjA0RlhBR1ZKUEIzVFNG/thank-you
https://dryftsleep.com/checkouts/cn/2f7b8ea10a0adc0ef5ddc83084752c30/thank_you
https://foodhuggers.com/3079937/checkouts/eeddf30f351e16cec7debf7836a744eb/thank_you
https://fnxfit.com/checkouts/c/4c4a45c4d08f6f5a98e3c1acd00bbfe3/thank_you
https://store.dsanddurga.com/checkouts/cn/65cd3e2cc7df31928de79757d44a45d4/thank_you
"""

# Format urls so that it can be inserted into a SQL query 'in' clause: every
# newline becomes the separator "', '", then the final four characters (the
# trailing separator) are cut off. The result still starts with "', '" and has
# no closing quote, so it is presumably meant to be wrapped as ('{urls}') --
# confirm at the point of use.
urls = "', '".join(urls.split('\n'))[:-4]
print(urls)

"', 'https://www.dropps.com/2684219/checkouts/0df78119612874a18f8677527835e6cc/thank_you', 'https://www.crownaffair.com/checkouts/cn/247d4d5aa36c3a3566b5d933d5cd5a11/thank_you', 'https://www.ettitude.com/checkouts/cn/1e7417f7bad3b615a5ab6f8ab09f4e73/thank_you', 'https://www.morphe.com/checkouts/co/3a992fa3d91cd8bae0252697298bdef8/thank_you', 'https://shop.hedleyandbennett.com/1870608/orders/ca7fed88277b5084517c4b658914ff3c', 'https://www.brightcellars.com/checkouts/cn/Z2NwLXVzLWVhc3QxOjAxSFQ1VlpaVjc4SjA0RlhBR1ZKUEIzVFNG/thank-you', 'https://dryftsleep.com/checkouts/cn/2f7b8ea10a0adc0ef5ddc83084752c30/thank_you', 'https://foodhuggers.com/3079937/checkouts/eeddf30f351e16cec7debf7836a744eb/thank_you', 'https://fnxfit.com/checkouts/c/4c4a45c4d08f6f5a98e3c1acd00bbfe3/thank_you', 'https://store.dsanddurga.com/checkouts/cn/65cd3e2cc7df31928de79757d44a45d4/thank_you"