In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import ast
from scipy.stats import beta
from snowflake import connector
import pandas as pd

from dotenv import load_dotenv
from tabulate import tabulate
import openai

## Ingest and Process Data

In [10]:
df = pd.read_csv('data/products.csv')
#change all columns to lowercase
df.columns = map(str.lower, df.columns)
df.sample(5)

Unnamed: 0,event_brand_name,remote_order_id,session_id,pub_date,event_line_items,event_id,conv_date,conv_order_id,conv_brand_name,conv_line_items
20827,Lalo,4138977984576,8f92d7a3-4221-4875-9be2-1de5ce3a0fb4,2024-03-13,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",4f0dbf58-bfa0-45df-ac48-0185c378c0bb,2024-03-16,5202958712971,Nanit,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
10350,ILIA,5434766655657,46afedc6-f491-4e80-855e-f32e1f0d1689,2024-03-04,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",55c3862d-64d7-407c-a2ad-d1b67cea35e0,2024-03-09,5384084553830,Sol de Janeiro,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
19660,Monica & Andy Inc.,6002202214462,87ae212d-6f26-4643-8c74-50428400ddd0,2024-01-05,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",f5282dca-9bc7-4379-ba92-043768e4f479,2024-01-12,5390977335490,Tubby Todd Bath Co.,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
31621,August Uncommon Tea,5710096761127,d9c0660f-4ad9-4ffd-b3bc-e1cdb46b7fb0,2024-01-15,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",ede354ac-3c83-4610-9db1-0bb9f0698a81,2024-01-16,5366134014123,Pretzels.com,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
24928,Laura Geller,5670798917693,ac001e52-c866-418e-95fb-af7ffbc9c50f,2024-02-15,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",82965b9e-fab7-4f43-bbc2-40f1492a9180,2024-02-25,5639432208451,Love in Faith,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."


In [12]:
def parse_products_sales(s):
    # Initialize empty lists to hold products and sales
    product_names = []
    sales = []
    
    # Check if the input is a string. If not, return empty lists.
    if not isinstance(s, str):
        return product_names, sales
    
    try:
        # Convert the string representation of the list of dicts to an actual list of dicts
        products_list = ast.literal_eval(s.replace('\n', '').replace('null', 'None'))
        
        # Extract product names and calculate sales
        product_names = [product['product_name'] for product in products_list]
        sales = [float(product['quantity']) * (float(product['price']) - float(product['discount_amount'])) for product in products_list]
        
        return product_names, sales
    except (ValueError, SyntaxError) as e:
        # If there's an error in converting, print the error and return empty lists
        print(f"Error parsing row: {e}")
        return product_names, sales

In [15]:
df[['pub_products', 'pub_sales']] = df.apply(lambda row: parse_products_sales(row['event_line_items']), axis=1, result_type='expand')
df[['conv_products', 'conv_sales']] = df.apply(lambda row: parse_products_sales(row['conv_line_items']), axis=1, result_type='expand')
df.sample(5)

Unnamed: 0,event_brand_name,remote_order_id,session_id,pub_date,event_line_items,event_id,conv_date,conv_order_id,conv_brand_name,conv_line_items,pub_products,pub_sales,conv_products,conv_sales
9066,Bask Suncare,5538448736498,3e3a16bc-1257-4cc4-8b6b-9fd069c51561,2024-03-08,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",274ed6a9-db88-41ff-bf1c-2f66353b3163,2024-03-14,5287299088561,HATCH,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...","[Route Package Protection - $3.55, SPF 50 Six-...","[3.55, 115.2]",[The Essential Nursing And Pumping Bra - Black...,[78.0]
5742,Lalo,4127745081408,27c5abac-d8e9-4968-a905-abf526bab5ea,2024-02-09,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",250f5c50-5b76-4077-b1d3-fce4e25a6d4b,2024-03-17,5291716673713,HATCH,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...","[The Play Table - Natural, Play Legs - Blueber...","[170.0, 40.0, 40.0]",[The Boyfriend Maternity Jean - Destroyed Ligh...,"[198.0, 198.0, 128.0, 218.0, 88.0, 98.0]"
21427,Laura Geller,5583386345533,939e615a-0a0c-4f99-a94b-e44b46844477,2024-01-05,,44f3b1e5-3415-454b-8d99-860720841b39,2024-01-05,5282994815078,Sol de Janeiro,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",[],[],"[Delícia Drench™ Body Butter Sample, Brazilian...","[0.0, 22.0, 0.0, 38.0]"
16815,Caden Lane,5227258806332,73a0cd97-1689-4663-8fc3-1a8bba16927a,2024-03-02,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",5e281451-89e4-40aa-a61a-ad62cdfe61fc,2024-03-02,5451605573826,Coterie,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",[Henlee's Hydrangea Newborn Baby Knot Gown & H...,"[36.0, 48.0]",[The Newborn Gift],[125.0]
12365,Doggy Do Good,5641372369212,54d18f36-3a25-48fe-83b9-127d104f1087,2024-01-24,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",705523cc-9f2d-4f74-9c52-dfd987592b7a,2024-02-07,4860557197464,For Days,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",[Refill Pack],[35.99],"[Laundry Detergent Sheets - Unscented, Take Ba...","[19.99, 20.0]"


## Find Top Product Combos

In [36]:
def analyze_conversion_products_sales(df):
    expanded_data = []
    for idx, row in df.iterrows():
        for product, sale in zip(row['pub_products'], row['pub_sales']):
            expanded_data.append({
                'publisher_brand_name': row['event_brand_name'],
                'publisher_product': product,
                'sale': sale,
                'conv_brand_name': row['conv_brand_name'],
                'conv_products': row['conv_products'],
                'conv_sales': row['conv_sales']
            })

    expanded_df = pd.DataFrame(expanded_data)
    
    # Aggregate sales by publisher product and sort
    pub_products_sales = expanded_df.groupby(['publisher_brand_name', 'publisher_product'])['sale'].sum().reset_index()
    pub_products_sales_sorted = pub_products_sales.sort_values(by='sale', ascending=False)
    
    # For each publisher product, identify the top 5 conversion products by sales
    top_conversion_products = {}
    for _, row in pub_products_sales_sorted.iterrows():
        publisher_product = row['publisher_product']
        relevant_rows = expanded_df[expanded_df['publisher_product'] == publisher_product]
        conversion_sales = {}
        for _, conv_row in relevant_rows.iterrows():
            for conv_product, conv_sale, conv_brand_name in zip(conv_row['conv_products'], conv_row['conv_sales'], [conv_row['conv_brand_name']] * len(conv_row['conv_products'])):
                conversion_sales[f"{conv_brand_name} - {conv_product}"] = conversion_sales.get(f"{conv_brand_name} - {conv_product}", 0) + conv_sale
        
        # Sort conversion products by sales and take the top 5
        sorted_conv_sales = sorted(conversion_sales.items(), key=lambda x: x[1], reverse=True)[:5]
        top_conversion_products[publisher_product] = sorted_conv_sales
    
    # Prepare the final DataFrame
    final_data = []
    for pub_product, conv_products in top_conversion_products.items():
        row = {
            'publisher_brand_name': expanded_df[expanded_df['publisher_product'] == pub_product]['publisher_brand_name'].iloc[0],
            'publisher_product': pub_product
        }
        for i, (conv_product_sale, sale) in enumerate(conv_products, 1):
            conv_product, conv_brand_name = conv_product_sale.split(' - ', 1)
            row[f'top_{i}_conv_brand_name'] = conv_product
            row[f'top_{i}_conv_product'] = conv_brand_name
            row[f'top_{i}_conv_sale'] = sale
        final_data.append(row)

    return pd.DataFrame(final_data)


In [38]:
top_product_sales = analyze_conversion_products_sales(df)
top_product_sales.head()

Unnamed: 0,publisher_brand_name,publisher_product,top_1_conv_brand_name,top_1_conv_product,top_1_conv_sale,top_2_conv_brand_name,top_2_conv_product,top_2_conv_sale,top_3_conv_brand_name,top_3_conv_product,top_3_conv_sale,top_4_conv_brand_name,top_4_conv_product,top_4_conv_sale,top_5_conv_brand_name,top_5_conv_product,top_5_conv_sale
0,Newton Baby,Crib Mattress - Standard / White,Nanit,Nanit Pro Camera - Wall Mount / White,1495.0,Nanit,Nanit Pro Camera - Floor Stand / White,399.0,Nanit,Best Sellers Bundle - Floor Stand / Green,383.0,Nanit,The Best-Selling Camera Bundle - Wall Mount / ...,303.0,Dagne Dover,"Indi Diaper Backpack in Dark Moss, Large",215.0
1,Newton Baby,Waterproof Crib Mattress - Standard / White,Nanit,Nanit Pro Camera - Floor Stand / White,798.0,Nanit,Nanit Pro Camera Multi-Pack - Twin / Floor Stand,679.0,Nanit,Nanit Pro Camera - Floor Stand,399.0,Nanit,The Best-Selling Camera Bundle - Floor Stand /...,383.0,Nanit,First Year Bundle - Floor Stand / Pink Sunburst,383.0
2,Nanit,Nanit Pro Camera - Floor Stand / White,Newton Baby,Waterproof Crib Mattress - Standard / White,1049.97,Newton Baby,Crib Mattress - Standard / White,899.97,Lalo,The Chair - Coconut,470.0,Newton Baby,Travel Crib & Play Yard - Play Yard,449.98,Newton Baby,Waterproof Crib Mattress - Standard / Grey,349.99
3,Nood,The Flasher™ 2.0 - Matte Black,Koio,Mello in Summit Regenerative - 10 (US) / 43 (EU),395.0,Sunday Citizen,Premium Bamboo Bubble Comforter - Taupe / King...,290.0,Sol de Janeiro,Biggie Biggie Beija Flor™ Elasti-Cream,255.0,Ursa Major,Golden Hour Recovery Cream - 1.57 FL OZ,156.0,Vegamour,GRO+ Advanced Hair Serum (3 Pack) - Subscription,149.0
4,Willow Pumps,Willow Go™ Wearable Breast Pump - 21 & 24mm,Nanit,The Best-Selling Camera Bundle - Floor Stand /...,359.0,Newton Baby,Waterproof Crib Mattress - Standard / White,349.99,Dagne Dover,"Indi Diaper Backpack in Dark Moss, Large",182.75,HATCH,The Mackenzie Sweater Dress - Black / 1,147.6,M.M.LaFleur,Brodie Top - Airy Stripe Knit :: Meringue - M,135.0


In [50]:
top_product_sales.loc[top_product_sales['publisher_brand_name'] == 'Caraway'].head(25)

Unnamed: 0,publisher_brand_name,publisher_product,top_1_conv_brand_name,top_1_conv_product,top_1_conv_sale,top_2_conv_brand_name,top_2_conv_product,top_2_conv_sale,top_3_conv_brand_name,top_3_conv_product,top_3_conv_sale,top_4_conv_brand_name,top_4_conv_product,top_4_conv_sale,top_5_conv_brand_name,top_5_conv_product,top_5_conv_sale
52,Caraway,Cookware Set - Cream,Branch,Verve Chair - Mist / Standard,599.0,Afloral,Faux Potted Palm Tree Plant in Cement Planter ...,280.0,Lalo,Giant Coloring Sheets - Dinosaurs / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Food / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Safari / 3 Pack,130.0
140,Caraway,Food Storage Set - Cream,Dagne Dover,"Indi Diaper Backpack in Camel, Large",215.0,Dagne Dover,"Indi Diaper Backpack in Onyx, Medium",195.0,Lalo,Giant Coloring Sheets - Dinosaurs / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Food / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Safari / 3 Pack,130.0
161,Caraway,Cutting Board Set - FSC Wood,vitruvi,Best Sleep Bundle - Darks,334.58,Lunya,Soft Modal Rib Tee Short Set - Immersed Black ...,128.0,Lalo,The Play Tent - Natural,125.0,Evereden,Kids Happy Hair Duo - 2-piece set + Free hair ...,108.0,Evereden,Kids Happy Hair Day - 3-piece set,79.0
233,Caraway,Cookware Set - Gray,True Classic,Classic V-Neck 12-Pack - M,99.99,jane iredale,PurePressed® Base Mineral Foundation REFILL SP...,92.0,Caden Lane,Perfectly Pink Personalized Color Blanket - Th...,88.0,Caden Lane,Berry Happy Convertible Zip Romper - 6-12 months,36.0,Caden Lane,Weslie's Wildflower Convertible Zip Romper - 6...,36.0
347,Caraway,Minis Duo - Cream,Branch,Verve Chair - Mist / Standard,599.0,Afloral,Faux Potted Palm Tree Plant in Cement Planter ...,280.0,Lalo,Giant Coloring Sheets - Dinosaurs / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Food / 3 Pack,130.0,Lalo,Giant Coloring Sheets - Safari / 3 Pack,130.0
473,Caraway,Half Bakeware Set - Cream,Lalo,The Play Tent - Natural,125.0,Caden Lane,Millie's Dusty Rose Garden Personalized Toddle...,68.0,Caden Lane,Millie's Floral Personalized Newborn Baby Knot...,56.0,Caden Lane,Millie's Floral Personalized Newborn Baby Knot...,56.0,Lalo,Utensils - Oatmeal / 4 Pack,42.5
666,Caraway,Bakeware Set - Navy,vitruvi,Best Sleep Bundle - Darks,334.58,ILIA,Larch 15C,48.0,ILIA,Mythic,28.0,ILIA,After Midnight,28.0,ILIA,Free Mini Fullest Volumizing Mascara,0.0
667,Caraway,Bakeware Set - Gray,Lunya,Cozy Cotton Silk Pocket Henley - Speckled Tins...,218.0,Caden Lane,Perfectly Pink Personalized Color Blanket - Th...,88.0,Caden Lane,Package Protection - $1.35,1.35,,,,,,
669,Caraway,Bakeware Set - Sage,Caden Lane,Out Of This World | Personalized Kids Blanket,68.0,For Days,Take Back Bag - L / BLACK,60.0,Caden Lane,Baby Blues Personalized Knotted Hat - Sapphire...,22.0,Caden Lane,Package Protection - $1.95,1.95,,,
670,Caraway,Cookware Set - Sage,Caden Lane,Out Of This World | Personalized Kids Blanket,68.0,Monica & Andy Inc.,Double Ruffle Let's Dance Dress_Red Hearts - R...,38.0,Caden Lane,Baby Blues Personalized Knotted Hat - Sapphire...,22.0,Caden Lane,Package Protection - $1.95,1.95,,,


In [46]:
def analyze_conversion_products_frequency(df):
    expanded_data = []
    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        for product, sale in zip(row['pub_products'], row['pub_sales']):
            # Extract the corresponding conversion brand names and products
            conv_brands_products = list(zip(row['conv_brand_name'], row['conv_products']))
            # Collect conversion data for this publisher product
            conv_data = []
            for conv_brand, conv_product in conv_brands_products:
                conv_data.append({'conv_brand': conv_brand, 'conv_product': conv_product})

            # Add data to the expanded list
            expanded_data.append({
                'publisher_brand_name': row['event_brand_name'],
                'publisher_product': product,
                'sale': sale,
                'conv_data': conv_data
            })
    
    expanded_df = pd.DataFrame(expanded_data)
    
    # Count frequency of each publisher product
    pub_product_counts = expanded_df.groupby(['publisher_brand_name', 'publisher_product']).size().reset_index(name='frequency')
    pub_product_counts_sorted = pub_product_counts.sort_values(by='frequency', ascending=False)
    
    # For each publisher product, count frequency of each associated conversion product
    top_conversion_products = {}
    for _, row in pub_product_counts_sorted.iterrows():
        publisher_product = row['publisher_product']
        relevant_rows = expanded_df[expanded_df['publisher_product'] == publisher_product]
        conversion_product_counts = {}
        for _, conv_row in relevant_rows.iterrows():
            for conv_product, conv_brand_name in zip(conv_row['conv_products']), conv_row['conv_brand_name']:
                key = f"{conv_brand_name} - {conv_product}"
                conversion_product_counts[key] = conversion_product_counts.get(key, 0) + 1
        
        # Sort conversion products by their frequency and take the top 5
        sorted_conv_product_counts = sorted(conversion_product_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        top_conversion_products[publisher_product] = sorted_conv_product_counts
    
    # Prepare the final DataFrame
    final_data = []
    for pub_product, conv_products in top_conversion_products.items():
        row = {
            'publisher_brand_name': expanded_df[expanded_df['publisher_product'] == pub_product]['publisher_brand_name'].iloc[0],
            'publisher_product': pub_product
        }
        for i, (conv_product_count, count) in enumerate(conv_products, 1):
            conv_brand_name, conv_product = conv_product_count.split(' - ', 1)
            row[f'top_{i}_conv_brand_name'] = conv_brand_name
            row[f'top_{i}_conv_product'] = conv_product
            row[f'top_{i}_conv_frequency'] = count
        final_data.append(row)

    return pd.DataFrame(final_data)


In [47]:
analyze_conversion_products_frequency(df.head(50))

KeyError: 'conv_products'

In [None]:
### what are users actually buying next (from another publisher)