In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import ast
from scipy.stats import beta
from snowflake import connector
import pandas as pd

from dotenv import load_dotenv
from tabulate import tabulate
import openai

Python(74453) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


## Ingest and Process Data

In [2]:
df = pd.read_csv('data/products.csv')
#change all columns to lowercase
df.columns = map(str.lower, df.columns)
df.sample(5)

Unnamed: 0,event_brand_name,remote_order_id,session_id,pub_date,event_line_items,event_id,conv_date,conv_order_id,conv_brand_name,conv_line_items
27923,Caden Lane,5192854241340,c0375b9e-2ba1-45e4-811e-826973d7daae,2024-02-16,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",4a2342f8-cfae-4d21-a1c6-a1d1625b93e0,2024-02-16,5433862947010,Tubby Todd Bath Co.,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
30476,Tula Skincare,5055586336814,d1d2ebd4-6355-481c-8620-c9537bd189c5,2024-03-14,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",43e8ca55-ce8a-42f9-8c6a-c1d89c55b623,2024-03-16,5839080128757,Ami Colé,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
15371,Dieux Skin,5521279483997,68f0325c-a27a-456c-9f25-7b75bc660f5d,2024-01-17,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",f9ede864-0fb1-478d-912a-b490e73239e6,2024-01-28,5285272354887,Dagne Dover,"[\n {\n ""discount_amount"": ""28.35"",\n ""..."
11177,Bear Mattress,5771750932585,4cab47d8-8181-4696-b97e-5585ac6a04bf,2024-02-23,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",fafd397a-4a5d-4ebb-a7dc-577e0842d0df,2024-03-01,5226625925180,Caden Lane,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."
31394,Bodily,5486905786465,d8193d70-6bc5-4d8f-98e1-0ad809eb942b,2024-03-12,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",18ab00fb-86a6-44e7-8f1e-a9c4932c19f0,2024-03-14,5241649430588,Caden Lane,"[\n {\n ""discount_amount"": ""0.00"",\n ""p..."


In [3]:
def parse_products_sales(s):
    # Initialize empty lists to hold products and sales
    product_names = []
    sales = []
    
    # Check if the input is a string. If not, return empty lists.
    if not isinstance(s, str):
        return product_names, sales
    
    try:
        # Convert the string representation of the list of dicts to an actual list of dicts
        products_list = ast.literal_eval(s.replace('\n', '').replace('null', 'None'))
        
        # Extract product names and calculate sales
        product_names = [product['product_name'] for product in products_list]
        sales = [float(product['quantity']) * (float(product['price']) - float(product['discount_amount'])) for product in products_list]
        
        return product_names, sales
    except (ValueError, SyntaxError) as e:
        # If there's an error in converting, print the error and return empty lists
        print(f"Error parsing row: {e}")
        return product_names, sales

In [4]:
df[['pub_products', 'pub_sales']] = df.apply(lambda row: parse_products_sales(row['event_line_items']), axis=1, result_type='expand')
df[['conv_products', 'conv_sales']] = df.apply(lambda row: parse_products_sales(row['conv_line_items']), axis=1, result_type='expand')
df.sample(5)

Unnamed: 0,event_brand_name,remote_order_id,session_id,pub_date,event_line_items,event_id,conv_date,conv_order_id,conv_brand_name,conv_line_items,pub_products,pub_sales,conv_products,conv_sales
34920,Love in Faith,5636475781187,f0dccc3d-d8a1-4f96-89f2-7795778735b2,2024-03-07,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",e3895d9f-b521-4a14-87a9-c02b7b52c5bf,2024-03-07,5440365559874,BlendJet,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...","[Be Kind Sweatshirt - 3X / Brown, One Nation U...","[30.0, 9.0, 10.0]","[Route Package Protection - $1.35, BJ2 Replace...","[1.35, 24.95]"
8523,Girlfriend Collective,5567463948351,3acb02bf-984a-4a95-b478-292ad10054ff,2024-01-24,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",53fc2c10-aaa2-4671-8472-5af29ba28a18,2024-01-24,5563193163831,Blueland,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...","[Earth Compressive High-Rise Legging - 28.5"" I...","[78.0, 42.0]","[Laundry Starter Set - 60, Dishwasher Refill P...","[20.0, 21.0]"
27254,Laura Geller,5704016330813,bb7917e9-9fa4-49c3-beff-a761d2973026,2024-02-29,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",a465dc67-14cb-4641-a42c-f30a52ea8bf6,2024-02-29,5646478639171,Love in Faith,"[\n {\n ""discount_amount"": ""10.00"",\n ""...","[Spackle Skin Perfecting Primer: Hydrate, Bake...","[20.0, 36.0]",[Serve The Lord Long Sleeve - 4X / Heather Sto...,"[12.0, 12.0, 12.0, 11.0]"
35857,Little Words Project,5434323566673,f76a0fc0-574d-4cdf-a1ce-b4083db0e38a,2024-02-29,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",2f5407bb-4b4f-4ade-85a4-05f343c285c1,2024-02-29,5225520726076,Caden Lane,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...",[Custom Joyful Stone - Joyful Stone / Black / ...,[35.0],"[Package Protection - $0.98, All Color Me Paja...","[0.98, 36.0]"
9576,Love in Faith,5659285815363,4188c06a-7704-4af4-ac94-ea3b6403658a,2024-03-08,"[\n {\n ""discount_amount"": ""10.00"",\n ""...",f3dd6f22-24ce-4c7f-8701-debcbfdc72fd,2024-03-14,5449767026754,BlendJet,"[\n {\n ""discount_amount"": ""0.00"",\n ""p...","[Stand Tall Long Sleeve - L / Heather Navy, Gr...","[12.0, 9.0, 9.0, 12.0, 9.0, 9.0, 9.0]","[Large Jar (20 oz), BlendJet 2 - Carbon Fiber,...","[15.95, 59.95, 14.95]"


## Find Top Product Combos

In [5]:
def analyze_conversion_products_sales(df):
    expanded_data = []
    for idx, row in df.iterrows():
        for product, sale in zip(row['pub_products'], row['pub_sales']):
            expanded_data.append({
                'publisher_brand_name': row['event_brand_name'],
                'publisher_product': product,
                'sale': sale,
                'conv_brand_name': row['conv_brand_name'],
                'conv_products': row['conv_products'],
                'conv_sales': row['conv_sales']
            })

    expanded_df = pd.DataFrame(expanded_data)
    
    # Aggregate sales by publisher product and sort
    pub_products_sales = expanded_df.groupby(['publisher_brand_name', 'publisher_product'])['sale'].sum().reset_index()
    pub_products_sales_sorted = pub_products_sales.sort_values(by='sale', ascending=False)
    
    # For each publisher product, identify the top 5 conversion products by sales
    top_conversion_products = {}
    for _, row in pub_products_sales_sorted.iterrows():
        publisher_product = row['publisher_product']
        relevant_rows = expanded_df[expanded_df['publisher_product'] == publisher_product]
        conversion_sales = {}
        for _, conv_row in relevant_rows.iterrows():
            for conv_product, conv_sale, conv_brand_name in zip(conv_row['conv_products'], conv_row['conv_sales'], [conv_row['conv_brand_name']] * len(conv_row['conv_products'])):
                conversion_sales[f"{conv_brand_name} - {conv_product}"] = conversion_sales.get(f"{conv_brand_name} - {conv_product}", 0) + conv_sale
        
        # Sort conversion products by sales and take the top 5
        sorted_conv_sales = sorted(conversion_sales.items(), key=lambda x: x[1], reverse=True)[:5]
        top_conversion_products[publisher_product] = sorted_conv_sales
    
    # Prepare the final DataFrame
    final_data = []
    for pub_product, conv_products in top_conversion_products.items():
        row = {
            'publisher_brand_name': expanded_df[expanded_df['publisher_product'] == pub_product]['publisher_brand_name'].iloc[0],
            'publisher_product': pub_product
        }
        for i, (conv_product_sale, sale) in enumerate(conv_products, 1):
            conv_product, conv_brand_name = conv_product_sale.split(' - ', 1)
            row[f'top_{i}_conv_brand_name'] = conv_product
            row[f'top_{i}_conv_product'] = conv_brand_name
            row[f'top_{i}_conv_sale'] = sale
        final_data.append(row)

    return pd.DataFrame(final_data)


In [6]:
top_product_sales = analyze_conversion_products_sales(df)
top_product_sales.head()

Unnamed: 0,publisher_brand_name,publisher_product,top_1_conv_brand_name,top_1_conv_product,top_1_conv_sale,top_2_conv_brand_name,top_2_conv_product,top_2_conv_sale,top_3_conv_brand_name,top_3_conv_product,top_3_conv_sale,top_4_conv_brand_name,top_4_conv_product,top_4_conv_sale,top_5_conv_brand_name,top_5_conv_product,top_5_conv_sale
0,Newton Baby,Crib Mattress - Standard / White,Nanit,Nanit Pro Camera - Wall Mount / White,1495.0,Nanit,Nanit Pro Camera - Floor Stand / White,399.0,Nanit,Best Sellers Bundle - Floor Stand / Green,383.0,Nanit,The Best-Selling Camera Bundle - Wall Mount / ...,303.0,Dagne Dover,"Indi Diaper Backpack in Dark Moss, Large",215.0
1,Newton Baby,Waterproof Crib Mattress - Standard / White,Nanit,Nanit Pro Camera - Floor Stand / White,798.0,Nanit,Nanit Pro Camera Multi-Pack - Twin / Floor Stand,679.0,Nanit,Nanit Pro Camera - Floor Stand,399.0,Nanit,The Best-Selling Camera Bundle - Floor Stand /...,383.0,Nanit,First Year Bundle - Floor Stand / Pink Sunburst,383.0
2,Nanit,Nanit Pro Camera - Floor Stand / White,Newton Baby,Waterproof Crib Mattress - Standard / White,1049.97,Newton Baby,Crib Mattress - Standard / White,899.97,Lalo,The Chair - Coconut,470.0,Newton Baby,Travel Crib & Play Yard - Play Yard,449.98,Newton Baby,Waterproof Crib Mattress - Standard / Grey,349.99
3,Nood,The Flasher™ 2.0 - Matte Black,Koio,Mello in Summit Regenerative - 10 (US) / 43 (EU),395.0,Sunday Citizen,Premium Bamboo Bubble Comforter - Taupe / King...,290.0,Sol de Janeiro,Biggie Biggie Beija Flor™ Elasti-Cream,255.0,Ursa Major,Golden Hour Recovery Cream - 1.57 FL OZ,156.0,Vegamour,GRO+ Advanced Hair Serum (3 Pack) - Subscription,149.0
4,Willow Pumps,Willow Go™ Wearable Breast Pump - 21 & 24mm,Nanit,The Best-Selling Camera Bundle - Floor Stand /...,359.0,Newton Baby,Waterproof Crib Mattress - Standard / White,349.99,Dagne Dover,"Indi Diaper Backpack in Dark Moss, Large",182.75,HATCH,The Mackenzie Sweater Dress - Black / 1,147.6,M.M.LaFleur,Brodie Top - Airy Stripe Knit :: Meringue - M,135.0


In [8]:
top_product_sales.to_csv('data/top_product_output.csv', index=False)

In [None]:
# top_product_sales.loc[top_product_sales['publisher_brand_name'] == 'Caraway'].head(25)

In [9]:
def analyze_conversion_products_frequency(df):
    expanded_data = []
    # Iterate over each row in the DataFrame
    for idx, row in df.iterrows():
        for product, sale in zip(row['pub_products'], row['pub_sales']):
            # Extract the corresponding conversion brand names and products
            conv_brands_products = list(zip(row['conv_brand_name'], row['conv_products']))
            # Collect conversion data for this publisher product
            conv_data = []
            for conv_brand, conv_product in conv_brands_products:
                conv_data.append({'conv_brand': conv_brand, 'conv_product': conv_product})

            # Add data to the expanded list
            expanded_data.append({
                'publisher_brand_name': row['event_brand_name'],
                'publisher_product': product,
                'sale': sale,
                'conv_data': conv_data
            })
    
    expanded_df = pd.DataFrame(expanded_data)
    
    # Count frequency of each publisher product
    pub_product_counts = expanded_df.groupby(['publisher_brand_name', 'publisher_product']).size().reset_index(name='frequency')
    pub_product_counts_sorted = pub_product_counts.sort_values(by='frequency', ascending=False)
    
    # For each publisher product, count frequency of each associated conversion product
    top_conversion_products = {}
    for _, row in pub_product_counts_sorted.iterrows():
        publisher_product = row['publisher_product']
        relevant_rows = expanded_df[expanded_df['publisher_product'] == publisher_product]
        conversion_product_counts = {}
        for _, conv_row in relevant_rows.iterrows():
            for conv_product, conv_brand_name in zip(conv_row['conv_products']), conv_row['conv_brand_name']:
                key = f"{conv_brand_name} - {conv_product}"
                conversion_product_counts[key] = conversion_product_counts.get(key, 0) + 1
        
        # Sort conversion products by their frequency and take the top 5
        sorted_conv_product_counts = sorted(conversion_product_counts.items(), key=lambda x: x[1], reverse=True)[:5]
        top_conversion_products[publisher_product] = sorted_conv_product_counts
    
    # Prepare the final DataFrame
    final_data = []
    for pub_product, conv_products in top_conversion_products.items():
        row = {
            'publisher_brand_name': expanded_df[expanded_df['publisher_product'] == pub_product]['publisher_brand_name'].iloc[0],
            'publisher_product': pub_product
        }
        for i, (conv_product_count, count) in enumerate(conv_products, 1):
            conv_brand_name, conv_product = conv_product_count.split(' - ', 1)
            row[f'top_{i}_conv_brand_name'] = conv_brand_name
            row[f'top_{i}_conv_product'] = conv_product
            row[f'top_{i}_conv_frequency'] = count
        final_data.append(row)

    return pd.DataFrame(final_data)


In [10]:
analyze_conversion_products_frequency(df).to_csv('data/top_product_frequency_output.csv', index=False)

KeyError: 'conv_products'

In [None]:
### what are users actually buying next (from another publisher)