In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import beta
import plotly.express as px
from datetime import datetime
from dotenv import load_dotenv
from snowflake import connector

import warnings
# Silence every warning for the rest of the kernel session: `Warning` is the
# base class of all warning categories, so this 'ignore' filter matches them all.
# NOTE(review): this also hides deprecation and pandas warnings raised by later
# cells -- consider narrowing the category once the notebook is stable.
warnings.filterwarnings('ignore', category=Warning)

In [39]:
# Load the product-testing order audit export and discard its three trailing columns.
df = pd.read_csv('/Users/peter/datascience/terminator/stipend/Product Testing Order Audit - May 2024.csv').iloc[:, :-3]
# The four remaining columns are labelled by position.
df.columns = ['disco_email', 'purchase_date', 'purchase_email', 'purchase_url']
df = df.assign(
    # Parse the purchase dates into datetimes.
    purchase_date=lambda audit: pd.to_datetime(audit['purchase_date']),
    # Keep only the first three '/'-separated pieces of the URL (scheme + host).
    base_url=lambda audit: audit['purchase_url'].str.split('/', n=3).str[:3].str.join('/'),
    # Normalise purchase emails to lowercase.
    purchase_email=lambda audit: audit['purchase_email'].str.lower(),
)
print(len(df))
df.sample(5)

29


Unnamed: 0,disco_email,purchase_date,purchase_email,purchase_url,base_url
16,guido@disconetwork.com,2024-05-20,aaron.c.guido@gmail.com,https://captainblankenship.com/8878788/orders/...,https://captainblankenship.com
1,dan@disconetwork.com,2024-05-14,kaufman.dan@gmail.com,https://consciouslybaby.com/5288296548/orders/...,https://consciouslybaby.com
5,kevin@disconetwork.com,2024-05-31,kevinliu.sf@gmail.com,https://store.dsanddurga.com/3797669/orders/6c...,https://store.dsanddurga.com
12,peter@disconetwork.com,2024-05-27,mapleminlai@gmail.com,https://www.glamnetic.com/checkouts/cn/1926a02...,https://www.glamnetic.com
20,Claudia@disconetwork.com,2024-05-16,claudia@disconetwork.com,https://www.crownaffair.com/8803680319/orders/...,https://www.crownaffair.com


In [6]:
# Populate the process environment from the local dotenv file.
load_dotenv('/Users/peter/.env')

# Open the Snowflake connection: each connector option is read from its
# environment variable; the database name is fixed.
connection = connector.connect(
    **{
        option: os.getenv(env_name)
        for option, env_name in (
            ('user', "SNOWFLAKE_USERNAME"),
            ('password', "SNOWFLAKE_PASSWORD"),
            ('account', "SNOWFLAKE_ACCOUNT"),
            ('role', "SNOWFLAKE_ROLE"),
            ('warehouse', "SNOWFLAKE_WAREHOUSE"),
        )
    },
    database='DISCO_CORE',
)

In [29]:
def run_orders_query(df, conn=None, window_days=2):
    """Look up orders matching each audited purchase.

    For every row of ``df`` one OR-ed condition is built that matches orders
    with the same (lowercased) email, an ``order_status_url`` starting with the
    row's ``base_url`` (case-insensitive), and a ``created_at_gmt`` within
    ``window_days`` calendar days either side of ``purchase_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'purchase_email', 'base_url' and 'purchase_date' columns.
        Rows missing any of the three are skipped, since they cannot match.
    conn : DB-API connection, optional
        Connection handed to ``pd.read_sql``. Defaults to the notebook-level
        ``connection``. Placeholders use the ``%s`` (format/pyformat) style.
    window_days : int, default 2
        Number of days before and after the purchase date to search.

    Returns
    -------
    pandas.DataFrame
        Columns 'purchase_email', 'order_status_url', 'created_at_gmt' and
        'base_url' (first three '/'-separated pieces of 'order_status_url').
        An empty frame with those columns is returned when there is nothing
        to search for, without touching the database.
    """
    key_columns = ['purchase_email', 'base_url', 'purchase_date']
    result_columns = ['purchase_email', 'order_status_url', 'created_at_gmt', 'base_url']

    # Rows lacking an email, URL or date would previously crash on .lower() /
    # strftime; they cannot match any order, so leave them out of the search.
    candidates = df.dropna(subset=key_columns)
    if candidates.empty:
        # An empty WHERE clause would be invalid SQL.
        return pd.DataFrame(columns=result_columns)

    # Values are bound as parameters instead of being pasted into the SQL text,
    # so quotes in an email or URL cannot break (or inject into) the statement.
    row_condition = (
        "(email = %s"
        " and order_status_url ilike %s"
        " and created_at_gmt >= %s and created_at_gmt < %s)"
    )
    conditions = []
    params = []
    for email, base_url, purchase_date in candidates[key_columns].itertuples(index=False, name=None):
        purchase_day = pd.Timestamp(purchase_date).normalize()
        window_start = purchase_day - pd.DateOffset(days=window_days)
        # Exclusive upper bound one day past the window: created_at_gmt carries
        # a time of day, so "between <date> and <date>" would stop at midnight
        # and silently drop almost all of the final day.
        window_end = purchase_day + pd.DateOffset(days=window_days + 1)
        conditions.append(row_condition)
        params.extend([
            email.lower(),
            base_url.lower() + '%',
            window_start.strftime('%Y-%m-%d'),
            window_end.strftime('%Y-%m-%d'),
        ])

    orders_query = (
        "SELECT\n"
        "    email AS purchase_email,\n"
        "    order_status_url,\n"
        "    created_at_gmt\n"
        "FROM orders.order_data_flattened\n"
        "WHERE\n"
        "    " + " OR\n    ".join(conditions)
    )

    if conn is None:
        # Fall back to the connection opened earlier in the notebook.
        conn = connection

    orders = pd.read_sql(orders_query, conn, params=params)
    # Lowercase the column names returned by the database.
    orders.columns = orders.columns.str.lower()
    # Derive the scheme + host prefix of each order URL.
    orders['base_url'] = orders['order_status_url'].str.split('/', n=3).str[:3].str.join('/')
    return orders

# Fetch the orders matching the audited purchases, then preview five random rows.
output = run_orders_query(df)
output.sample(n=5)

Unnamed: 0,purchase_email,order_status_url,created_at_gmt,base_url
20,mercadozachary@yahoo.com,https://www.ursamajorvt.com/18867385/orders/ed...,2024-05-13 22:52:52,https://www.ursamajorvt.com
9,tdkaraffa@gmail.com,https://www.oarsandalps.com/9469446/orders/306...,2024-05-25 18:58:02,https://www.oarsandalps.com
18,mapleminlai@gmail.com,https://www.glamnetic.com/13923024944/orders/e...,2024-05-28 04:10:21,https://www.glamnetic.com
10,lydiabonak@gmail.com,https://www.wearlively.com/11156560/orders/1eb...,2024-05-31 13:32:16,https://www.wearlively.com
7,kevinliu.sf@gmail.com,https://shop.liquid-iv.com/13381013/orders/e21...,2024-05-31 18:20:50,https://shop.liquid-iv.com


In [41]:
# Left-join the audit rows onto the fetched orders by email and site, so audit
# rows with no matching order end up with a null order_status_url.
merged_output = pd.merge(df, output, how='left', on=['purchase_email', 'base_url'])
# Keep the unmatched audit rows, minus the final one.
# NOTE(review): the last unmatched row is dropped unconditionally -- confirm
# which row this is meant to exclude, since it depends on row order.
not_found = merged_output.loc[merged_output['order_status_url'].isna()].iloc[:-1]
not_found[['disco_email', 'purchase_date', 'purchase_email', 'purchase_url']]

Unnamed: 0,disco_email,purchase_date,purchase_email,purchase_url
18,gabrielle@disconetwork.com,2024-05-22,gabriellesannicola@gmail.com,https://checkout-us.wearewild.com/checkouts/cn...
21,claudia@disconetwork.com,2024-05-16,claudiaesber@gmail.com,https://www.crownaffair.com/checkouts/cn/4dcc6...
26,monika@disconetwork.com,2024-05-13,monikacoupons36@gmail.com,https://fnxfit.com/22562171/orders/1fe07d9feb5...
