In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy.stats import beta

import warnings
# Filter out all warnings
warnings.filterwarnings('ignore', category=Warning)

### Converting Timestamps

In [90]:
mixpanel = pd.read_csv('/Users/peter/Documents/Data Science/Caden_Lane_Analysis/mixpanel_cadenlane_jan_conv.csv')
mixpanel['ORDER_TIME'] = pd.to_datetime(mixpanel['ORDER_DATE'], utc = True, format='ISO8601')
mixpanel['EVENT_TIME'] = pd.to_datetime(mixpanel['DISPLAY_TIME'], utc = True, format='ISO8601')
mixpanel_conv = mixpanel[['CL_ORDER_NUM', 'ORDER_TIME', 'EVENT_TIME', 'TIME_DELTA_DAYS']]
#Create order date and event date columns
mixpanel_conv['ORDER_DATE'] = mixpanel_conv['ORDER_TIME'].dt.date
mixpanel_conv['EVENT_DATE'] = mixpanel_conv['EVENT_TIME'].dt.date
mixpanel_conv.to_csv('/Users/peter/Documents/Data Science/Caden_Lane_Analysis/mixpanel_jan_conv_utc.csv', index = False)

In [51]:
snowplow_df = pd.read_csv('Caden Lane Conv.csv')
snowplow_df['event_time'] = pd.to_datetime(snowplow_df['EVENT_CREATED_AT'], utc = True, format='ISO8601')
snowplow_df['order_time'] = pd.to_datetime(snowplow_df['ORDER_CREATED_AT'], utc = True, format='ISO8601')
snowplow_df.dtypes

ORDER_ID                                   int64
ORDER_IDENTITY_ID                        float64
ORDER_PUBLISHER_UUID                      object
ORDER_BRAND_ID                             int64
ORDER_BRAND_NAME                          object
ORDER_CREATED_AT                          object
ORDER_COST                               float64
REMOTE_ORDER_ID                            int64
ORDER_NAME                                 int64
LAST_ORDER_CREATED_AT                     object
EVENT_ID                                  object
EVENT_IDENTITY_ID                        float64
EVENT_PUBLISHER_UUID                      object
EVENT_BRAND_ID                             int64
EVENT_BRAND_NAME                          object
EVENT_CREATED_AT                          object
EVENT_CLASSIFICATION                      object
HASH_IDENTIFIER                           object
ATTRIBUTION_METHOD                        object
CONVERSION_TYPE                           object
CUSTOMER_TYPE       

In [57]:
disco_df = snowplow_df[['event_time', 'order_time', 'EVENT_CLASSIFICATION', 'CONVERSION_TYPE', 'CUSTOMER_TYPE', 'ATTRIBUTION_METHOD', 'DAYS_TO_ATTRIBUTION', 'BILLABLE_AMOUNT']]
disco_df.sample(5)

Unnamed: 0,event_time,order_time,EVENT_CLASSIFICATION,CONVERSION_TYPE,CUSTOMER_TYPE,ATTRIBUTION_METHOD,DAYS_TO_ATTRIBUTION,BILLABLE_AMOUNT
1295,2023-12-29 20:32:08.464000+00:00,2024-01-01 05:22:19+00:00,display,cross-sell,lapsed,direct_email_match,3,32.0
1234,2024-01-02 20:15:56.578000+00:00,2024-01-03 18:28:40+00:00,display,cross-sell,lapsed,direct_email_match,1,32.0
593,2024-01-01 21:32:17.794000+00:00,2024-01-02 14:03:58+00:00,display,cross-sell,returning,order_mapping_hash,1,20.0
1078,2024-01-14 03:19:00.445000+00:00,2024-01-14 09:06:30+00:00,display,cross-sell,new,device_identifier_hash,0,32.0
323,2024-01-09 20:50:34.816000+00:00,2024-01-12 16:47:50+00:00,display,cross-sell,returning,direct_email_match,3,20.0


In [64]:
cadenlane = pd.read_csv('/Users/peter/Documents/Data Science/Caden_Lane_Analysis/Caden Lane - Touchpoints for order list - Touchpoints for each order.csv')
cadenlane['order_time'] = pd.to_datetime(cadenlane['event_time'].str.replace(' UTC', ''), utc=True)
#rename event_time to event_time_utc
cadenlane.rename(columns={'event_time':'event_time_utc'}, inplace=True)
# Assuming your DataFrame is named cadenlane and the column containing the date is named 'ad_interaction_time'
cadenlane['event_time'] = pd.to_datetime(cadenlane['ad_interaction_time'].str.replace(' UTC', ''), utc=True, format='ISO8601')
#drop the columns campaign_name, campaign_id, ad_name, ad_id, ad_interaction_time, event_time_utc
cadenlane = cadenlane.drop(['campaign_name', 'campaign_id', 'ad_name', 'ad_id', 'ad_interaction_time', 'event_time_utc'], axis=1)
#Create an attribution_days column by subtracting the event_time from the order_time and converting to days
cadenlane['attribution_days'] = (cadenlane['order_time'] - cadenlane['event_time']).dt.days
cadenlane.sample(5)

Unnamed: 0,order_number,northbeam_platform,source,order_time,event_time,attribution_days
45696,2369258,Organic,direct,2024-01-16 21:43:52+00:00,2023-07-19 12:29:50.907000+00:00,181
5267,2339142,Klaviyo,utm medium - email,2024-01-02 13:09:15+00:00,2023-08-20 11:01:55.015000+00:00,135
18205,2347052,Organic,direct,2024-01-06 00:07:49+00:00,2023-12-17 20:34:54.280000+00:00,19
38424,2363012,Google Ads,adwords,2024-01-14 00:40:06+00:00,2023-12-24 16:01:29.094000+00:00,20
7795,2340934,Organic,direct,2024-01-03 04:39:33+00:00,2023-10-13 01:40:26.911000+00:00,82


## Matching Process

In [71]:
def match_and_merge(company_df, client_df):
    # Prepare the output dataframe with additional columns for distances and client data
    output_columns = list(company_df.columns) + ['event_distance', 'order_distance', 'avg_distance'] + ['client_' + col for col in client_df.columns]
    output_df = pd.DataFrame(columns=output_columns)
    
    for index, row in company_df.iterrows():
        # Calculate the differences in seconds for event_time and order_time
        event_diff = np.abs((client_df['event_time'] - row['event_time']).dt.total_seconds())
        order_diff = np.abs((client_df['order_time'] - row['order_time']).dt.total_seconds())
        
        # Calculate the average distance and find the index of the minimum average distance
        avg_diff = (event_diff + order_diff) / 2
        min_diff_idx = avg_diff.idxmin()
        
        # Extract the row from the client dataframe with the minimum average distance
        matching_client_row = client_df.loc[min_diff_idx]
        
        # Prepare the row to append to the output dataframe
        new_row = row.to_list() + [event_diff[min_diff_idx], order_diff[min_diff_idx], avg_diff[min_diff_idx]] + [matching_client_row[col] for col in client_df.columns]
        
        # Append the new row to the output dataframe
        output_df.loc[len(output_df)] = new_row
    
    # Adjust column names for client dataframe columns to avoid conflicts and clarify source
    client_column_names = ['client_' + col for col in client_df.columns]
    output_df.rename(columns=dict(zip(client_df.columns, client_column_names)), inplace=True)
    
    return output_df

In [72]:
avg_distance_conv_match = match_and_merge(disco_df, cadenlane)
avg_distance_conv_match.sample(5)

Unnamed: 0,client_event_time,client_order_time,EVENT_CLASSIFICATION,CONVERSION_TYPE,CUSTOMER_TYPE,ATTRIBUTION_METHOD,DAYS_TO_ATTRIBUTION,BILLABLE_AMOUNT,event_distance,order_distance,avg_distance,client_order_number,client_northbeam_platform,client_source,client_order_time.1,client_event_time.1,client_attribution_days
126,2024-01-27 17:52:32.649000+00:00,2024-01-28 12:39:25+00:00,display,cross-sell,new,direct_email_match,1,32.0,4674.337,6016.0,5345.1685,2393418,Google Ads,adwords,2024-01-28 14:19:41+00:00,2024-01-27 16:34:38.312000+00:00,0
662,2024-01-13 21:24:13.253000+00:00,2024-01-19 20:16:59+00:00,display,cross-sell,lapsed,direct_email_match,6,32.0,1713.366,2626.0,2169.683,2374887,Klaviyo,utm medium - email,2024-01-19 19:33:13+00:00,2024-01-13 21:52:46.619000+00:00,5
543,2024-01-18 15:26:24.020000+00:00,2024-01-18 15:44:58+00:00,display,cross-sell,returning,device_identifier_hash,0,20.0,830.016,26.0,428.008,2372337,Organic,direct,2024-01-18 15:45:24+00:00,2024-01-18 15:40:14.036000+00:00,0
389,2023-12-29 22:05:38.431000+00:00,2024-01-04 00:56:05+00:00,display,cross-sell,new,consumer_hash,6,32.0,520.948,23622.0,12071.474,2341842,Organic,direct,2024-01-03 18:22:23+00:00,2023-12-29 21:56:57.483000+00:00,4
332,2024-01-12 02:41:05.317000+00:00,2024-01-18 22:52:31+00:00,display,cross-sell,lapsed,direct_email_match,6,32.0,5859.603,9463.0,7661.3015,2372887,Organic,direct,2024-01-18 20:14:48+00:00,2024-01-12 01:03:25.714000+00:00,6


In [68]:
avg_distance_conv_match.to_csv('conv_match.csv', index=False)

## Track Attribution

In [74]:
conv_data = pd.read_csv('/Users/peter/Documents/Data Science/Caden_Lane_Analysis/Caden Lane Conv Attribution - Matched.csv')
#drop 'claim?' column 
conv_data = conv_data.drop(['claim?'], axis=1)

In [79]:
def assign_claim(input_df):
    df = input_df.copy()
    # Calculate the count of each 'cl_order_num' in the dataframe
    #df['cl_order_num_count'] = df.groupby('cl_order_num')['cl_order_num'].transform('count')
    
    # Initialize the 'claim' column to 'No' by default
    df['claim'] = 'No'
    
    # Iterate over each unique 'cl_order_num'
    for order_num in df['cl_order_num'].unique():
        # Subset of rows for the current 'cl_order_num'
        subset = df[df['cl_order_num'] == order_num]
        
        # Process based on 'cl_order_num_count'
        if subset['cl_order_num_count'].iloc[0] == 1:
            # Single row case
            df.loc[subset.index, 'claim'] = subset.apply(lambda row: 'Yes' if row['avg_sec_dist'] <= 600 else ('Maybe' if 600 < row['avg_sec_dist'] <= 1000 else 'No'), axis=1)
        else:
            # Multiple rows case, find the row with the lowest 'avg_sec_dist'
            min_avg_sec_dist_row = subset.loc[subset['avg_sec_dist'].idxmin()]
            # Update the 'claim' value for the row with the lowest 'avg_sec_dist'
            df.loc[min_avg_sec_dist_row.name, 'claim'] = 'Yes' if min_avg_sec_dist_row['avg_sec_dist'] <= 600 else ('Maybe' if 600 < min_avg_sec_dist_row['avg_sec_dist'] <= 1000 else 'No')
            # Note: All other rows for this 'cl_order_num' remain 'No' as initialized
            
    return df.sort_values(by='avg_sec_dist', ascending=True)

In [82]:
conv_output = assign_claim(conv_data)
#create a new column called 'order_sequence' that is the rank of 'cl_event_time' within each 'cl_order_num' ascending
conv_output['cl_order_seq'] = conv_output.groupby('cl_order_num')['cl_event_time'].rank(ascending=True)
conv_output.sample(5)

Unnamed: 0,disco_event_time,disco_order_time,DAYS_TO_ATTRIBUTION,EVENT_TYPE,CUSTOMER_TYPE,ATTRIBUTION_METHOD,BILLABLE_AMOUNT,event_sec_dist,order_sec_dist,avg_sec_dist,cl_order_num_count,cl_order_num,northbeam_platform,cl_source,cl_order_time,cl_event_time,cl_attribution_days,claim,cl_order_seq
925,2024-01-18 13:39:17.116000+00:00,2024-01-19 00:48:55+00:00,1,display,new,direct_email_match,32,918,12450,6684,3,2373017,Facebook Ads,fb,2024-01-18 21:21:25+00:00,2024-01-18 13:54:34.730000+00:00,0,No,3.0
40,2024-01-02 16:17:41.030000+00:00,2024-01-02 17:59:08+00:00,0,click,returning,consumer_hash,20,32,29,30,1,2339663,Disco,utm medium - disco_paid,2024-01-02 17:59:37+00:00,2024-01-02 16:17:09.208000+00:00,0,Yes,1.0
846,2024-01-01 18:52:47.207000+00:00,2024-01-05 15:34:04+00:00,4,display,returning,order_mapping_hash,20,9294,2524,5909,1,2345859,Klaviyo,utm medium - email,2024-01-05 14:52:00+00:00,2024-01-01 16:17:53.053000+00:00,3,No,1.0
1000,2024-01-07 03:54:08.209000+00:00,2024-01-08 17:21:02+00:00,1,display,new,order_mapping_hash,32,9581,5225,7403,2,2352585,Organic,direct,2024-01-08 18:48:07+00:00,2024-01-07 01:14:27.054000+00:00,1,No,2.0
906,2024-01-16 15:18:38.502000+00:00,2024-01-23 02:57:04+00:00,7,display,returning,direct_email_match,20,12327,669,6498,8,2382887,Organic,direct,2024-01-23 03:08:13+00:00,2024-01-16 18:44:05.702000+00:00,6,No,3.5


In [83]:
conv_output.to_csv('conv_output.csv', index=False)