In [None]:
# Mount Google Drive at /content/drive so that files stored under "My Drive"
# can be read by path from later cells. Requires the google.colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter

ChannelAttribution Pro Package

https://channelattribution.io/docs/chpro/gettingstarted-pro

In [None]:
import getpass
import os

# Download and run the vendor's Colab installer. os.system returns the exit
# status of the command; a non-zero status is raised immediately so that a
# failed download/install is not mistaken for an import problem further down.
for command in (
    "wget https://app.channelattribution.io/repository/install/installChProColab.py",
    "python installChProColab.py",
):
    exit_status = os.system(command)
    if exit_status != 0:
        raise RuntimeError(f"Command failed with exit status {exit_status}: {command}")

# The star import is kept on purpose: later cells call heuristic_models,
# markov_model and shapley without a module prefix.
from ChannelAttributionPro import *
pd.set_option('display.expand_frame_repr', False)

# Do not store the licence password in the notebook. Read it from the
# CHANNELATTRIBUTION_PASSWORD environment variable, or prompt for it (input is
# not echoed) when the variable is unset or empty.
password = os.environ.get("CHANNELATTRIBUTION_PASSWORD") or getpass.getpass(
    "ChannelAttributionPro password: "
)

Visit www.channelattribution.io for more information about ChannelAttributionPro
Version: 3.9.1


# **Data & Pre-processing**

In [None]:
# Load the raw touch-point log from the mounted Drive folder: one row per
# interaction, with columns cookie, time, interaction, conversion,
# conversion_value and channel.
df = pd.read_csv(
    '/content/drive/My Drive/Masters Thesis/data/attribution_data.csv'
)
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display
4,0000nACkD9nFkBBDECD3ki00E,2018-07-03T09:44:57Z,impression,0,0.0,Paid Search


In [None]:
# Parse the ISO-8601 strings in 'time' (e.g. 2018-07-03T13:02:11Z) into pandas
# datetimes so they can be sorted chronologically and subtracted later on.
df['time'] = pd.to_datetime(df['time'])

In [None]:
# Put each cookie's touch points in chronological order (the cookies themselves
# end up in descending order), then number them 1, 2, 3, ... within the cookie.
df = df.sort_values(
    by=['cookie', 'time'],
    ascending=[False, True],
)
df['visit_order'] = 1 + df.groupby('cookie').cumcount()

In [None]:
# One row per cookie holding the list of channels it touched, in the row order
# of df (which an earlier cell sorted by time within each cookie).
df_paths = (
    df.groupby('cookie')['channel']
    .aggregate(lambda channels: channels.tolist())
    .reset_index()
)

# The conversion flag on each cookie's last row tells whether the journey
# ended in a conversion.
df_last_interaction = df.drop_duplicates(subset='cookie', keep='last')[['cookie', 'conversion']]

# Attach that flag to the channel list of the same cookie.
df_paths = pd.merge(
    df_paths,
    df_last_interaction,
    on='cookie',
    how='left',
)

In [None]:
# Number of converting journeys. Series.sum is vectorised, unlike the builtin
# sum() which loops over the column element by element in Python. int() keeps
# the result a plain Python integer, as the builtin sum() produced.
total_conversions = int(df_paths['conversion'].sum())
print("No. of conversions: ", total_conversions)

No. of conversions:  17639


In [None]:
# Render every journey as a single string such as "Facebook > Paid Search".
df_paths['path'] = df_paths['channel'].apply(' > '.join)
# The list column is no longer needed once the string form exists. Rebinding
# (instead of inplace=True) leaves df_paths with cookie, conversion and path.
df_paths = df_paths.drop(columns=['channel'])

# Conversions per distinct path. Only 'conversion' is aggregated: a blanket
# .sum() would also "sum" the string 'cookie' column, i.e. concatenate every
# cookie id that shares a path, just to throw that column away afterwards.
attribution_df = df_paths.groupby('path', as_index=False)['conversion'].sum()
attribution_df.head()

Unnamed: 0,path,conversion
0,Facebook,2054
1,Facebook > Facebook,538
2,Facebook > Facebook > Facebook,230
3,Facebook > Facebook > Facebook > Facebook,85
4,Facebook > Facebook > Facebook > Facebook > Fa...,41


In [None]:
# reference: https://www.geeksforgeeks.org/highlight-the-maximum-value-in-each-column-in-pandas/
def highlight_max_attribution(row, df, column='Attributed Credit', color='lightgreen'):
    '''
    Style a whole table row when it holds the largest value of ``column``.

    Meant to be used as
    ``df.style.apply(highlight_max_attribution, df=df, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        The row currently being styled; must contain ``column``.
    df : pandas.DataFrame
        Frame whose ``column`` provides the maximum to compare against.
    column : str, default 'Attributed Credit'
        Name of the column that decides which row is highlighted.
    color : str, default 'lightgreen'
        CSS colour used as the background of the highlighted row.

    Returns
    -------
    list of str
        One CSS declaration per entry of ``row``: the background colour for
        every cell when the row's value equals the column maximum, otherwise
        empty strings. Rows tied for the maximum are all highlighted.
    '''
    is_max = row.loc[column] == df[column].max()
    css = f'background-color: {color}' if is_max else ''
    # Same style for every cell of the row, so the full row is coloured.
    return [css] * len(row.index)

# Heuristic Models

Reference: https://channelattribution.io/docs/chpro/functions/heuristic-models

In [None]:
# Run ChannelAttributionPro's heuristic models on the per-path conversion
# counts ("path" = journey string, "conversion" = number of conversions).
# Later cells read the columns channel, last_touch_conversions,
# first_touch_conversions and linear_touch_conversions from the result.
traditional=heuristic_models(attribution_df, "path", "conversion", password=password)

**heuristic models estimation**
Attribution, number of paths elaborated: 11,374


In [None]:
# Last-touch model: conversions credited to the final channel of each path,
# totalled per channel.
last_touch_attribution = (
    traditional
    .groupby("channel")[["last_touch_conversions"]]
    .sum()
    .reset_index()
)
last_touch_attribution.columns = ['Channel', 'Attributed Credit']

# Each channel's share of the total credit, rounded to two decimals and then
# rendered as text such as "30.05%".
last_touch_attribution['Attributed Credit %'] = round(
    (last_touch_attribution['Attributed Credit'] / last_touch_attribution['Attributed Credit'].sum()) * 100,
    2,
)
last_touch_attribution['Attributed Credit %'] = (
    last_touch_attribution['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
last_touch_attribution.style.apply(
    highlight_max_attribution,
    df=last_touch_attribution,
    axis=1,
)

Unnamed: 0,Channel,Attributed Credit,Attributed Credit %
0,Facebook,5301.0,30.05%
1,Instagram,2244.0,12.72%
2,Online Display,2139.0,12.13%
3,Online Video,3408.0,19.32%
4,Paid Search,4547.0,25.78%


In [None]:
# First-touch model: conversions credited to the opening channel of each path,
# totalled per channel.
first_touch_attribution = (
    traditional
    .groupby("channel")[["first_touch_conversions"]]
    .sum()
    .reset_index()
)
first_touch_attribution.columns = ['Channel', 'Attributed Credit']

# Each channel's share of the total credit, rounded to two decimals and then
# rendered as text such as "29.35%".
first_touch_attribution['Attributed Credit %'] = round(
    (first_touch_attribution['Attributed Credit'] / first_touch_attribution['Attributed Credit'].sum()) * 100,
    2,
)
first_touch_attribution['Attributed Credit %'] = (
    first_touch_attribution['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
first_touch_attribution.style.apply(
    highlight_max_attribution,
    df=first_touch_attribution,
    axis=1,
)

Unnamed: 0,Channel,Attributed Credit,Attributed Credit %
0,Facebook,5177.0,29.35%
1,Instagram,2329.0,13.20%
2,Online Display,2160.0,12.25%
3,Online Video,3216.0,18.23%
4,Paid Search,4757.0,26.97%


In [None]:
# Linear model: the linear_touch_conversions column of the heuristic results,
# totalled per channel.
linear_touch_attribution = (
    traditional
    .groupby("channel")[["linear_touch_conversions"]]
    .sum()
    .reset_index()
)
linear_touch_attribution.columns = ['Channel', 'Attributed Credit']

# Each channel's share of the total credit, rounded to two decimals and then
# rendered as text such as "28.05%".
linear_touch_attribution['Attributed Credit %'] = round(
    (linear_touch_attribution['Attributed Credit'] / linear_touch_attribution['Attributed Credit'].sum()) * 100,
    2,
)
linear_touch_attribution['Attributed Credit %'] = (
    linear_touch_attribution['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
linear_touch_attribution.style.apply(
    highlight_max_attribution,
    df=linear_touch_attribution,
    axis=1,
)

Unnamed: 0,Channel,Attributed Credit,Attributed Credit %
0,Facebook,4947.6,28.05%
1,Instagram,2627.1,14.89%
2,Online Display,2143.6,12.15%
3,Online Video,3192.183333,18.10%
4,Paid Search,4728.516667,26.81%


# Markov Model

Reference: https://channelattribution.io/docs/chpro/functions/markov-model

In [None]:
# Fit ChannelAttributionPro's Markov model on the per-path conversion counts.
# The estimation is simulation-based (the log below reports 100000 simulations).
# NOTE(review): no seed is passed here, so re-runs may not reproduce the exact
# figures - check whether markov_model accepts a seed argument.
# NOTE(review): the meaning of type="re" is not visible in this notebook;
# confirm it against the markov-model reference linked above.
res=markov_model(attribution_df, "path", "conversion", type = "re", password=password)
# The "attribution" entry of the result is the table summarised in the next cell.
path_attribution=res["attribution"]

**markov model estimation**
Building transition matrix, number of paths elaborated: 11,374
Number of simulations: 100000 - Convergence reached: 1.59% < 5.00%
Percentage of simulated paths that successfully end before maximum number of steps (136) is reached: 100.00%


In [None]:
# Markov model: total attributed conversions per channel.
path_attribution_summary = (
    path_attribution
    .groupby("channel")['total_conversions']
    .sum()
    .reset_index()
)
path_attribution_summary.columns = ['Channel', 'Attributed Credit']

# Each channel's share of the total credit, rounded to two decimals and then
# rendered as text such as "29.85%".
path_attribution_summary['Attributed Credit %'] = round(
    (path_attribution_summary['Attributed Credit'] / path_attribution_summary['Attributed Credit'].sum()) * 100,
    2,
)
path_attribution_summary['Attributed Credit %'] = (
    path_attribution_summary['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
path_attribution_summary.style.apply(
    highlight_max_attribution,
    df=path_attribution_summary,
    axis=1,
)

Unnamed: 0,Channel,Attributed Credit,Attributed Credit %
0,Facebook,5264.457823,29.85%
1,Instagram,3484.208016,19.75%
2,Online Display,2024.765503,11.48%
3,Online Video,2867.123765,16.25%
4,Paid Search,3998.444892,22.67%


# Shapley Value



Reference: https://channelattribution.io/docs/chpro/functions/shapley

In [None]:
# Fit ChannelAttributionPro's Shapley-value model on the per-path conversion
# counts ("path" = journey string, "conversion" = number of conversions).
shapley_model=shapley(attribution_df, "path", "conversion", password=password)
# The "attribution" entry of the result is the table summarised in the next cell.
shapley_path_attribution=shapley_model["attribution"]

In [None]:
# Shapley model: total attributed conversions per channel.
shapley_attribution = (
    shapley_path_attribution
    .groupby("channel")["total_conversions"]
    .sum()
    .reset_index()
)
shapley_attribution.columns = ['Channel', 'Attributed Credit']

# Each channel's share of the total credit, rounded to two decimals and then
# rendered as text such as "26.74%".
shapley_attribution['Attributed Credit %'] = round(
    (shapley_attribution['Attributed Credit'] / shapley_attribution['Attributed Credit'].sum()) * 100,
    2,
)
shapley_attribution['Attributed Credit %'] = (
    shapley_attribution['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
shapley_attribution.style.apply(
    highlight_max_attribution,
    df=shapley_attribution,
    axis=1,
)

Unnamed: 0,Channel,Attributed Credit,Attributed Credit %
0,Facebook,4716.404051,26.74%
1,Instagram,2855.137533,16.19%
2,Online Display,2178.145834,12.35%
3,Online Video,3198.808492,18.13%
4,Paid Search,4690.50409,26.59%


# Time Decay Model

In [None]:
# Cookie ids of the journeys whose conversion flag is 1.
converted = df_paths.loc[df_paths['conversion'] == 1, 'cookie']

In [None]:
# Keep only the touch points of cookies whose journey converted. The explicit
# .copy() makes df_converted an independent frame, so the columns that later
# cells add to it do not raise SettingWithCopyWarning.
df_converted = df[df['cookie'].isin(converted)].copy()
df_converted.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,visit_order
586722,ooonih0kon3FDAB90EfADEFnn,2018-07-13 08:43:38+00:00,impression,0,0.0,Facebook,1
586723,ooonih0kon3FDAB90EfADEFnn,2018-07-13 09:35:35+00:00,conversion,1,7.5,Instagram,2
586704,oookCEDh03D7oo3f0FkB799E7,2018-07-18 17:53:15+00:00,impression,0,0.0,Paid Search,1
586705,oookCEDh03D7oo3f0FkB799E7,2018-07-23 18:08:02+00:00,impression,0,0.0,Online Video,2
586706,oookCEDh03D7oo3f0FkB799E7,2018-07-29 20:22:26+00:00,impression,0,0.0,Online Video,3


References:
* https://api-docs.freewheel.tv/beeswax/docs/antenna-sql-tutorial-custom-attribution-model#:~:text=Thus%2C%20the%20major%20factor%20in,prior%20to%20the%20conversion%20event



* https://www.optimizesmart.com/time-decay-attribution-model-in-google-analytics/

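Decay factor < 1 (e.g., 0.5): credit decreases exponentially as the time between an interaction and the conversion grows. With a factor of 0.5, an interaction that happens one time unit earlier receives half the credit, so the model is sensitive to time and heavily favours recent interactions. The implementation below uses a time unit of 7 days (a 7-day half-life): an interaction $d$ whole days before the cookie's latest interaction receives a weight of $0.5^{d/7} = 2^{-d/7}$.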

In [None]:
# Timestamp of each cookie's latest interaction, repeated on all of that
# cookie's rows. Every cookie here belongs to a journey flagged as converted,
# so this is used as the reference time of the conversion.
# assign() returns a new frame, so nothing is written into a filtered slice of
# df (which is what raises SettingWithCopyWarning).
df_converted = df_converted.assign(
    max_timestamp=df_converted.groupby('cookie')['time'].transform('max')
)

# Define the time decay function
def time_decay_credit(row, half_life_days=7):
    '''
    Time-decay weight of one touch point: 2 ** (-days / half_life_days).

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'max_timestamp' (reference time) and 'time' (time of the
        touch point); their difference must expose a ``.days`` attribute.
    half_life_days : float, default 7
        Number of days after which the weight halves.

    Returns
    -------
    float
        1.0 for a touch point less than one whole day before the reference
        time, 0.5 at ``half_life_days`` days, 0.25 at twice that, and so on.

    Raises
    ------
    ValueError
        If ``half_life_days`` is not strictly positive.
    '''
    if half_life_days <= 0:
        raise ValueError("half_life_days must be positive")
    # .days keeps whole days only, so the weight changes in daily steps.
    elapsed_days = (row['max_timestamp'] - row['time']).days
    return 2 ** (-elapsed_days / half_life_days)

# Score every touch point with its time-decay weight (row by row).
# assign() returns a new frame instead of setting a column on a filtered slice
# of df, which avoids SettingWithCopyWarning.
df_converted = df_converted.assign(
    time_decay=df_converted.apply(time_decay_credit, axis=1)
)

In [None]:
# Total time-decay weight collected by each channel over all converting journeys.
time_decay_attribution = (
    df_converted
    .groupby('channel')['time_decay']
    .sum()
    .reset_index()
)
time_decay_attribution.columns = ['Channel', 'Time decay value']

# Scale each channel's share of the total weight to the number of conversions,
# so the credits add up to total_conversions.
time_decay_attribution['Attributed Credit'] = (
    (time_decay_attribution['Time decay value'] / time_decay_attribution['Time decay value'].sum())
    * total_conversions
)

# The same share as a percentage, rounded to two decimals and then rendered as
# text such as "32.60%".
time_decay_attribution['Attributed Credit %'] = round(
    (time_decay_attribution['Time decay value'] / time_decay_attribution['Time decay value'].sum()) * 100,
    2,
)
time_decay_attribution['Attributed Credit %'] = (
    time_decay_attribution['Attributed Credit %'].apply(lambda x: f"{x:.2f}%")
)

# Display the table with the top-credited channel's row highlighted.
time_decay_attribution.style.apply(
    highlight_max_attribution,
    df=time_decay_attribution,
    axis=1,
)

Unnamed: 0,Channel,Time decay value,Attributed Credit,Attributed Credit %
0,Facebook,15459.504162,5750.748303,32.60%
1,Instagram,6578.676695,2447.18805,13.87%
2,Online Display,4090.423392,1521.587959,8.63%
3,Online Video,12359.98701,4597.765464,26.07%
4,Paid Search,8929.619303,3321.710225,18.83%


---