### Checking Memory Usage

In [1]:
import psutil
def mem_usage(threshold=100 * 1024 * 1024):
    """Print a summary of system memory, or a warning when memory is low.

    A single ``psutil.virtual_memory()`` snapshot drives both the threshold
    check and the reported figures, so the printed numbers are mutually
    consistent.

    Args:
        threshold: Available-memory level in bytes; when the available memory
            is at or below it, only the warning is printed. Defaults to 100 MB.

    Returns:
        None. All output is written to stdout.
    """
    bytes_per_gib = 1024.0 ** 3

    # Sample once: repeated calls can return different values between lines.
    mem = psutil.virtual_memory()

    if mem.available <= threshold:
        print('Memory Usage Warning!')
        return

    mem_total = round(mem.total / bytes_per_gib, 3)
    mem_avail = round(mem.available / bytes_per_gib, 3)
    mem_used = round(mem.used / bytes_per_gib, 3)

    print('Sufficient Memory is available! \n')
    print('Total CPU Count=', psutil.cpu_count())
    print(f'Total Memory= {mem_total} GB')
    print(f'Total Used Memory= {mem_used} GB')
    print(f'Total Available Memory= {mem_avail} GB')
        
mem_usage()

Sufficient Memory is available! 

Total CPU Count= 8
Total Memory= 31.417 GB
Total Used Memory= 0.5 GB
Total Available Memory= 30.549 GB


### Libraries

In [2]:
!pip install -U -q PyDrive
#!pip install --upgrade plotly
#!pip install plotly==4.8.2
#!pip install ChannelAttribution

import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
import plotly.express as px
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import matplotlib.style as style
import seaborn as sns
import glob
from ChannelAttribution import *

Looking for attribution at path level? Try ChannelAttributionPro! Visit www.channelattribution.net for more information.


### Importing data from GCP

In [3]:
import google.auth
from google.cloud import bigquery
from google.cloud import bigquery_storage

# Resolve application-default credentials with the cloud-platform scope.
# The project id returned here is not used: the BigQuery client below is
# pinned to an explicit project instead.
credentials, your_project_id = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])

# One client for running queries, one Storage API client for downloading results.
bqclient = bigquery.Client(project='festive-radar-307222', credentials=credentials)
bqstorageclient = bigquery_storage.BigQueryReadClient(credentials=credentials)

In [4]:
# Alternative aggregate tables (disabled); only mta_data_agg_1 is loaded.
#query_string = """SELECT * FROM `festive-radar-307222.mta_data.mta_data_agg_0`;"""
#query_string = """SELECT * FROM `festive-radar-307222.mta_data.mta_data_agg_2`;"""
query_string = """SELECT * FROM `festive-radar-307222.mta_data.mta_data_agg_1`;"""

# Run the query, wait for it to finish, then download the rows into a
# DataFrame through the BigQuery Storage client.
df = bqclient.query(query_string).result().to_dataframe(bqstorage_client=bqstorageclient)

### Model Development

In [5]:
# Preview the first five rows returned by the query.
df.head(5)

Unnamed: 0,user_analytics_id,session_cumsum,conv_1_life,visitor_source
0,e02ff5a5fb614470b58aba7d8e45b0ee,1,0,Organic - SEO
1,eb08d2ce3091477eacdafd0e7257cc7a,1,0,Organic - SEO
2,83f0452cb9b3418995348edba451dfe8,1,0,Paid Search - B
3,a65c376301224406be0fd8d016acaf15,1,0,Organic - SEO
4,c5622c8eacaa4a61bca357453acbd743,1,0,Organic - SEO


In [6]:
# List the column names available in the loaded table.
df.columns

Index(['user_analytics_id', 'session_cumsum', 'conv_1_life', 'visitor_source'], dtype='object')

In [7]:
# Order rows by user and then by session counter, so that each user's rows
# appear in session order before the paths are built.
data1 = df.sort_values(['user_analytics_id', 'session_cumsum'])

In [36]:
#data1[data1['user_analytics_id'] == 'd5c48aee85cf46cf9c8639434e705bbc']

Unnamed: 0,user_analytics_id,session_cumsum,conv_1_life,visitor_source
9387014,d5c48aee85cf46cf9c8639434e705bbc,2,0,Direct
8373236,d5c48aee85cf46cf9c8639434e705bbc,3,0,Organic - Brand
9502501,d5c48aee85cf46cf9c8639434e705bbc,4,0,Earned Media
9190600,d5c48aee85cf46cf9c8639434e705bbc,5,0,Organic - Brand
9644715,d5c48aee85cf46cf9c8639434e705bbc,6,0,Email
9205110,d5c48aee85cf46cf9c8639434e705bbc,7,0,Organic - Brand


In [8]:
# Build one row per user:
#   - marketing_channel_subset: the user's visitor sources joined with ' > ',
#     in the row order of data1 (sorted by user and session counter)
#   - conv_1_life: the maximum conversion flag seen across the user's rows
# The aggregation is named by string ('max') rather than passing the Python
# builtin, which newer pandas versions warn about.
data2 = (
    data1.groupby(['user_analytics_id'], as_index=False)
    .agg({
        'visitor_source': lambda x: ' > '.join(map(str, x)),
        'conv_1_life': 'max',
    })
    .rename(columns={'visitor_source': 'marketing_channel_subset'})
)

In [10]:
# Spot check: show the assembled channel path for user d5c48aee85cf46cf9c8639434e705bbc
data2.loc[data2['user_analytics_id'] == 'd5c48aee85cf46cf9c8639434e705bbc', 'marketing_channel_subset']

14363711    Direct > Organic - Brand > Earned Media > Orga...
Name: marketing_channel_subset, dtype: object

In [11]:
# Flag non-converting users: non_conv is 1 when conv_1_life equals 0, else 0.
# Vectorised comparison instead of a per-row Python lambda (the frame has one
# row per user, so row-wise apply is needlessly slow).

data2['non_conv'] = data2['conv_1_life'].eq(0).astype('int64')
data2.head()

Unnamed: 0,user_analytics_id,marketing_channel_subset,conv_1_life,non_conv
0,000003f0a7c94f0881a2569974046205,Organic - SEO,0,1
1,0000042cc6d6499b85e3229158e7bb1f,Organic - SEO,0,1
2,0000048d47894d2b9bae3b26b64e2190,Organic - SEO,0,1
3,00000490b3e54a3b96d9abe9fce085ca,Organic - SEO,0,1
4,00000667738644d1a023d5c73b00e69d,Organic - SEO,0,1


In [12]:
# Overall conversion summary across users (data2 has one row per user).
# Series.sum() is vectorised; the builtin sum() iterates element by element,
# and the original evaluated it twice.
total_conversions = data2['conv_1_life'].sum()
total_users = len(data2)

print('Total conversions: {}'.format(total_conversions))
print('Total data points: {}'.format(total_users))
print('Total conversion rate: {}%'.format(round(total_conversions / total_users * 100, 3)))

Total conversions: 47767
Total data points: 17200179
Total conversion rate: 0.278%


In [13]:
# For each distinct channel path, total the conversions and non-conversions.
# Only the two count columns are summed: aggregating every column would also
# hit the string column user_analytics_id, which newer pandas versions
# concatenate instead of silently dropping.
data3 = (
    data2.groupby(['marketing_channel_subset'], as_index=False)[['conv_1_life', 'non_conv']]
    .sum()
)
data3.head()

Unnamed: 0,marketing_channel_subset,conv_1_life,non_conv
0,Blog,5,2377
1,Blog > Direct,1,42
2,Blog > Direct > Email,0,1
3,Blog > Direct > Organic - SEO,0,1
4,Blog > Direct > Organic - SEO > Direct,0,1


#### Unique paths from Marketing channel subset

In [14]:
# (rows, columns) of the path-level table: one row per distinct channel path.
# Original author's note: the paths are built from 22 channels.
data3.shape

(14527, 3)

#### Max length of a path (Max channels present in a single path)

In [25]:
# Largest number of '>' separators found in any path.
# NOTE(review): paths are joined with ' > ', so this counts separators, i.e.
# one less than the number of channel touches in the path (assuming no channel
# name itself contains '>') — confirm which figure is intended.
data3.marketing_channel_subset.str.count('>').max()

266

E.g., when consecutive repeated channels in a path are collapsed, the path

Organic - SEO > Direct > Organic - SEO > Organic - SEO > Email

would become

Organic - SEO > Direct > Organic - SEO > Email

#### Top customer conversion paths

In [16]:
# The 100 channel paths with the highest conversion counts, largest first.
data3.sort_values('conv_1_life', ascending=False).head(100)

Unnamed: 0,marketing_channel_subset,conv_1_life,non_conv
11226,Partnerships,12309,979354
4934,Organic - SEO,5432,12108350
59,Direct,5327,521863
9845,Paid Search - NB,2900,287353
8834,Paid Search - B,2227,80666
...,...,...,...
10528,Paid Search - NB > Organic - SEO > Direct,22,459
7164,Organic - SEO > Paid Search - B > Email,22,153
8380,Organic - SEO > Referral,22,2640
11502,Partnerships > Direct > Partnerships,21,425


## Heuristic & Markov models

In [17]:
# Heuristic attribution: heuristic_models (from ChannelAttribution) receives the
# path-level table, the name of the path column and the name of the
# conversion-count column. The displayed result has one row per channel with
# first_touch, last_touch and linear_touch columns.
H = heuristic_models(data3, "marketing_channel_subset", "conv_1_life")
H.head()

Unnamed: 0,channel_name,first_touch,last_touch,linear_touch
0,Blog,6.0,24.0,19.42619
1,Direct,7755.0,7414.0,7481.817356
2,Email,1103.0,6248.0,4276.690414
3,Organic - SEO,8901.0,10330.0,9406.365824
4,Display,35.0,145.0,115.033835


In [26]:
# Markov attribution: same table and columns as the heuristic call, plus the
# non-conversion counts passed through var_null.
M = markov_model(data3, "marketing_channel_subset", "conv_1_life", var_null='non_conv')
# Rename the returned columns so M shares the 'channel_name' key used when it
# is merged with the heuristic results.
# NOTE(review): assumes markov_model returns exactly two columns, channel first
# and attributed conversions second — confirm against the library docs.
M.columns=["channel_name","markov_model"]
#M.head(20)

Number of simulations: 100000 - Convergence reached: 2.55% < 5.00%
Percentage of simulated paths that successfully end before maximum number of steps (40) is reached: 99.99%


In [56]:
# Combine heuristic and Markov results per channel (channels present in both),
# ordered alphabetically by channel name.
R = H.merge(M, on="channel_name", how="inner").sort_values(by='channel_name')

In [55]:
# Append a 'Total' row holding each model column's own sum.
# Work on a copy: plain assignment would only alias R, so the extra row would
# also appear in R and distort the percentage calculations and charts below.
Rmodel = R.copy()
# Sum the numeric model columns only; channel_name is left empty by this step
# and labelled explicitly on the next line.
Rmodel.loc['Total'] = Rmodel.sum(numeric_only=True)
Rmodel.loc['Total', 'channel_name'] = 'Total'
print(Rmodel)

           channel_name  first_touch  last_touch  linear_touch  markov_model
0                  Blog          6.0        24.0     19.426190     21.254068
13      Connected TV/TV          0.0         2.0      1.116667      3.036295
1                Direct       7755.0      7414.0   7481.817356   6773.975146
4               Display         35.0       145.0    115.033835    194.322909
5          Earned Media         22.0       368.0    236.657337    394.718408
2                 Email       1103.0      6248.0   4276.690414   6503.744851
6              Facebook        984.0       713.0    831.640188    734.783499
19                Gmail         24.0        28.0     25.969120     21.254068
18           Influencer        132.0       108.0    120.550000    103.234045
17             Internal          0.0         5.0      1.483333      3.036295
14             Lead Gen        134.0        86.0    106.895238    118.415523
9       Organic - Brand        186.0       813.0    508.706151    798.545703

In [57]:
# Sum of conversions over all paths whose text starts with "Partnerships",
# i.e. paths having Partnerships as the FIRST touch point.

data3.loc[data3['marketing_channel_subset'].str.contains('^Partnerships.*'), 'conv_1_life'].sum()

18881

In [58]:
# Sum of conversions over all paths whose text ends with "Partnerships",
# i.e. paths having Partnerships as the LAST touch point.

data3.loc[data3['marketing_channel_subset'].str.contains('Partnerships$'), 'conv_1_life'].sum()

13817

#### Calculating the percentages for each channel per model

In [59]:
# Rescale each attribution column in place to a percentage of its own column
# total, so every model's column sums to 100.
for model_col in ('first_touch', 'last_touch', 'linear_touch', 'markov_model'):
    R[model_col] = R[model_col] / R[model_col].sum() * 100

In [60]:
# Display the per-channel attribution table (values are percentages after the
# rescaling step that precedes this cell).
R

Unnamed: 0,channel_name,first_touch,last_touch,linear_touch,markov_model
0,Blog,0.012561,0.050244,0.040669,0.044495
13,Connected TV/TV,0.0,0.004187,0.002338,0.006356
1,Direct,16.235058,15.521176,15.663151,14.181287
4,Display,0.073272,0.303557,0.240823,0.406814
5,Earned Media,0.046057,0.770406,0.495441,0.826341
2,Email,2.309126,13.08016,8.953232,13.615561
6,Facebook,2.06,1.492662,1.741035,1.538266
19,Gmail,0.050244,0.058618,0.054366,0.044495
18,Influencer,0.276341,0.226098,0.252371,0.21612
17,Internal,0.0,0.010467,0.003105,0.006356


In [65]:
# Export the first-touch attribution per channel to CSV, without the index.
# NOTE(review): when the notebook is run top to bottom, first_touch has already
# been rescaled to percentages at this point; the target path is specific to
# one machine.
R.loc[:, ['channel_name', 'first_touch']].to_csv(r'/home/ss13449/Data/conv_value.csv', index=False)

In [62]:
# Reshape to long form (one row per channel/model pair) so that plotly can
# colour the bars by attribution model.
R1 = R.melt(id_vars="channel_name")

# Grouped bar chart: channels on the x axis, one bar per model.
fig = px.bar(
    R1,
    x="channel_name",
    y="value",
    color="variable",
    barmode='group',
    hover_data=['value'],
    color_discrete_sequence=px.colors.qualitative.Bold,
)
# Transparent plot and paper backgrounds.
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)', paper_bgcolor='rgba(0, 0, 0, 0)')
fig.show()

### 1. Email attribution is significantly reduced in the first-touch model after improving the user channel paths
### 2. Organic - SEO and Direct also derive consistent attribution across all models
### 3. The top attributing channels from the Markov model are:

    a. Partnerships
    
    b. Organic - SEO
    
    c. Direct
    
    d. Email
    
    e. Paid Search - NB
    
    f. Paid Search - B

In [64]:
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from plotly import tools
# Sorted views of the attribution table. They are not referenced by the figure
# below, which plots R in its current row order.
R1 = R.sort_values(['first_touch'], ascending=False)
R2 = R.sort_values(['last_touch'], ascending=True)

# 2x2 grid; subplot titles are assigned row by row (left to right, top to bottom).
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("First Touch", "Linear Touch", "Last Touch", "Markov Conv model")
)

# One bar trace per attribution model, all sharing the channel names on x.
trace1 = go.Bar(x=R.channel_name, y=R.first_touch.values, name="First Touch model")
trace2 = go.Bar(x=R.channel_name, y=R.last_touch.values, name="Last Touch model")
trace3 = go.Bar(x=R.channel_name, y=R.linear_touch.values, name="Linear Touch model")
trace4 = go.Bar(x=R.channel_name, y=R.markov_model.values, name="Markov Conv model")

# add_trace with explicit row/col replaces the deprecated append_trace; the
# grid positions match the subplot titles above.
fig.add_trace(trace1, row=1, col=1)
fig.add_trace(trace2, row=2, col=1)
fig.add_trace(trace3, row=1, col=2)
fig.add_trace(trace4, row=2, col=2)

# Figure size and bar mode set in a single layout update.
fig.update_layout(height=1000, width=1100, barmode='stack')
fig.show()