In [None]:
# Mount Google Drive at '/content/drive'; later cells read the dataset from and
# save the trained model to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score

from imblearn.over_sampling import SMOTE
from collections import Counter

# **Data & Pre-processing**

In [None]:
# Load the raw attribution log from Drive and preview the first rows.
# NOTE(review): the preview suggests one row per (cookie, interaction) — confirm against the data source.
df = pd.read_csv('/content/drive/My Drive/Masters Thesis/data/attribution_data.csv')
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display
4,0000nACkD9nFkBBDECD3ki00E,2018-07-03T09:44:57Z,impression,0,0.0,Paid Search


In [None]:
# Parse the ISO-8601 strings in `time` into pandas datetimes so they can be sorted and subtracted
df['time'] = pd.to_datetime(df['time'])

In [None]:
# visit_order per cookie based on time order
# Rows are sorted by cookie (descending) and, within a cookie, by time (ascending),
# so cumcount() + 1 numbers each cookie's interactions chronologically starting at 1.
df = df.sort_values(['cookie', 'time'], ascending=[False, True])
df['visit_order'] = df.groupby('cookie').cumcount() + 1

In [None]:
# Latest interaction timestamp per cookie, broadcast onto every row of that cookie.
# This is the max of `time` over all of the cookie's rows, whether or not the
# journey contains a conversion.
df['max_timestamp'] = df.groupby('cookie')['time'].transform('max')

# Define the time decay function
def time_decay_credit(row, half_life_days=7):
    """Return the exponential time-decay weight of a single interaction.

    The weight is ``2 ** (-elapsed_days / half_life_days)``, i.e. it halves for
    every ``half_life_days`` whole days between the interaction and the latest
    interaction of the same cookie.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``'time'`` and ``'max_timestamp'``; their difference must
        expose a ``.days`` attribute (``pandas.Timedelta`` or
        ``datetime.timedelta``).
    half_life_days : float, default 7
        Number of days after which the weight drops to 0.5. Must be positive.

    Returns
    -------
    float
        1.0 for interactions less than one full day before ``max_timestamp``,
        decreasing towards 0 for older interactions.

    Raises
    ------
    ValueError
        If ``half_life_days`` is not strictly positive.
    """
    if half_life_days <= 0:
        raise ValueError("half_life_days must be positive")
    # `.days` keeps only whole days, so the hour/minute remainder is discarded.
    elapsed_days = (row['max_timestamp'] - row['time']).days
    return 2 ** (-elapsed_days / half_life_days)

# Apply the time decay function
# axis=1 passes each row to time_decay_credit; the returned weight is stored per interaction.
df['time_decay'] = df.apply(time_decay_credit, axis=1)

In [None]:
# Preview the frame with the added visit_order, max_timestamp and time_decay columns
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel,visit_order,max_timestamp,time_decay
586736,ooooohAFofEnonEikhAi3fF9o,2018-07-14 17:17:12+00:00,impression,0,0.0,Paid Search,1,2018-07-14 17:17:12+00:00,1.0
586734,ooooiBh70D3k3BfAhDFfii9h7,2018-07-03 12:57:25+00:00,impression,0,0.0,Paid Search,1,2018-07-19 08:17:59+00:00,0.226431
586735,ooooiBh70D3k3BfAhDFfii9h7,2018-07-19 08:17:59+00:00,impression,0,0.0,Online Video,2,2018-07-19 08:17:59+00:00,1.0
586731,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-06 23:30:38+00:00,impression,0,0.0,Online Display,1,2018-07-12 23:50:54+00:00,0.552045
586732,ooooEiB0CCoEf9fiiC90Dfhfk,2018-07-12 23:50:45+00:00,impression,0,0.0,Online Display,2,2018-07-12 23:50:54+00:00,1.0


In [None]:
# Conversion flag of each cookie's last row (df is ordered by time within a cookie,
# so the last row is the most recent interaction).
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]

# One row per cookie: the list of channels and the list of time-decay weights,
# joined with the conversion flag; the cookie id is dropped afterwards.
df_paths = (
    df.groupby('cookie')[['channel', 'time_decay']]
    .aggregate(lambda values: values.tolist())
    .reset_index()
    .merge(df_last_interaction, how='left', on='cookie')
    .drop(columns=['cookie'])
)

In [None]:
# Preview the per-cookie paths: channel list, time-decay list and conversion label
df_paths.head()

Unnamed: 0,channel,time_decay,conversion
0,"[Instagram, Online Display, Online Display, On...","[0.08411876203952225, 0.33647504815808904, 0.6...",0
1,"[Paid Search, Paid Search, Paid Search, Paid S...","[0.45286183213195336, 0.5, 1.0, 1.0, 1.0, 1.0]",0
2,"[Paid Search, Paid Search, Paid Search, Paid S...","[0.7429971445684742, 0.820335356007638, 0.9057...",0
3,[Instagram],[1.0],0
4,[Paid Search],[1.0],0


In [None]:
# Number of journeys labelled as converted; the label is the conversion flag
# taken from each cookie's last interaction when df_paths was built.
total_conversions = sum(df_paths['conversion'])
print("No. of conversions: ", total_conversions)

No. of conversions:  17639


In [None]:
# reference: https://www.geeksforgeeks.org/highlight-the-maximum-value-in-each-column-in-pandas/
def highlight_max_attribution(row, df):
    '''
    Row-wise styling helper for a pandas Styler.

    Shades every cell of `row` light green when the row's 'Attributed Credit'
    equals the maximum 'Attributed Credit' in `df`; otherwise applies no style.

    Returns a list with one CSS string per entry of `row.index`.
    '''
    row_credit = row.loc['Attributed Credit']
    top_credit = df['Attributed Credit'].max()
    cell_style = 'background-color: lightgreen' if row_credit == top_credit else ''
    return [cell_style] * len(row.index)

# **LSTM + Attention & Time decay**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model

from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.metrics import AUC

In [None]:
# Extract the per-journey channel paths, time-decay weights and conversion labels
sequences = df_paths['channel'].tolist()
time_decays = df_paths['time_decay'].tolist()
labels = df_paths['conversion'].tolist()

# Tokenization: map every channel name to an integer id.
# Each journey is passed as a list of strings, so every list entry (a full
# channel name) is handled as one token.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
# Converts each sequence in sequences to a sequence of integers based on the tokenizer's vocabulary.
sequences_encoded = tokenizer.texts_to_sequences(sequences)

# Padding: right-pad every journey to the length of the longest one.
# Padded positions get token id 0 and time decay 0.0.
max_len = max(len(seq) for seq in sequences_encoded)
padded_sequences = pad_sequences(sequences_encoded, maxlen=max_len, padding='post')
padded_time_decays = pad_sequences(time_decays, maxlen=max_len, padding='post', dtype='float32')

# Combine padded sequences with padded time decay into (n_journeys, max_len, 2):
# feature 0 is the channel token id, feature 1 the time-decay weight.
# A single vectorized stack replaces the per-journey Python loop of column_stack
# calls and yields the same array.
combined_input = np.stack([padded_sequences, padded_time_decays], axis=-1)
X = combined_input

# Convert labels to numpy array
y = np.array(labels)

# Vocabulary size: size of the tokenizer's vocabulary + 1 for the padding token.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
# Sanity-check shapes: (journeys, max_len, 2 features) vs. (journeys, max_len)
combined_input.shape, padded_sequences.shape

((240108, 134, 2), (240108, 134))

In [None]:
# Hold out 20% of the journeys for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified although conversions are rare
# (17639 of 240108 journeys) — consider whether stratify=y is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=111)

In [None]:
# Model definition
# Each of the max_len time steps carries 2 features: the channel token id and
# its time-decay weight (the two columns stacked into combined_input).
input_layer = Input(shape=(max_len, 2))
# return_sequences=True keeps one 64-unit output per time step for the attention layer.
lstm_layer = LSTM(64, return_sequences=True)(input_layer)

# Attention mechanism
# The LSTM output is passed as both inputs of the Attention layer (self-attention).
# NOTE(review): no mask is supplied, so padded time steps appear to take part in
# the attention computation — confirm this is intended.
attention_layer = Attention()([lstm_layer, lstm_layer])

# Flatten the attention output to feed into dense layers
attention_flat = tf.keras.layers.Flatten()(attention_layer)
dense_layer = Dense(32, activation='relu')(attention_flat)
# Single sigmoid unit: predicted probability that the journey converts.
output_layer = Dense(1, activation='sigmoid')(dense_layer)

# Layer order matters: a later cell reads the Attention layer via model.layers[2].
model = Model(inputs=input_layer, outputs=output_layer)

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC(name='auc')])

In [None]:
# Train model
# 5 epochs, mini-batches of 128; validation_split=0.2 holds out 20% of X_train for validation.
# NOTE(review): no random seed is set for TensorFlow in the visible cells, so
# the metrics may vary between runs.
model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

Epoch 1/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 318ms/step - accuracy: 0.9211 - auc: 0.5491 - loss: 0.2731 - val_accuracy: 0.9262 - val_auc: 0.5854 - val_loss: 0.2577
Epoch 2/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m436s[0m 313ms/step - accuracy: 0.9262 - auc: 0.5826 - loss: 0.2573 - val_accuracy: 0.9262 - val_auc: 0.5856 - val_loss: 0.2599
Epoch 3/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m383s[0m 314ms/step - accuracy: 0.9268 - auc: 0.5783 - loss: 0.2560 - val_accuracy: 0.9262 - val_auc: 0.5855 - val_loss: 0.2563
Epoch 4/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 320ms/step - accuracy: 0.9264 - auc: 0.5824 - loss: 0.2556 - val_accuracy: 0.9263 - val_auc: 0.5862 - val_loss: 0.2573
Epoch 5/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m440s[0m 318ms/step - accuracy: 0.9280 - auc: 0.5826 - loss: 0.2532 - val_accuracy: 0.9265 - val_auc: 0.5879 - val_loss: 0.2561


<keras.src.callbacks.history.History at 0x7b55deb64610>

In [None]:
# Persist the trained model to Drive as an .h5 file so it can be reloaded without retraining
model.save('/content/drive/My Drive/Masters Thesis/saved_models/lstmAttention_model-timedecay.h5')



In [None]:
# model = tf.keras.models.load_model('/content/drive/My Drive/Masters Thesis/saved_models/lstmAttention_model-timedecay.h5', custom_objects={'Attention': Attention})

In [None]:
# Print the layer stack of the model
model.summary()

In [None]:
# Evaluate model
# Scores the held-out test split; the next cell reads `results` as
# [loss, accuracy, auc], matching the loss and metrics passed to compile().
results = model.evaluate(X_test, y_test)

[1m1501/1501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 33ms/step - accuracy: 0.9277 - auc: 0.5824 - loss: 0.2537


In [None]:
# Report the evaluation metrics, rounded to two decimals.
# The first two entries of `results` are loss and accuracy; a third entry,
# when present, is reported as AUC.
loss_value, accuracy_value, *remaining_metrics = results
print(f'Loss: {round(loss_value,2)}')
print(f'Accuracy: {round(accuracy_value,2)}')
if remaining_metrics:
    print(f'AUC: {round(remaining_metrics[0],2)}')  # If AUC or other metrics are included

Loss: 0.26
Accuracy: 0.93
AUC: 0.59


## Assigning Attribution values

In [None]:
# Inspect the layer at index 2; it is used to build attention_model in the next cell
model.layers[2]

<Attention name=attention, built=True>

In [None]:
# Sub-model that shares the trained model's input and stops at model.layers[2]
# (the Attention layer), then runs it on every test journey.
# NOTE(review): this captures the Attention layer's output tensor, not its
# attention score matrix, so the name `attention_weights` may be misleading —
# confirm this is the quantity intended for attribution.
attention_model = Model(inputs=model.input, outputs=model.layers[2].output)
attention_weights = attention_model.predict(X_test)

[1m1501/1501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 33ms/step


In [None]:
# Title-cased channel names in tokenizer.word_index order.
# NOTE(review): Keras Tokenizer ids start at 1 (0 is the padding id here), so
# list position k corresponds to token id k + 1 — keep this offset in mind when
# looking names up by token id.
channel_names = [channel.title() for channel in list(tokenizer.word_index.keys())]

In [None]:
def compute_unnormalized_attributions(X_sequences, y_labels, attention_weights, channel_names):
    """Average the per-step attention magnitude of each channel over converting journeys.

    Parameters
    ----------
    X_sequences : array-like, indexed as [journey][step][feature]
        Feature 0 of every step is the channel token id. Token ids are 1-based:
        id ``k`` refers to ``channel_names[k - 1]`` and id 0 marks padding.
    y_labels : array-like
        Conversion label per journey; only journeys with label 1 are used.
    attention_weights : array-like, indexed as [journey][step][unit]
        Attention-layer values for every journey and step.
    channel_names : list of str
        Channel names ordered by token id (the name for id 1 first).

    Returns
    -------
    dict
        ``{channel_name: mean score}`` where the score of a step is the mean
        absolute attention value across that step's units. Channels that never
        occur in a converting journey map to 0.
    """
    # Running totals and occurrence counts per channel
    channel_attributions = {channel: 0 for channel in channel_names}
    channel_counts = {channel: 0 for channel in channel_names}
    n_channels = len(channel_names)

    for i, (sequence, label) in enumerate(zip(X_sequences, y_labels)):
        if label != 1:  # Only consider sequences ending in a conversion
            continue

        # One score per time step: mean |value| over the unit axis (axis=1)
        step_scores = np.mean(np.abs(attention_weights[i]), axis=1)

        for j, step_features in enumerate(sequence):
            token_id = int(step_features[0])
            # Skip padding (id 0) and ids outside the vocabulary; valid ids are
            # 1..n_channels and are shifted by one to index channel_names.
            if 1 <= token_id <= n_channels:
                channel_name = channel_names[token_id - 1]
                channel_attributions[channel_name] += step_scores[j]
                channel_counts[channel_name] += 1

    # Mean attribution per channel (0 when the channel was never seen)
    mean_attributions = {
        channel: channel_attributions[channel] / channel_counts[channel] if channel_counts[channel] > 0 else 0
        for channel in channel_names
    }

    return mean_attributions

# Compute unnormalized attributions for each channel
# Uses the test split only; journeys with y_test != 1 are ignored by the function.
mean_attributions = compute_unnormalized_attributions(X_test, y_test, attention_weights, channel_names)

In [None]:
# Convert to DataFrame for easy visualization
# One row per channel, in the key order of mean_attributions (same order as channel_names).
df_mean_attributions = pd.DataFrame(list(mean_attributions.items()), columns=['Channel', 'Mean Attribution'])
df_mean_attributions

Unnamed: 0,Channel,Mean Attribution
0,Facebook,0.003333
1,Paid Search,0.005803
2,Online Video,0.003511
3,Instagram,0.006794
4,Online Display,0.006082


In [None]:
# Normalize attribution scores to get relative importance for channels
mean_attribution = df_mean_attributions['Mean Attribution']
score = mean_attribution / mean_attribution.sum()

# Each channel's share of the total score
channel_share = score / score.sum()

# Distribute all conversions in the full label array `y` according to the shares
total_conversions = np.sum(y)
channel_attribution_credit = channel_share * total_conversions

# The same shares as percentage strings with two decimals
channel_attribution_percentages = (channel_share * 100).round(2).apply(lambda x: f"{x:.2f}%")

In [None]:
# Create a DataFrame for presentation of channel attributions
# channel_names and the three Series share the same row order (that of
# df_mean_attributions), so they line up positionally.
channel_attribution_df = pd.DataFrame({
    'Channel': channel_names,
    'Attribution Score': score,
    'Attributed Credit': channel_attribution_credit,
    'Attribution Percentage': channel_attribution_percentages
})

# Sort rows into a fixed alphabetical channel order via an ordered categorical
desired_order = ['Facebook', 'Instagram', 'Online Display', 'Online Video', 'Paid Search']
channel_attribution_df['Channel'] = pd.Categorical(channel_attribution_df['Channel'], categories=desired_order, ordered=True)
channel_attribution_df = channel_attribution_df.sort_values('Channel').reset_index(drop=True)

# axis=1 styles row by row; the row holding the largest 'Attributed Credit' is shaded green
channel_attribution_df.style.apply(highlight_max_attribution, df=channel_attribution_df, axis=1)

Unnamed: 0,Channel,Attribution Score,Attributed Credit,Attribution Percentage
0,Facebook,0.130588,2303.450246,13.06%
1,Instagram,0.266197,4695.441708,26.62%
2,Online Display,0.238303,4203.422158,23.83%
3,Online Video,0.137561,2426.431368,13.76%
4,Paid Search,0.227352,4010.254519,22.74%


---