In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score

from imblearn.over_sampling import SMOTE
from collections import Counter

# **Data & Pre-processing**

In [None]:
df = pd.read_csv('/content/drive/My Drive/Masters Thesis/data/attribution_data.csv')
df.head()

Unnamed: 0,cookie,time,interaction,conversion,conversion_value,channel
0,00000FkCnDfDDf0iC97iC703B,2018-07-03T13:02:11Z,impression,0,0.0,Instagram
1,00000FkCnDfDDf0iC97iC703B,2018-07-17T19:15:07Z,impression,0,0.0,Online Display
2,00000FkCnDfDDf0iC97iC703B,2018-07-24T15:51:46Z,impression,0,0.0,Online Display
3,00000FkCnDfDDf0iC97iC703B,2018-07-29T07:44:51Z,impression,0,0.0,Online Display
4,0000nACkD9nFkBBDECD3ki00E,2018-07-03T09:44:57Z,impression,0,0.0,Paid Search


In [None]:
df['time'] = pd.to_datetime(df['time'])

In [None]:
# Extract date
df['date'] = df['time'].dt.date

In [None]:
# visit_order per cookie based on time order
df = df.sort_values(['cookie', 'time'], ascending=[False, True])
df['visit_order'] = df.groupby('cookie').cumcount() + 1

In [None]:
df_paths = df.groupby('cookie')['channel'].aggregate(lambda x: x.tolist()).reset_index()
df_last_interaction = df.drop_duplicates('cookie', keep='last')[['cookie', 'conversion']]
df_paths = pd.merge(df_paths, df_last_interaction, how='left', on='cookie')
df_paths.drop(columns = ['cookie'], inplace = True)

In [None]:
total_conversions = sum(df_paths['conversion'])
print("No. of conversions: ", total_conversions)

No. of conversions:  17639


In [None]:
# reference: https://www.geeksforgeeks.org/highlight-the-maximum-value-in-each-column-in-pandas/
def highlight_max_attribution(row, df):
    '''
    highlight the maximum in a Series green.
    '''
    is_max = row.loc['Attributed Credit'] == df['Attributed Credit'].max()
    return ['background-color: lightgreen' if is_max else '' for v in row.index]

# LSTM + Attention (no class weights)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Attention, GlobalAveragePooling1D
from tensorflow.keras.metrics import AUC

In [None]:
# Extract sequences and labels
sequences = df_paths['channel'].tolist()
labels = df_paths['conversion'].tolist()

# Tokenization: to convert text into sequences of integers
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
# Converts each sequence in sequences to a sequence of integers based on the tokenizer's vocabulary
sequences_encoded = tokenizer.texts_to_sequences(sequences)

# Padding
max_len = max(len(seq) for seq in sequences_encoded)
X = pad_sequences(sequences_encoded, maxlen=max_len, padding='post')

# Convert labels to numpy array
y = np.array(labels)

# Vocabulary size: size of the tokenizer's vocabulary + 1 for the padding token.
vocab_size = len(tokenizer.word_index) + 1

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Model definition
input_layer = Input(shape=(max_len,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len)(input_layer)
lstm_layer = LSTM(64, return_sequences=True)(embedding_layer)

# Attention mechanism
attention_layer = Attention()([lstm_layer, lstm_layer])

# Flatten the attention output to feed into dense layers
attention_flat = tf.keras.layers.Flatten()(attention_layer)
dense_layer = Dense(32, activation='relu')(attention_flat)
output_layer = Dense(1, activation='sigmoid')(dense_layer)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', AUC(name='auc')])



In [None]:
# Train model
model.fit(X_train, y_train, epochs=5, batch_size=128, validation_split=0.2)

Epoch 1/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m471s[0m 389ms/step - accuracy: 0.9214 - auc: 0.5299 - loss: 0.2710 - val_accuracy: 0.9269 - val_auc: 0.5867 - val_loss: 0.2567
Epoch 2/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m475s[0m 396ms/step - accuracy: 0.9255 - auc: 0.5690 - loss: 0.2609 - val_accuracy: 0.9269 - val_auc: 0.5900 - val_loss: 0.2567
Epoch 3/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m483s[0m 402ms/step - accuracy: 0.9265 - auc: 0.5700 - loss: 0.2587 - val_accuracy: 0.9269 - val_auc: 0.5880 - val_loss: 0.2565
Epoch 4/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m489s[0m 391ms/step - accuracy: 0.9261 - auc: 0.5749 - loss: 0.2592 - val_accuracy: 0.9268 - val_auc: 0.5880 - val_loss: 0.2565
Epoch 5/5
[1m1201/1201[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m521s[0m 407ms/step - accuracy: 0.9263 - auc: 0.5710 - loss: 0.2590 - val_accuracy: 0.9269 - val_auc: 0.5913 - val_loss: 0.2568


<keras.src.callbacks.history.History at 0x7ba23c46afb0>

In [None]:
# model.save('/content/drive/My Drive/Masters Thesis/saved_models/lstmAttention_model.h5')



In [None]:
model = tf.keras.models.load_model('/content/drive/My Drive/Masters Thesis/saved_models/lstmAttention_model.h5', custom_objects={'Attention': Attention})



In [None]:
model.summary()

In [None]:
# Evaluate model
results = model.evaluate(X_test, y_test)

[1m1501/1501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 37ms/step - accuracy: 0.9239 - auc: 0.5840 - loss: 0.2652


In [None]:
# Print all results
print(f'Loss: {round(results[0],2)}')
print(f'Accuracy: {round(results[1],2)}')
if len(results) > 2:
    print(f'AUC: {round(results[2],2)}')  # If AUC or other metrics are included

Loss: 0.26
Accuracy: 0.93
AUC: 0.59


## Assigning Attribution values

In [None]:
model.layers[3]

<Attention name=attention, built=True>

In [None]:
attention_model = Model(inputs=model.input, outputs=model.layers[3].output)
attention_weights = attention_model.predict(X_test)

[1m1501/1501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 44ms/step


In [None]:
channel_names = [channel.title() for channel in list(tokenizer.word_index.keys())]

In [None]:
def compute_unnormalized_attributions(X_sequences, y_labels, attention_weights, channel_names):
    # Initialize a dictionary to store total attributions and counts for each channel
    channel_attributions = {channel: 0 for channel in channel_names}
    channel_counts = {channel: 0 for channel in channel_names}

    # Process each sequence
    for i, (sequence, label) in enumerate(zip(X_sequences, y_labels)):
        if label == 1:  # Only consider sequences ending in a conversion
            seq_attention_weights = attention_weights[i]

            # Average attention weights across the sequence length
            avg_attention_weights = np.mean(np.abs(seq_attention_weights), axis=1)
            # print(avg_attention_weights.shape)
            # print(sequence)
            # Map the averaged attention weights to channels
            for j, channel_index in enumerate(sequence):
                if channel_index < len(channel_names):  # Ensure the channel index is valid
                    channel_name = channel_names[channel_index]
                    channel_attributions[channel_name] += avg_attention_weights[j]
                    channel_counts[channel_name] += 1

    # Calculate mean attributions for each channel
    mean_attributions = {channel: channel_attributions[channel] / channel_counts[channel] if channel_counts[channel] > 0 else 0
                         for channel in channel_names}

    return mean_attributions

# Compute unnormalized attributions for each channel
mean_attributions = compute_unnormalized_attributions(X_test, y_test, attention_weights, channel_names)

# Convert to DataFrame for easy visualization
df_mean_attributions = pd.DataFrame(list(mean_attributions.items()), columns=['Channel', 'Mean Attribution'])
df_mean_attributions

Unnamed: 0,Channel,Mean Attribution
0,Facebook,0.009885
1,Paid Search,0.048592
2,Online Video,0.018785
3,Instagram,0.074338
4,Online Display,0.048353


In [None]:
# Normalize attribution scores to get relative importance for channels
total_conversions = np.sum(y)
score = df_mean_attributions['Mean Attribution']/ df_mean_attributions['Mean Attribution'].sum()
channel_attribution_credit = score / score.sum() * total_conversions
channel_attribution_percentages = round(score / score.sum() * 100, 2).apply(lambda x: f"{x:.2f}%")

In [None]:
# Create a DataFrame for presentation of channel attributions
channel_attribution_df = pd.DataFrame({
    'Channel': channel_names,
    'Attribution Score': score,
    'Attributed Credit': channel_attribution_credit,
    'Attribution Percentage': channel_attribution_percentages
})

desired_order = ['Facebook', 'Instagram', 'Online Display', 'Online Video', 'Paid Search']
channel_attribution_df['Channel'] = pd.Categorical(channel_attribution_df['Channel'], categories=desired_order, ordered=True)
channel_attribution_df = channel_attribution_df.sort_values('Channel').reset_index(drop=True)

channel_attribution_df.style.apply(highlight_max_attribution, df=channel_attribution_df, axis=1)

Unnamed: 0,Channel,Attribution Score,Attributed Credit,Attribution Percentage
0,Facebook,0.049437,872.012461,4.94%
1,Instagram,0.371779,6557.803821,37.18%
2,Online Display,0.24182,4265.467148,24.18%
3,Online Video,0.093949,1657.166641,9.39%
4,Paid Search,0.243015,4286.549929,24.30%


---