In [1]:
# Notebook 2 for product attribution analysis - sample to show different models that can be used
# This will contain RNN/LSTM, CNN/ViT, Recommender Systems, Multi-task Learning, etc
# Synthetic data again

In [2]:
# sample customer sales data across different channels like e commerce website , social media, print, tv 

In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Define sample data parameters
num_customers = 1000
channels = ['E-commerce', 'Social Media', 'Print', 'TV']
product_categories = ['Electronics', 'Clothing', 'Home & Garden', 'Sports & Outdoor']
regions = ['East', 'West', 'North', 'South']

# Sample product features dictionary
product_features = {
    'Electronics': [
        'Screen Size: 15 inches, Processor: Intel Core i7, RAM: 16GB',
        'Screen Size: 55 inches, Resolution: 4K Smart TV',
        'Processor: AMD Ryzen 5, GPU: NVIDIA GeForce GTX 1660',
        'Screen Size: 13.3 inches, Processor: Apple M1 Chip, SSD: 512GB'
    ],
    'Clothing': [
        'Material: Cotton, Size: M, Color: Blue',
        'Material: Polyester, Size: L, Color: Black',
        'Material: Silk, Size: S, Color: Red',
        'Material: Wool, Size: XL, Color: Grey'
    ],
    'Home & Garden': [
        'Type: Sofa, Color: Grey, Material: Fabric',
        'Type: Dining Table, Color: Brown, Material: Wood',
        'Type: Outdoor Grill, Color: Stainless Steel',
        'Type: Bed Frame, Color: White, Material: Metal'
    ],
    'Sports & Outdoor': [
        'Type: Running Shoes, Size: 10, Color: Black',
        'Type: Camping Tent, Size: 4-person, Color: Green',
        'Type: Backpack, Color: Blue, Capacity: 30L',
        'Type: Bicycle, Color: Red, Frame: Aluminum'
    ]
}

# Generate sample data
np.random.seed(0)
data = {
    'CustomerID': np.random.randint(1, 200,num_customers),
    'Channel': np.random.choice(channels, num_customers),
    'PurchaseAmount ($)': np.random.randint(50, 500, num_customers),
    'PurchaseDate': [datetime(2024, 6, 30) - timedelta(days=random.randint(0, 365)) for _ in range(num_customers)],
    'ProductCategory': np.random.choice(product_categories, num_customers),
    'Region': np.random.choice(regions, num_customers)
}

# Add product features based on product category
data['ProductFeatures'] = [random.choice(product_features[category]) for category in data['ProductCategory']]

# Create DataFrame
df = pd.DataFrame(data)

# Save DataFrame to CSV file
#df.to_csv('./sample_customer_sales_data.csv', index=False)

print(df.head())
df.info()

   CustomerID       Channel  PurchaseAmount ($) PurchaseDate  \
0         173            TV                 205   2023-12-23   
1          48            TV                 391   2023-08-24   
2         118  Social Media                  51   2024-05-04   
3         193    E-commerce                 315   2023-09-20   
4          68  Social Media                 165   2023-09-18   

    ProductCategory Region                                    ProductFeatures  
0       Electronics  North  Screen Size: 13.3 inches, Processor: Apple M1 ...  
1       Electronics   East  Processor: AMD Ryzen 5, GPU: NVIDIA GeForce GT...  
2     Home & Garden  North     Type: Bed Frame, Color: White, Material: Metal  
3  Sports & Outdoor  North         Type: Bicycle, Color: Red, Frame: Aluminum  
4          Clothing  North              Material: Wool, Size: XL, Color: Grey  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-N

In [4]:
df.groupby(['CustomerID','Channel','PurchaseDate']).sum('PurchaseAmount').sort_values(['CustomerID'],ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PurchaseAmount ($)
CustomerID,Channel,PurchaseDate,Unnamed: 3_level_1
199,TV,2023-08-22,332
199,Social Media,2023-09-02,437
199,E-commerce,2024-06-20,52
199,E-commerce,2024-04-07,492
199,E-commerce,2023-12-29,317
...,...,...,...
1,Print,2023-07-04,243
1,TV,2023-10-24,485
1,TV,2024-03-21,395
1,TV,2024-06-06,286


In [5]:
df.shape

(1000, 7)

In [6]:
df['ProductFeatures'] = df['ProductFeatures'].apply(str)
df['ProductFeatures'] = df['ProductFeatures'].apply(lambda x: x.split(', '))


In [7]:

def str_to_dict(row):
    z= {}
    for i in row:
        a = i.split(':')
        z[a[0]] = a[1]
    return z

df['ProductFeatures'] = df['ProductFeatures'].apply(str_to_dict)

In [8]:
df = pd.concat([df.drop(['ProductFeatures'], axis=1), df['ProductFeatures'].apply(pd.Series)], axis=1)
df = df.fillna('')

In [9]:
df.head()

Unnamed: 0,CustomerID,Channel,PurchaseAmount ($),PurchaseDate,ProductCategory,Region,Screen Size,Processor,SSD,GPU,Type,Color,Material,Frame,Size,Resolution,RAM,Capacity
0,173,TV,205,2023-12-23,Electronics,North,13.3 inches,Apple M1 Chip,512GB,,,,,,,,,
1,48,TV,391,2023-08-24,Electronics,East,,AMD Ryzen 5,,NVIDIA GeForce GTX 1660,,,,,,,,
2,118,Social Media,51,2024-05-04,Home & Garden,North,,,,,Bed Frame,White,Metal,,,,,
3,193,E-commerce,315,2023-09-20,Sports & Outdoor,North,,,,,Bicycle,Red,,Aluminum,,,,
4,68,Social Media,165,2023-09-18,Clothing,North,,,,,,Grey,Wool,,XL,,,


In [10]:
df['Year'] = df['PurchaseDate'].dt.year
df['Month'] = df['PurchaseDate'].dt.month
#df['Day'] = df['PurchaseDate'].dt.day
df = df.drop('PurchaseDate', axis=1)

In [11]:
df.columns

Index(['CustomerID', 'Channel', 'PurchaseAmount ($)', 'ProductCategory',
       'Region', 'Screen Size', 'Processor', 'SSD', 'GPU', 'Type', 'Color',
       'Material', 'Frame', 'Size', 'Resolution', 'RAM', 'Capacity', 'Year',
       'Month'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

cols = df.columns
noncat_cols = ['CustomerID', 'PurchaseAmount ($)']

cat_cols = [i for i in cols if i not in noncat_cols]
cat_preproc = ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse_output=False), cat_cols)],
    remainder='passthrough' )
encoded_data = cat_preproc.fit_transform(df)
encoded_feature_names = cat_preproc.named_transformers_['cat'].get_feature_names_out(cat_cols)

df_final = pd.DataFrame(encoded_data, columns=list(encoded_feature_names) + ['CustomerID', 'PurchaseAmount'])


df_final.head()
#df_final.info()

Unnamed: 0,Channel_E-commerce,Channel_Print,Channel_Social Media,Channel_TV,ProductCategory_Clothing,ProductCategory_Electronics,ProductCategory_Home & Garden,ProductCategory_Sports & Outdoor,Region_East,Region_North,...,Month_5,Month_6,Month_7,Month_8,Month_9,Month_10,Month_11,Month_12,CustomerID,PurchaseAmount
0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,173.0,205.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,48.0,391.0
2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118.0,51.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,193.0,315.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,68.0,165.0


In [13]:
df.head()

Unnamed: 0,CustomerID,Channel,PurchaseAmount ($),ProductCategory,Region,Screen Size,Processor,SSD,GPU,Type,Color,Material,Frame,Size,Resolution,RAM,Capacity,Year,Month
0,173,TV,205,Electronics,North,13.3 inches,Apple M1 Chip,512GB,,,,,,,,,,2023,12
1,48,TV,391,Electronics,East,,AMD Ryzen 5,,NVIDIA GeForce GTX 1660,,,,,,,,,2023,8
2,118,Social Media,51,Home & Garden,North,,,,,Bed Frame,White,Metal,,,,,,2024,5
3,193,E-commerce,315,Sports & Outdoor,North,,,,,Bicycle,Red,,Aluminum,,,,,2023,9
4,68,Social Media,165,Clothing,North,,,,,,Grey,Wool,,XL,,,,2023,9


In [14]:
X = df_final[['CustomerID']].values
y = df_final.drop(['CustomerID'], axis=1).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape)

(800, 1) (800, 80)


In [15]:
# RNN model for product attribution modelling past sequential data to learn long term non-linear 
# dependencies in customer touchpoint lifecycle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.optimizers import Adam

model = Sequential()
model.add(LSTM(units=100, input_shape=(X_train.shape[1], 1), return_sequences=True))
model.add(Dropout(0.1))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mse')
model.summary()

print(X_train.shape, X_test.shape)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

accuracy = model.evaluate(X_test, y_test)
print(accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 1, 100)            40800     
                                                                 
 dropout (Dropout)           (None, 1, 100)            0         
                                                                 
 dense (Dense)               (None, 1, 1)              101       
                                                                 
Total params: 40901 (159.77 KB)
Trainable params: 40901 (159.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
(800, 1) (200, 1)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1223.0616455078125


In [16]:
# LSTM model for product attribution modelling past sequential data to learn long term non-linear 
# dependencies in customer touchpoint lifecycle

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Generate sample data with integer and float features
def generate_data(sequence_length, num_sequences):
    X = []
    y = []
    z = []
    for _ in range(num_sequences):
        int_sequence = np.random.randint(1, 10, sequence_length)
        float_sequence = np.sin(np.linspace(0, 2 * np.pi, sequence_length))
        combined_sequence = np.column_stack((int_sequence, float_sequence))
        X.append(int_sequence)
        y.append(float_sequence[-1])
        z.append(float_sequence)
    X = np.array(X)
    y = np.array(y)
    z = np.array(z)
    return X, y, z

# Parameters
sequence_length = 10
num_sequences = 1000
batch_size = 32
epochs = 10

# Generate data
X, y, z = generate_data(sequence_length, num_sequences)
print(X[:10])
print('y: ',y[:10], y.shape)
print('z: ', z[:10], z.shape)


# Reshape data for LSTM (samples, timesteps, features)
# The data is already in the correct shape, so we can use it directly

# Build LSTM model
model = Sequential([
    LSTM(50, activation='relu', input_shape=(sequence_length, 1)),
    Dense(1)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=epochs, batch_size=batch_size)

# Evaluate the model
loss = model.evaluate(X, y)
print(f'Model loss: {loss}')


[[7 9 4 8 4 4 6 6 5 9]
 [3 9 6 8 6 5 5 8 9 9]
 [6 7 7 4 7 1 6 7 6 5]
 [4 1 6 2 6 2 2 9 8 4]
 [6 3 7 4 9 5 9 1 9 5]
 [1 6 6 3 5 9 9 5 2 9]
 [9 9 4 4 3 6 8 6 9 8]
 [1 3 8 4 6 2 6 2 2 3]
 [5 4 2 2 3 9 2 8 9 3]
 [7 6 2 7 3 6 7 2 6 9]]
y:  [-2.4492936e-16 -2.4492936e-16 -2.4492936e-16 -2.4492936e-16
 -2.4492936e-16 -2.4492936e-16 -2.4492936e-16 -2.4492936e-16
 -2.4492936e-16 -2.4492936e-16] (1000,)
z:  [[ 0.00000000e+00  6.42787610e-01  9.84807753e-01  8.66025404e-01
   3.42020143e-01 -3.42020143e-01 -8.66025404e-01 -9.84807753e-01
  -6.42787610e-01 -2.44929360e-16]
 [ 0.00000000e+00  6.42787610e-01  9.84807753e-01  8.66025404e-01
   3.42020143e-01 -3.42020143e-01 -8.66025404e-01 -9.84807753e-01
  -6.42787610e-01 -2.44929360e-16]
 [ 0.00000000e+00  6.42787610e-01  9.84807753e-01  8.66025404e-01
   3.42020143e-01 -3.42020143e-01 -8.66025404e-01 -9.84807753e-01
  -6.42787610e-01 -2.44929360e-16]
 [ 0.00000000e+00  6.42787610e-01  9.84807753e-01  8.66025404e-01
   3.42020143e-01 -3.42020143e-0

In [17]:
# autoencoders for collaborative filtering

import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Sample data (user_id, item_id, rating)
data = {
    'user_id': [1, 1, 1, 2, 2, 3, 3, 3, 4, 4],
    'item_id': ['A', 'B', 'C', 'A', 'B', 'A', 'B', 'C', 'A', 'C'],
    'rating': [5, 3, 4, 4, 2, 5, 1, 4, 3, 2]
}

# Load data into DataFrame
df = pd.DataFrame(data)

# Create mappings for user_id and item_id
user_mapping = {id: idx for idx, id in enumerate(df['user_id'].unique())}
item_mapping = {id: idx for idx, id in enumerate(df['item_id'].unique())}

# Map user_id and item_id to indices
df['user_index'] = df['user_id'].map(user_mapping)
df['item_index'] = df['item_id'].map(item_mapping)

# Prepare inputs and outputs
X = df[['user_index', 'item_index']].values
y = df['rating'].values.reshape(-1, 1)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of users and items
num_users = len(user_mapping)
num_items = len(item_mapping)
latent_dim = 10  # Latent dimension for embeddings

# Autoencoder model
input_user = Input(shape=(1,))
input_item = Input(shape=(1,))

# User embedding
user_embedding = Dense(latent_dim, activation='relu')(input_user)

# Item embedding
item_embedding = Dense(latent_dim, activation='relu')(input_item)

# Concatenate user and item embeddings
concat = Dense(latent_dim, activation='relu')(user_embedding)
concat = Dense(latent_dim, activation='relu')(concat)

# Output layer
output = Dense(1)(concat)

# Model
model = Model(inputs=[input_user, input_item], outputs=output)

# Compile model
model.compile(optimizer='adam', loss='mse')

# Train model
model.fit([X_train[:,0], X_train[:,1]], y_train, epochs=50, batch_size=64, validation_data=([X_test[:,0], X_test[:,1]], y_test))

# Make predictions
predictions = model.predict([X_test[:,0], X_test[:,1]])

# Example prediction for user 1 and item 'B'
test_user = 1
test_item = 'B'
test_user_index = user_mapping[test_user]
test_item_index = item_mapping[test_item]

predicted_rating = model.predict([[test_user_index], [test_item_index]])
print(f"Predicted rating for user {test_user} and item {test_item}: {predicted_rating[0][0]}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


ValueError: in user code:

    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/nitinsinghal/anaconda3/lib/python3.10/site-packages/keras/src/engine/input_spec.py", line 219, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1) dtype=int64>]


In [None]:
# Multi task learning
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate
from tensorflow.keras.models import Model

# Define input layer
input_layer = Input(shape=(X_train.shape[1], 1))

# Shared layers
shared_dense1 = Dense(64, activation='relu')(input_layer)
shared_dense2 = Dense(32, activation='relu')(shared_dense1)

# Sales prediction branch
sales_dense1 = Dense(16, activation='relu')(shared_dense2)
sales_output = Dense(1, activation='linear', name='sales_output')(sales_dense1)

# Engagement prediction branch
engagement_dense1 = Dense(16, activation='relu')(shared_dense2)
engagement_output = Dense(1, activation='sigmoid', name='engagement_output')(engagement_dense1)

# Define model
model = Model(inputs=input_layer, outputs=[sales_output, engagement_output])

# Compile model
model.compile(optimizer='adam', loss={'sales_output': 'mse', 'engagement_output': 'binary_crossentropy'},
              metrics={'sales_output': 'mae', 'engagement_output': 'accuracy'})

# Train model
model.fit(X_train, {'sales_output': y_sales, 'engagement_output': y_engagement}, epochs=10, batch_size=32)

In [None]:
# CNN model for product attribution using 1D CNN model to understand spatial relationship
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense

model = Sequential()
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1])))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Fit the model
model.fit(X_train, y_train, epochs=200, batch_size=32, validation_data=(X_val, y_val))