# Generate Clickstream Data
**This notebook generates clickstream data with explicit frequent viewing of products before placing an order.**

**THIS NOTEBOOK CAN BE RUN IN PARALLEL WITH `1_setup.ipynb`**

**Recommended settings to run this notebook in SageMaker Studio:**

- Image: Data Science
- Kernel: Python3
- Instance type: <font color='blue'>ml.m5.large (2 vCPU + 8 GiB)</font>

---

## Contents

1. [Background](#Background)

## Background


This is for tracking viewing habits and translating them to useful conversion figures.

## Setup 

### Prerequisites

In [162]:
!pip install Faker confluent-kafka



#### Imports

In [163]:
from botocore.client import ClientError
from collections import defaultdict
from faker import Faker
import pandas as pd
import numpy as np
import sagemaker
import datetime
import hashlib
import random
import boto3
import math
import os

#### Seed for Reproducibility

In [164]:
faker = Faker()
faker.seed_locale('en_US', 0)
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
faker.seed_instance(SEED)

#### Constants

In [178]:
# TOTAL_UNIQUE_TRANSACTIONS = 5400000 # 5.4 Million
# TOTAL_UNIQUE_USERS = 10000
BUCKET = sagemaker.Session().default_bucket()

In [165]:
import json
import time
import random
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()
# Possible event types
event_types = ['page_view', 'click', 'add_to_cart', 'purchase']

def delivery_report(err, msg):
    """Called once for each message produced to indicate delivery result."""
    if err is not None:
        print(f'Message delivery failed: {err}')
    else:
        print(f'Message delivered to {msg.topic()} [{msg.partition()}]')

def generate_dummy_event():
    """Generates a dummy customer interaction event."""
    return {
        'event_id': fake.uuid4(),
        'timestamp': fake.iso8601(),
        'customer_id': random.randint(1, 1000),
        'session_id': fake.uuid4(),
        'event_type': random.choice(event_types),
        'product_id': random.randint(1, 500),
        'product_category': random.choice(['electronics', 'fashion', 'home', 'books', 'toys']),
        'price': round(random.uniform(10.0, 500.0), 2)
    }

#### Simple generation of random events with no clear directions towards a purchase

In [7]:
import json
import time
import random
import datetime
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()
# Define extended event types including scrolling behavior
event_types = ['page_view', 'scroll', 'click', 'add_to_cart', 'purchase']
def generate_session_events(customer_id):
    """Generates an ordered list of dummy events for a given session."""
    session_id = fake.uuid4()
    event_chain = []
    # Start with a random timestamp (e.g., within the current month)
    current_time = fake.date_time_this_month().strftime('%Y-%m-%d %H:%M:%S')
    # Random number of events for the session (between 5 and 10 events)
    n_events = random.randint(5, 10)
    
    for i in range(n_events):
        # Randomly choose an event type
        event_type = random.choice(event_types)
        # Increment time by a random delta (5 to 30 seconds) to simulate order
        current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        delta = datetime.timedelta(seconds=random.randint(5, 30))
        current_dt += delta
        current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
        
        event = {
            'event_id': fake.uuid4(),
            'timestamp': current_time,
            'customer_id': customer_id,
            'session_id': session_id,
            'event_type': event_type,
            'product_id': random.randint(1, 500),
            'product_category': random.choice(['electronics', 'fashion', 'home', 'books', 'toys']),
            'price': round(random.uniform(10.0, 500.0), 2),
            'order_in_session': i + 1  # Indicates the order of the event within the session
        }
        event_chain.append(event)
    return event_chain

# Continuously generate and send session events
currentSessions, totalSessions = 0, 10
allSessions = []
while currentSessions < totalSessions:
    # Pick a random customer
    customer_id = random.randint(1, 1000)
    session_events = generate_session_events(customer_id)
    allSessions.append(session_events)
    currentSessions += 1

In [103]:
print([len(sess) for sess in allSessions])
for event in allSessions[1][:]:
    print(event)
    print("---------------------------")

[6, 7, 10, 5, 9, 8, 6, 7, 10, 9]
{'event_id': '575ded00-8dd0-4300-a7e6-7818bb1150b5', 'timestamp': '2025-03-04 03:19:39', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'purchase', 'product_id': 9, 'product_category': 'books', 'price': 265.23, 'order_in_session': 1}
---------------------------
{'event_id': '5e12d07f-de9d-48fd-b635-9e6124f6dd68', 'timestamp': '2025-03-04 03:19:55', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'click', 'product_id': 110, 'product_category': 'books', 'price': 468.94, 'order_in_session': 2}
---------------------------
{'event_id': '388d26f5-a35a-4770-be1d-0b72ec7f03b9', 'timestamp': '2025-03-04 03:20:16', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'page_view', 'product_id': 425, 'product_category': 'electronics', 'price': 300.48, 'order_in_session': 3}
---------------------------
{'event_id': '24b68581-3110-4c51-83f2-e5b8a03876c0

#### Generation of  events with clear directions towards a purchase

Based on these factors, users will indicate higher chances of an actual purchase:
- moderate dwell times
- 2 or more consecutive interactions of page clicks, views, scrolling

In [174]:
import json
import time
import random
import datetime
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()

# Define a dwell time distribution (in seconds) as (min, max, probability)
dwell_distribution = [
    (1, 3, 0.05),    # 10% of events: very quick interactions
    (3, 10, 0.70),   # 60% of events: typical dwell times
    (10, 30, 0.20),  # 20% of events: longer engagement
    (30, 60, 0.05)   # 10% of events: very long dwell times
]

def get_dwell_time():
    """Return a dwell time sampled from the defined distribution."""
    r = random.random()
    cumulative = 0
    for start, end, prob in dwell_distribution:
        cumulative += prob
        if r <= cumulative:
            return round(random.uniform(start, end), 2)
    return round(random.uniform(3, 10), 2)

def generate_session_events(customer_id):
    """
    Simulate a user session on a well-functioning site.
    
    - The session consists of one or more product chains.
    - Within each chain, a single product is focused on.
    - The user starts with a page_view and performs a series of events (page_view, scroll, click).
    - The chance to add to cart increases when:
        • There are multiple consecutive interactions (e.g. clicks),
        • And the dwell time between events is moderate.
    - At the end of the session, if any product was added to cart, a purchase event is generated.
    """
    session_id = fake.uuid4()
    events = []
    current_time = fake.date_time_this_month().strftime('%Y-%m-%d %H:%M:%S')
    order_in_session = 1
    cart = []  # Track product_ids added to cart

    # Decide the number of product chains in the session (e.g., 1 to 3)
    num_chains = random.randint(1, 10)
    
    for _ in range(num_chains):
        # For each chain, choose a single product to focus on
        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        consecutive_interactions = 0
        chain_length = random.randint(3, 6)
        
        for i in range(chain_length):
            dwell = get_dwell_time()
            # Update the timestamp based on the dwell time
            current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
            current_dt += datetime.timedelta(seconds=dwell)
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            
            # First event in chain is always a page_view
            if i == 0:
                event_type = 'page_view'
            else:
                # Choose randomly among browsing events
                event_type = random.choice(['page_view', 'scroll', 'click'])
                if event_type == 'click':
                    consecutive_interactions += 1
                else:
                    # Reduce consecutive interaction count slightly for non-clicks
                    consecutive_interactions = max(0, consecutive_interactions - 1)
            
            # Increase chance for add_to_cart if conditions are met
            if (i > 1 and product_id not in cart and consecutive_interactions >= 2 and 3 <= dwell <= 10):
                if random.random() < 0.7:
                    event_type = 'add_to_cart'
                    cart.append(product_id)
            
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': event_type,
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1

    # At the end of the session, if any products were added to the cart, generate a purchase event.
    if cart:
        dwell = get_dwell_time()
        current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        current_dt += datetime.timedelta(seconds=dwell)
        current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
        purchase_event = {
            'event_id': fake.uuid4(),
            'timestamp': current_time,
            'customer_id': customer_id,
            'session_id': session_id,
            'event_type': 'purchase',
            'purchased_items': cart,  # List of product_ids
            'total_amount': sum(round(random.uniform(10.0, 500.0), 2) for _ in cart),
            'order_in_session': order_in_session
        }
        events.append(purchase_event)
        order_in_session += 1

    # Optionally, the user may continue browsing after purchase (simulate one additional chain)
    if random.random() < 0.5:
        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        chain_length = random.randint(2, 4)
        for _ in range(chain_length):
            dwell = get_dwell_time()
            current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
            current_dt += datetime.timedelta(seconds=dwell)
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': random.choice(['page_view', 'scroll']),
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1

    return events

# currentSessions, totalSessions = 0, 50000
# allSessions3 = []
# # Continuously generate and send session events
# while currentSessions < totalSessions:
#     print(currentSessions)
#     # Pick a random customer
#     customer_id = random.randint(1, 1000)
#     session_events = generate_session_events(customer_id)
#     allSessions3.extend(session_events)
#     currentSessions += 1

In [167]:
print(len(allSessions3))
# for event in allSessions3[:10]:
#     print(event)
#     print("---------------------------")
df_all = pd.DataFrame(allSessions3)
print(len(df_all))
df_all.head()

1334096
1334096


Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount
0,4bdbc1d0-6636-441f-bdc4-afe69932acab,2025-03-01 06:57:58,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,28.7,1,,
1,d886de40-1e0c-4e97-ac71-26e12b8c272d,2025-03-01 06:58:03,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,88.23,2,,
2,42fccefc-3336-4b36-9eb6-1a46af559025,2025-03-01 06:58:08,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,90.25,3,,
3,28d34073-4c11-45fc-af7d-06ab813a8da3,2025-03-01 06:58:33,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,45.0,books,195.02,4,,
4,c4dd3800-cb8c-458e-b60d-b2fac36c8390,2025-03-01 06:58:35,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,453.65,5,,


In [168]:
int_vals = {
    "purchase": 0,
    "page_view": 4,
    "scroll": 2,
    "click": 1,
    "add_to_cart": 6
}
df_all["interaction_value"] = df_all["event_type"].map(int_vals)

In [169]:
print(len(df_all[df_all["event_type"] == "purchase"]))
df_all[df_all["event_type"] == "click"]

24293


Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value
3,28d34073-4c11-45fc-af7d-06ab813a8da3,2025-03-01 06:58:33,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,45.0,books,195.02,4,,,1
10,34bbb04d-1428-4b77-b4c1-fe03a17c5ac8,2025-03-01 06:59:29,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,458.0,toys,215.55,11,,,1
13,65f6f0bb-366f-49e1-a8d2-b1311eaee05e,2025-03-01 07:00:09,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,458.0,toys,233.82,14,,,1
17,33c16782-fe81-4525-85ab-d47703b779b5,2025-03-01 07:01:49,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,426.0,home,362.94,18,,,1
21,5b1ba343-6635-479d-a85b-9cb0cff2ca2c,2025-03-01 07:02:06,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,279.0,books,40.68,22,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1334072,8ae4c45f-6978-4941-bf9e-e9fd3cbfd981,2025-03-02 12:10:49,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,220.0,books,272.45,18,,,1
1334075,eeeaf879-eb6d-447c-a8a1-3d98e63cf8da,2025-03-02 12:11:00,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,408.0,fashion,212.75,21,,,1
1334087,68084c92-5cd8-401c-8a93-e91348c5b698,2025-03-02 12:13:06,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,275.0,electronics,276.84,33,,,1
1334090,78443a62-df5c-4093-9405-35d75f7b0aa0,2025-03-02 12:13:54,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,396.0,electronics,198.44,36,,,1


In [170]:
df_grouped = df_all.groupby(["customer_id"])
df_grouped.agg({'event_type': ','.join}) 

Unnamed: 0_level_0,event_type
customer_id,Unnamed: 1_level_1
1,"page_view,click,click,scroll,scroll,click,page..."
2,"page_view,click,page_view,page_view,page_view,..."
3,"page_view,click,scroll,page_view,scroll,scroll..."
4,"page_view,page_view,scroll,click,click,page_vi..."
5,"page_view,scroll,page_view,page_view,click,pag..."
...,...
996,"page_view,page_view,page_view,click,scroll,pag..."
997,"page_view,page_view,click,click,page_view,scro..."
998,"page_view,scroll,scroll,page_view,page_view,sc..."
999,"page_view,page_view,page_view,click,page_view,..."


In [171]:
df_all['cumsum_interactions'] = df_all.groupby(['customer_id'])['interaction_value'].cumsum()

#view updated DataFrame
df_all

Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value,cumsum_interactions
0,4bdbc1d0-6636-441f-bdc4-afe69932acab,2025-03-01 06:57:58,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,28.70,1,,,4,4
1,d886de40-1e0c-4e97-ac71-26e12b8c272d,2025-03-01 06:58:03,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,88.23,2,,,4,8
2,42fccefc-3336-4b36-9eb6-1a46af559025,2025-03-01 06:58:08,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,90.25,3,,,4,12
3,28d34073-4c11-45fc-af7d-06ab813a8da3,2025-03-01 06:58:33,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,45.0,books,195.02,4,,,1,13
4,c4dd3800-cb8c-458e-b60d-b2fac36c8390,2025-03-01 06:58:35,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,453.65,5,,,4,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334091,4118da14-a020-487b-977f-2b559c44f991,2025-03-02 12:14:03,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,add_to_cart,396.0,electronics,218.07,37,,,6,3687
1334092,f89ce18b-160a-476e-9576-3ccaffc1a298,2025-03-02 12:14:14,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,396.0,electronics,134.46,38,,,1,3688
1334093,6f7f307f-cad2-411d-80be-ea4aeb1c9a27,2025-03-02 12:14:19,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,purchase,,,,39,"[408, 396]",566.95,0,3688
1334094,01b85e41-98b8-4b0c-87b6-850e18bad754,2025-03-02 12:14:27,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,page_view,246.0,books,170.43,40,,,4,3692


In [172]:
df_all[df_all["event_type"] == "purchase"]

Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value,cumsum_interactions
22,34dcc464-4a71-4b30-8b1a-85ecd01c9aba,2025-03-01 07:02:09,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,purchase,,,,23,[458],222.27,0,69
39,1f29221b-a24e-4ca3-a148-3644ddabd7c2,2025-03-04 10:17:52,25,1ee2f05d-9700-404a-b3f2-2e3e189bad74,purchase,,,,15,[396],446.35,0,42
76,c63d44d4-c9dd-44af-bea9-0d44122da3ef,2025-03-02 00:10:38,139,b8a7cc9e-b042-4f7e-9b52-a033b90f9d54,purchase,,,,37,"[36, 137]",425.38,0,108
135,9f3951b8-9d36-40bb-b757-1e0a7780bd99,2025-03-02 18:19:46,660,e311687e-c534-4571-97a4-0b1d1c016ee5,purchase,,,,12,[401],384.13,0,33
199,482bff43-b006-4150-96ec-f6d84cfc567a,2025-03-03 04:14:33,448,d9798699-046c-48d1-ade8-b3618f68bd9a,purchase,,,,41,[196],376.75,0,116
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333865,368881ee-288e-4a3e-bdb6-a5edd391002a,2025-03-01 16:03:49,429,d02a2109-fc68-4446-8a93-f687ec728df4,purchase,,,,44,[352],373.31,0,4314
1333942,1e4961ef-24e0-4028-84ef-8e33649b69f2,2025-03-03 18:04:55,136,044565ae-2664-4e6c-97bf-9dfae2ae5439,purchase,,,,10,"[160, 153]",698.82,0,3744
1333981,7e1114fd-a5dc-4414-88d9-e408b7f0979b,2025-03-03 01:16:50,841,1d3bf0ff-6b7b-43b3-a330-732d0d30677c,purchase,,,,39,[481],358.73,0,3569
1334035,67224395-03d3-4c5f-82da-f894a73b8daa,2025-03-01 19:49:46,377,21502a8d-4e9e-46ed-94a2-7dc21010ba7b,purchase,,,,37,"[272, 18, 107, 263]",864.11,0,3563


In [175]:
data_dir = os.path.join(os.getcwd(), 'data/raw_clicks')
os.makedirs(data_dir, exist_ok=True)

In [179]:
df_all.to_csv("data/raw_clicks/clickstream.csv", index=False)
df_all.to_csv(f's3://{BUCKET}/raw_clicks/clickstream.csv', index=False)