# Generate Clickstream Data
**This notebook generates clickstream data with explicit frequent viewing of products before placing an order.**

**THIS NOTEBOOK CAN BE RUN IN PARALLEL WITH `1_setup.ipynb`**

**Recommended settings to run this notebook in SageMaker Studio:**

- Image: Data Science
- Kernel: Python3
- Instance type: <font color='blue'>ml.m5.large (2 vCPU + 8 GiB)</font>

---

## Contents

1. [Background](#Background)

## Background


This is for tracking viewing habits and translating them to useful conversion figures.

## Setup 

### Prerequisites

In [1]:
!pip install Faker confluent-kafka

Collecting Faker
  Downloading faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Collecting confluent-kafka
  Downloading confluent_kafka-2.8.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (22 kB)
Downloading faker-37.0.0-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m107.5 MB/s[0m eta [36m0:00:00[0m
Downloading confluent_kafka-2.8.2-cp311-cp311-manylinux_2_28_x86_64.whl (3.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m145.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Faker, confluent-kafka
Successfully installed Faker-37.0.0 confluent-kafka-2.8.2


#### Imports

In [2]:
from botocore.client import ClientError
from collections import defaultdict
from faker import Faker
import pandas as pd
import numpy as np
import sagemaker
import datetime
import hashlib
import random
import boto3
import math
import os



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


#### Seed for Reproducibility

In [3]:
faker = Faker()
faker.seed_locale('en_US', 0)
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
faker.seed_instance(SEED)

#### Constants

In [4]:
# TOTAL_UNIQUE_TRANSACTIONS = 5400000 # 5.4 Million
# TOTAL_UNIQUE_USERS = 10000
BUCKET = sagemaker.Session().default_bucket()

In [165]:
import json
import time
import random
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()
# Possible event types
event_types = ['page_view', 'click', 'add_to_cart', 'purchase']

def delivery_report(err, msg):
    """Called once for each message produced to indicate delivery result."""
    if err is not None:
        print(f'Message delivery failed: {err}')
    else:
        print(f'Message delivered to {msg.topic()} [{msg.partition()}]')

def generate_dummy_event():
    """Generates a dummy customer interaction event."""
    return {
        'event_id': fake.uuid4(),
        'timestamp': fake.iso8601(),
        'customer_id': random.randint(1, 1000),
        'session_id': fake.uuid4(),
        'event_type': random.choice(event_types),
        'product_id': random.randint(1, 500),
        'product_category': random.choice(['electronics', 'fashion', 'home', 'books', 'toys']),
        'price': round(random.uniform(10.0, 500.0), 2)
    }

#### Simple generation of random events with no clear directions towards a purchase

In [7]:
import json
import time
import random
import datetime
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()
# Define extended event types including scrolling behavior
event_types = ['page_view', 'scroll', 'click', 'add_to_cart', 'purchase']
def generate_session_events(customer_id):
    """Generates an ordered list of dummy events for a given session."""
    session_id = fake.uuid4()
    event_chain = []
    # Start with a random timestamp (e.g., within the current month)
    current_time = fake.date_time_this_month().strftime('%Y-%m-%d %H:%M:%S')
    # Random number of events for the session (between 5 and 10 events)
    n_events = random.randint(5, 10)
    
    for i in range(n_events):
        # Randomly choose an event type
        event_type = random.choice(event_types)
        # Increment time by a random delta (5 to 30 seconds) to simulate order
        current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        delta = datetime.timedelta(seconds=random.randint(5, 30))
        current_dt += delta
        current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
        
        event = {
            'event_id': fake.uuid4(),
            'timestamp': current_time,
            'customer_id': customer_id,
            'session_id': session_id,
            'event_type': event_type,
            'product_id': random.randint(1, 500),
            'product_category': random.choice(['electronics', 'fashion', 'home', 'books', 'toys']),
            'price': round(random.uniform(10.0, 500.0), 2),
            'order_in_session': i + 1  # Indicates the order of the event within the session
        }
        event_chain.append(event)
    return event_chain

# Continuously generate and send session events
currentSessions, totalSessions = 0, 10
allSessions = []
while currentSessions < totalSessions:
    # Pick a random customer
    customer_id = random.randint(1, 1000)
    session_events = generate_session_events(customer_id)
    allSessions.append(session_events)
    currentSessions += 1

In [103]:
print([len(sess) for sess in allSessions])
for event in allSessions[1][:]:
    print(event)
    print("---------------------------")

[6, 7, 10, 5, 9, 8, 6, 7, 10, 9]
{'event_id': '575ded00-8dd0-4300-a7e6-7818bb1150b5', 'timestamp': '2025-03-04 03:19:39', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'purchase', 'product_id': 9, 'product_category': 'books', 'price': 265.23, 'order_in_session': 1}
---------------------------
{'event_id': '5e12d07f-de9d-48fd-b635-9e6124f6dd68', 'timestamp': '2025-03-04 03:19:55', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'click', 'product_id': 110, 'product_category': 'books', 'price': 468.94, 'order_in_session': 2}
---------------------------
{'event_id': '388d26f5-a35a-4770-be1d-0b72ec7f03b9', 'timestamp': '2025-03-04 03:20:16', 'customer_id': 472, 'session_id': '6fe11191-4fea-42da-afc9-e2d5389adb88', 'event_type': 'page_view', 'product_id': 425, 'product_category': 'electronics', 'price': 300.48, 'order_in_session': 3}
---------------------------
{'event_id': '24b68581-3110-4c51-83f2-e5b8a03876c0

#### Generation of  events with clear directions towards a purchase

Based on these factors, users will indicate higher chances of an actual purchase:
- moderate dwell times
- 2 or more consecutive interactions of page clicks, views, scrolling

## Generating events without time restriction

In [37]:
import json
import time
import random
import datetime
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker and Kafka Producer
fake = Faker()

# Define a dwell time distribution (in seconds) as (min, max, probability)
dwell_distribution = [
    (1, 3, 0.05),    # 10% of events: very quick interactions
    (3, 10, 0.70),   # 60% of events: typical dwell times
    (10, 30, 0.20),  # 20% of events: longer engagement
    (30, 60, 0.05)   # 10% of events: very long dwell times
]

def get_dwell_time():
    """Return a dwell time sampled from the defined distribution."""
    r = random.random()
    cumulative = 0
    for start, end, prob in dwell_distribution:
        cumulative += prob
        if r <= cumulative:
            return round(random.uniform(start, end), 2)
    return round(random.uniform(3, 10), 2)

def generate_session_events(customer_id):
    """
    Simulate a user session on a well-functioning site.
    
    - The session consists of one or more product chains.
    - Within each chain, a single product is focused on.
    - The user starts with a page_view and performs a series of events (page_view, scroll, click).
    - The chance to add to cart increases when:
        • There are multiple consecutive interactions (e.g. clicks),
        • And the dwell time between events is moderate.
    - At the end of the session, if any product was added to cart, a purchase event is generated.
    """
    session_id = fake.uuid4()
    events = []
    current_time = fake.date_time_this_month().strftime('%Y-%m-%d %H:%M:%S')
    order_in_session = 1
    cart = []  # Track product_ids added to cart

    # Decide the number of product chains in the session (e.g., 1 to 3)
    num_chains = random.randint(1, 10)
    
    for _ in range(num_chains):
        # For each chain, choose a single product to focus on
        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        consecutive_interactions = 0
        chain_length = random.randint(3, 6)
        
        for i in range(chain_length):
            dwell = get_dwell_time()
            # Update the timestamp based on the dwell time
            current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
            current_dt += datetime.timedelta(seconds=dwell)
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            
            # First event in chain is always a page_view
            if i == 0:
                event_type = 'page_view'
            else:
                # Choose randomly among browsing events
                event_type = random.choice(['page_view', 'scroll', 'click'])
                if event_type == 'click':
                    consecutive_interactions += 1
                else:
                    # Reduce consecutive interaction count slightly for non-clicks
                    consecutive_interactions = max(0, consecutive_interactions - 1)
            
            # Increase chance for add_to_cart if conditions are met
            if (i > 1 and product_id not in cart and consecutive_interactions >= 2 and 3 <= dwell <= 10):
                if random.random() < 0.7:
                    event_type = 'add_to_cart'
                    cart.append(product_id)
            
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': event_type,
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1

    # At the end of the session, if any products were added to the cart, generate a purchase event.
    if cart:
        dwell = get_dwell_time()
        current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
        current_dt += datetime.timedelta(seconds=dwell)
        current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
        purchase_event = {
            'event_id': fake.uuid4(),
            'timestamp': current_time,
            'customer_id': customer_id,
            'session_id': session_id,
            'event_type': 'purchase',
            'purchased_items': cart,  # List of product_ids
            'total_amount': sum(round(random.uniform(10.0, 500.0), 2) for _ in cart),
            'order_in_session': order_in_session
        }
        events.append(purchase_event)
        order_in_session += 1

    # Optionally, the user may continue browsing after purchase (simulate one additional chain)
    if random.random() < 0.5:
        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        chain_length = random.randint(2, 4)
        for _ in range(chain_length):
            dwell = get_dwell_time()
            current_dt = datetime.datetime.strptime(current_time, '%Y-%m-%d %H:%M:%S')
            current_dt += datetime.timedelta(seconds=dwell)
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': random.choice(['page_view', 'scroll']),
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1

    return events

currentSessions, totalSessions = 0, 1
allSessions3 = []
# Continuously generate and send session events
while currentSessions < totalSessions:
    print(currentSessions)
    # Pick a random customer
    customer_id = random.randint(1, 1000)
    session_events = generate_session_events(customer_id)
    allSessions3.extend(session_events)
    currentSessions += 1

0


## Generating events WITH time restriction

In [39]:
import json
import time
import random
import datetime
from faker import Faker
from confluent_kafka import Producer

# Initialize Faker
fake = Faker()

# Define a dwell time distribution (in seconds) as (min, max, probability)
dwell_distribution = [
    (1, 3, 0.05),    # 5% of events: very quick interactions
    (3, 10, 0.70),   # 70% of events: typical dwell times
    (10, 30, 0.20),  # 20% of events: longer engagement
    (30, 60, 0.05)   # 5% of events: very long dwell times
]

def get_dwell_time():
    """Return a dwell time sampled from the defined distribution."""
    r = random.random()
    cumulative = 0
    for start, end, prob in dwell_distribution:
        cumulative += prob
        if r <= cumulative:
            return round(random.uniform(start, end), 2)
    return round(random.uniform(3, 10), 2)

def generate_session_events_period(customer_id, start_time, end_time):
    """
    Simulate a user session that generates events only between start_time and end_time.
    
    Parameters:
        customer_id (int): The customer identifier.
        start_time (str): Session start time in '%Y-%m-%d %H:%M:%S' format.
        end_time (str): Session end time in '%Y-%m-%d %H:%M:%S' format.
    
    Returns:
        list: A list of event dictionaries.
    """
    # Convert string inputs to datetime objects
    current_dt = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
    end_dt = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
    
    session_id = fake.uuid4()
    events = []
    order_in_session = 1
    cart = []  # Track product_ids added to cart

    # Decide the number of product chains in the session
    num_chains = random.randint(1, 10)
    
    for _ in range(num_chains):
        # Check if there's still time for another chain
        if current_dt >= end_dt:
            break

        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        consecutive_interactions = 0
        chain_length = random.randint(3, 6)
        
        for i in range(chain_length):
            # Check if adding a new event would exceed end time
            dwell = get_dwell_time()
            next_dt = current_dt + datetime.timedelta(seconds=dwell)
            if next_dt > end_dt:
                # Stop generating further events if we've reached the end
                return events
            current_dt = next_dt
            
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            
            # Determine event type
            if i == 0:
                event_type = 'page_view'
            else:
                event_type = random.choice(['page_view', 'scroll', 'click'])
                if event_type == 'click':
                    consecutive_interactions += 1
                else:
                    consecutive_interactions = max(0, consecutive_interactions - 1)
            
            # Increase chance for add_to_cart if conditions are met
            if (i > 1 and product_id not in cart and consecutive_interactions >= 2 and 3 <= dwell <= 10):
                if random.random() < 0.7:
                    event_type = 'add_to_cart'
                    cart.append(product_id)
            
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': event_type,
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1
        
    # If any product was added to cart, generate a purchase event before end_time
    if cart and current_dt < end_dt:
        dwell = get_dwell_time()
        next_dt = current_dt + datetime.timedelta(seconds=dwell)
        if next_dt <= end_dt:
            current_dt = next_dt
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            purchase_event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': 'purchase',
                'purchased_items': cart,
                'total_amount': sum(round(random.uniform(10.0, 500.0), 2) for _ in cart),
                'order_in_session': order_in_session
            }
            events.append(purchase_event)
            order_in_session += 1

    # Optionally, simulate additional browsing if time permits
    if random.random() < 0.5 and current_dt < end_dt:
        product_id = random.randint(1, 500)
        product_category = random.choice(['electronics', 'fashion', 'home', 'books', 'toys'])
        chain_length = random.randint(2, 4)
        for _ in range(chain_length):
            dwell = get_dwell_time()
            next_dt = current_dt + datetime.timedelta(seconds=dwell)
            if next_dt > end_dt:
                break
            current_dt = next_dt
            current_time = current_dt.strftime('%Y-%m-%d %H:%M:%S')
            event = {
                'event_id': fake.uuid4(),
                'timestamp': current_time,
                'customer_id': customer_id,
                'session_id': session_id,
                'event_type': random.choice(['page_view', 'scroll']),
                'product_id': product_id,
                'product_category': product_category,
                'price': round(random.uniform(10.0, 500.0), 2),
                'order_in_session': order_in_session
            }
            events.append(event)
            order_in_session += 1

    return events

# Example usage:
start_time = "2025-03-01 02:01:45"
end_time = "2025-03-08 14:33:55"
customer_id = random.randint(1, 1000)
session_events = generate_session_events_period(customer_id, start_time, end_time)
# print(json.dumps(session_events, indent=2))


currentSessions, totalSessions = 0, 10
allSessions3 = []
# Continuously generate and send session events
while currentSessions < totalSessions:
    print(currentSessions)
    # Pick a random customer
    customer_id = random.randint(1, 1000)
    session_events = generate_session_events_period(customer_id, start_time, end_time)
    allSessions3.extend(session_events)
    currentSessions += 1


0
1
2
3
4
5
6
7
8
9


In [40]:
df_all = pd.DataFrame(allSessions3)
print(len(df_all))
df_all.head()

208
208


Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount
0,f52b6be1-7e05-4b61-9391-c399508440ee,2025-03-01 06:00:04,152,f55a68c8-fba0-402b-b6f6-2827531cc651,page_view,371.0,electronics,494.14,1,,
1,410ba27e-c9c2-4696-a441-e5ba09203e51,2025-03-01 06:00:11,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,133.95,2,,
2,90c34adb-f11a-4aa8-9724-7f26b3413c0d,2025-03-01 06:00:16,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,143.87,3,,
3,fdff7dd1-bfdf-415c-ab8f-96d52bf9a38e,2025-03-01 06:00:26,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,188.73,4,,
4,3038ff39-4fae-4437-86ba-e1309ac07747,2025-03-01 06:00:33,152,f55a68c8-fba0-402b-b6f6-2827531cc651,add_to_cart,371.0,electronics,201.14,5,,


In [41]:
int_vals = {
    "purchase": 0,
    "page_view": 4,
    "scroll": 2,
    "click": 1,
    "add_to_cart": 6
}
df_all["interaction_value"] = df_all["event_type"].map(int_vals)

In [42]:
print(len(df_all[df_all["event_type"] == "purchase"]))
df_all[df_all["event_type"] == "click"]

5


Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value
1,410ba27e-c9c2-4696-a441-e5ba09203e51,2025-03-01 06:00:11,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,133.95,2,,,1
2,90c34adb-f11a-4aa8-9724-7f26b3413c0d,2025-03-01 06:00:16,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,143.87,3,,,1
3,fdff7dd1-bfdf-415c-ab8f-96d52bf9a38e,2025-03-01 06:00:26,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,371.0,electronics,188.73,4,,,1
12,a7f1181b-c14d-4af3-a563-5b301bf1e771,2025-03-01 06:01:49,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,481.0,books,278.28,13,,,1
18,efa82401-5892-4820-91dc-8fa8cadb87d5,2025-03-01 06:03:04,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,311.0,books,252.08,19,,,1
24,212fcc47-c746-4f1e-b134-1b231513651a,2025-03-01 06:03:33,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,271.0,electronics,45.08,25,,,1
28,2e9177ce-3659-4068-8056-10a6b1ad2c5b,2025-03-01 06:04:19,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,314.0,fashion,344.98,29,,,1
30,08895f09-18b6-4731-a0c7-cf5dd94eee35,2025-03-01 06:04:33,152,f55a68c8-fba0-402b-b6f6-2827531cc651,click,314.0,fashion,111.17,31,,,1
42,38fcf7b0-ff0e-4c73-b8d6-2eff144efff8,2025-03-01 06:00:13,1000,8a2da2a9-b707-4de6-ac63-1b1cf85e6502,click,364.0,fashion,355.61,2,,,1
69,298a5d75-faee-4bbf-9c22-1ccb08bec437,2025-03-01 06:00:31,502,c85a811c-3833-456e-bd77-439666d43a2b,click,360.0,fashion,423.01,2,,,1


In [170]:
df_grouped = df_all.groupby(["customer_id"])
df_grouped.agg({'event_type': ','.join}) 

Unnamed: 0_level_0,event_type
customer_id,Unnamed: 1_level_1
1,"page_view,click,click,scroll,scroll,click,page..."
2,"page_view,click,page_view,page_view,page_view,..."
3,"page_view,click,scroll,page_view,scroll,scroll..."
4,"page_view,page_view,scroll,click,click,page_vi..."
5,"page_view,scroll,page_view,page_view,click,pag..."
...,...
996,"page_view,page_view,page_view,click,scroll,pag..."
997,"page_view,page_view,click,click,page_view,scro..."
998,"page_view,scroll,scroll,page_view,page_view,sc..."
999,"page_view,page_view,page_view,click,page_view,..."


In [171]:
df_all['cumsum_interactions'] = df_all.groupby(['customer_id'])['interaction_value'].cumsum()

#view updated DataFrame
df_all

Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value,cumsum_interactions
0,4bdbc1d0-6636-441f-bdc4-afe69932acab,2025-03-01 06:57:58,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,28.70,1,,,4,4
1,d886de40-1e0c-4e97-ac71-26e12b8c272d,2025-03-01 06:58:03,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,88.23,2,,,4,8
2,42fccefc-3336-4b36-9eb6-1a46af559025,2025-03-01 06:58:08,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,90.25,3,,,4,12
3,28d34073-4c11-45fc-af7d-06ab813a8da3,2025-03-01 06:58:33,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,click,45.0,books,195.02,4,,,1,13
4,c4dd3800-cb8c-458e-b60d-b2fac36c8390,2025-03-01 06:58:35,54,dc54070d-0977-4624-94ba-5ab1a1f4e532,page_view,45.0,books,453.65,5,,,4,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1334091,4118da14-a020-487b-977f-2b559c44f991,2025-03-02 12:14:03,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,add_to_cart,396.0,electronics,218.07,37,,,6,3687
1334092,f89ce18b-160a-476e-9576-3ccaffc1a298,2025-03-02 12:14:14,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,click,396.0,electronics,134.46,38,,,1,3688
1334093,6f7f307f-cad2-411d-80be-ea4aeb1c9a27,2025-03-02 12:14:19,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,purchase,,,,39,"[408, 396]",566.95,0,3688
1334094,01b85e41-98b8-4b0c-87b6-850e18bad754,2025-03-02 12:14:27,554,405b48d3-1a4a-4402-b77d-47c0143a08d1,page_view,246.0,books,170.43,40,,,4,3692


In [45]:
print("No of customers who made a purchase:", len(df_all[df_all["event_type"] == "purchase"]))
df_all[df_all["event_type"] == "purchase"]

No of customers who made a purchase: 5


Unnamed: 0,event_id,timestamp,customer_id,session_id,event_type,product_id,product_category,price,order_in_session,purchased_items,total_amount,interaction_value
40,2239cc1c-3bf2-46bf-a6f7-1d65a64541e4,2025-03-01 06:07:15,152,f55a68c8-fba0-402b-b6f6-2827531cc651,purchase,,,,41,[371],239.59,0
91,d6451d3f-6536-4437-aae4-37b354476bc1,2025-03-01 06:06:14,502,c85a811c-3833-456e-bd77-439666d43a2b,purchase,,,,24,[447],154.84,0
122,a9954921-3489-4f02-b572-650daff4105a,2025-03-01 06:04:55,10,62dcea4c-1948-49cf-8130-70132727eb42,purchase,,,,31,[295],198.91,0
148,58a12523-2f63-4720-8d29-1482c5a757fd,2025-03-01 06:01:00,654,5979d24f-5652-4f87-aa0f-5d39a076e9c5,purchase,,,,7,[289],419.17,0
203,6d2f083f-fed2-4b46-9708-5f3f306729b5,2025-03-01 06:03:37,997,91c555bb-b0ca-4bdc-96b1-0af721f77ce4,purchase,,,,21,[274],104.86,0


In [175]:
data_dir = os.path.join(os.getcwd(), 'data/raw_clicks')
os.makedirs(data_dir, exist_ok=True)

In [20]:
# df_all.to_csv("data/raw_clicks/clickstream.csv", index=False)
df_all.to_csv(f's3://{BUCKET}/raw_clicks/clickstream.csv', index=False)

In [47]:
df_all = pd.read_csv(f's3://{BUCKET}/raw_clicks/clickstream.csv')
df_all["timestamp"] = pd.to_datetime(df_all["timestamp"])
print(type(df_all["timestamp"].head()[0]))
print(df_all["timestamp"].min())
print(df_all["timestamp"].max())

<class 'pandas._libs.tslibs.timestamps.Timestamp'>
2025-03-01 02:01:45
2025-03-08 14:33:55


In [23]:
TOTALLENGTH = len(df_all)
print(f"TOTALLENGTH: {TOTALLENGTH}")
# df_all = df_all[:TOTALLENGTH//4]

TOTALLENGTH: 5211


In [49]:
# len(df_all)
df_all.describe()

Unnamed: 0,timestamp,customer_id,product_id,price,order_in_session,total_amount,interaction_value,cumsum_interactions
count,5211,5211.0,5107.0,5107.0,5211.0,104.0,5211.0,5211.0
mean,2025-03-04 23:08:27.634619136,481.311265,250.277854,255.634895,17.42276,371.284712,2.794281,54.432738
min,2025-03-01 02:01:45,1.0,1.0,10.15,1.0,16.71,0.0,4.0
25%,2025-03-03 00:27:41,226.0,124.0,131.575,7.0,173.0725,2.0,23.0
50%,2025-03-05 05:03:47,469.0,249.0,256.15,15.0,359.895,2.0,47.0
75%,2025-03-06 18:37:35,771.0,384.0,378.285,26.0,478.01,4.0,78.5
max,2025-03-08 14:33:55,996.0,500.0,499.92,56.0,1144.94,6.0,223.0
std,,303.2861,146.290022,142.362508,11.938339,243.107593,1.418649,38.41671


# Generating Test Events after the train events for feature generation and prediction

In [None]:
df_all = pd.read_csv(f's3://{BUCKET}/raw_clicks/clickstream.csv')
df_all["timestamp"] = pd.to_datetime(df_all["timestamp"])
customerSet = df_all["customer_id"].unique()
print(customerSet)

In [53]:

# Example usage:
start_time = "2025-03-08 14:33:56"
start_time = "2025-03-12 02:01:45"
customer_id = random.randint(1, 1000)
session_events = generate_session_events_period(customer_id, start_time, end_time)
# print(json.dumps(session_events, indent=2))


currentSessions, totalSessions = 0, 10
allSessions3 = []
# Continuously generate and send session events
while currentSessions < totalSessions:
    print(currentSessions)
    # Pick a random customer
    # customer_id = random.randint(1, 1000)
    # can only be from 
    session_events = generate_session_events_period(customer_id, start_time, end_time)
    allSessions3.extend(session_events)
    currentSessions += 1


In [51]:
df_test = pd.DataFrame(allSessions3)
print(len(df_test))
df_test.head()

0


In [52]:
int_vals = {
    "purchase": 0,
    "page_view": 4,
    "scroll": 2,
    "click": 1,
    "add_to_cart": 6
}
df_test["interaction_value"] = df_test["event_type"].map(int_vals)
df_test['cumsum_interactions'] = df_test.groupby(['customer_id'])['interaction_value'].cumsum()

In [None]:
print("No of customers who made a purchase:", len(df_test[df_test["event_type"] == "purchase"]))
df_test[df_test["event_type"] == "purchase"]