In [7]:

import numpy as np # type: ignore
import pandas as pd # type: ignore

# Fixed seed for the legacy global NumPy RNG so the dataset is reproducible.
# NOTE: the draws below share that one RNG stream, so their order must not
# change -- reordering them would produce a different dataset.
np.random.seed(42)

# Simulate 5,000 users with sequential ids 1..n
n = 5000
user_ids = np.arange(1, n+1)

# Random assignment to control or treatment group (uniform choice per user)
groups = np.random.choice(['control', 'treatment'], size=n)

# Simulate purchases with a slightly higher conversion rate in the treatment
# group (12% control vs 15% treatment); one Bernoulli draw per user.
purchase_prob = np.where(groups == 'control', 0.12, 0.15)
purchases = np.random.binomial(1, purchase_prob)

# Revenue (USD) only for users who purchased; non-purchasers get 0.
# normal(50, 15) has a small but non-zero chance of producing a zero or
# negative draw, which is not a valid order value for a purchaser, so the
# simulated order value is floored at one cent.
MIN_ORDER_VALUE = 0.01
order_values = np.maximum(np.round(np.random.normal(50, 15, n), 2), MIN_ORDER_VALUE)
revenue = np.where(purchases == 1, order_values, 0)

# Session duration in whole seconds (exponential draw with scale 180,
# truncated to int by astype)
session_duration = np.random.exponential(scale=180, size=n).astype(int)

# Device type: 60% mobile, 40% desktop
device_type = np.random.choice(['mobile', 'desktop'], size=n, p=[0.6, 0.4])

# Create DataFrame: one row per simulated user
df = pd.DataFrame({
    'user_id': user_ids,
    'group': groups,
    'purchase': purchases,
    'revenue': revenue,
    'session_duration': session_duration,
    'device_type': device_type,
    })

# Invariant: every purchaser has a strictly positive revenue.
assert df.loc[df['purchase'] == 1, 'revenue'].ge(MIN_ORDER_VALUE).all()

df.head()

Unnamed: 0,user_id,group,purchase,revenue,session_duration,device_type
0,1,control,0,0.0,209,mobile
1,2,treatment,0,0.0,890,desktop
2,3,control,0,0.0,90,mobile
3,4,control,1,51.23,176,desktop
4,5,control,0,0.0,593,mobile


In [8]:
# Save the dataset as CSV; index=False keeps the DataFrame index out of the file.
output_path = '../data/ecommerce_data.csv'
df.to_csv(output_path, index=False)
# Report the path that was actually written, so the message cannot drift from
# the real location (it previously claimed "/data/...", not "../data/...").
print(f"Dataset saved to {output_path}")

Dataset saved to /data/ecommerce_data.csv
