In [None]:
pip install pytest

In [None]:
pip install pandas

In [None]:
pip install faker

In [7]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta

In [None]:
# Create a Faker instance for generating synthetic data.
# Seed both Faker's shared generator and the stdlib `random` module so that
# re-running the notebook from a fresh kernel reproduces the same campaigns.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
fake = Faker()

# Function to generate random date ranges for campaigns over the last 12 years
def generate_date_range(lookback='-12y', min_days=10, max_days=90):
    """Return a random ``(start_date, end_date)`` pair for one campaign.

    Parameters
    ----------
    lookback : str
        Earliest allowed start, passed to ``fake.date_between`` as its
        ``start_date`` argument (default ``'-12y'``). The latest allowed
        start is always ``'today'``.
    min_days, max_days : int
        Inclusive bounds, in days, for the campaign duration (default 10-90).

    Returns
    -------
    tuple
        ``(start_date, end_date)`` where ``end_date`` is ``start_date`` plus
        the randomly drawn duration.

    Raises
    ------
    ValueError
        Raised by ``random.randint`` when ``min_days > max_days``.

    Notes
    -----
    Because the start may be as late as today, the end date can lie in the
    future.
    """
    start_date = fake.date_between(start_date=lookback, end_date='today')
    # Duration is drawn after the start date so the random-call order matches
    # the original implementation.
    duration = timedelta(days=random.randint(min_days, max_days))
    return start_date, start_date + duration

# Tunable parameters for the synthetic campaign table.
NUM_CAMPAIGNS = 100
BUDGET_RANGE = (1000, 50000)  # inclusive-ish bounds fed to random.uniform
CAMPAIGN_CHANNELS = ["instagram", "reddit", "paid search"]
TARGET_AUDIENCES = ["families", "couples", "seniors"]

# Generate the synthetic campaigns as a list of records (one dict per row).
campaigns_data = []
for campaign_id in range(1, NUM_CAMPAIGNS + 1):
    # The draws below are kept in this exact order so that a seeded run
    # yields the same rows as before the constants were extracted.
    campaign_name = fake.catch_phrase()
    start_date, end_date = generate_date_range()
    budget = round(random.uniform(*BUDGET_RANGE))  # whole currency units
    channel = random.choice(CAMPAIGN_CHANNELS)
    target_audience = random.choice(TARGET_AUDIENCES)

    campaigns_data.append({
        "campaign_id": campaign_id,
        "campaign_name": campaign_name,
        "start_date": start_date,
        "end_date": end_date,
        "budget": budget,
        "target_audience": target_audience,
        "channel": channel,
    })

# Create a DataFrame from the synthetic data (column order follows the dict keys)
campaigns_df = pd.DataFrame(campaigns_data)

# Save the DataFrame to a CSV file.
# NOTE(review): this is an absolute, machine-specific path; the events cell
# reads the same location, so any change must be made in both places.
csv_filename = "/Users/ekin/Documents/Projects/mealkit-delivery/data/campaigns.csv"
campaigns_df.to_csv(csv_filename, index=False)

# Report the actual row count instead of a hard-coded number.
print(f"CSV file '{csv_filename}' has been created with {len(campaigns_df)} synthetic campaigns.")

In [None]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime
import numpy as np

# Create a Faker instance for generating synthetic data.
# This cell reloads its inputs from disk, so it is seeded on its own to keep
# the generated events reproducible even when it is run in a fresh kernel.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
fake = Faker()

# Load the campaigns data from the CSV file.
# NOTE(review): absolute, machine-specific path; it must match the location
# the campaigns cell writes to.
campaigns_df = pd.read_csv("/Users/ekin/Documents/Projects/mealkit-delivery/data/campaigns.csv")

# Ensure 'start_date' and 'end_date' are datetime objects (CSV stores them as text)
for date_col in ('start_date', 'end_date'):
    campaigns_df[date_col] = pd.to_datetime(campaigns_df[date_col])

# Number of leads and events
num_leads = 8000
num_clicks = 8000
num_subscribes = 2000

# Define possible values for the per-event channel
channels = ["referral", "social media", "paid search"]

# Generate click events in bulk.
# The three random.choices calls stay in this order so a seeded run draws
# leads, campaigns and channels exactly as before.
click_leads = random.choices(range(1, num_leads+1), k=num_clicks)
click_campaigns = random.choices(campaigns_df['campaign_id'].tolist(), k=num_clicks)
click_channels = random.choices(channels, k=num_clicks)

# Build the campaign -> (start, end) lookup once instead of filtering
# campaigns_df with a boolean mask twice per click. drop_duplicates keeps the
# first row per campaign_id, matching the previous `.values[0]` behaviour.
campaign_windows = {
    camp_id: (window_start, window_end)
    for camp_id, window_start, window_end in (
        campaigns_df.drop_duplicates('campaign_id')
        .set_index('campaign_id')[['start_date', 'end_date']]
        .itertuples()
    )
}

# Each click happens at a random moment inside its campaign's date window.
click_event_dates = [
    fake.date_time_between_dates(
        datetime_start=campaign_windows[camp][0],
        datetime_end=campaign_windows[camp][1],
    )
    for camp in click_campaigns
]

click_events = pd.DataFrame({
    "event_id": [fake.uuid4() for _ in range(num_clicks)],
    "campaign_id": click_campaigns,
    "lead_id": click_leads,
    "event_type": ["click"] * num_clicks,
    "event_date": click_event_dates,
    "channel": click_channels,
    "subscription_id": [None] * num_clicks  # No subscription for click events
})

# Keep track of which leads clicked in which campaigns.
# A campaign appears once per click, so repeated clicks weight the choice below.
lead_clicks = click_events.groupby('lead_id')['campaign_id'].apply(list).to_dict()

# Only leads with at least one click may subscribe. Sampling from this list
# directly is equivalent to the former "draw a random lead id until one with a
# click turns up" loop, but cannot spin forever when no lead has clicked.
eligible_leads = [lead for lead, clicked in lead_clicks.items() if clicked]
if num_subscribes > 0 and not eligible_leads:
    raise ValueError("Cannot generate subscribe events: no lead has a click event.")

# Date of the first listed click for every (lead, campaign) pair, computed
# once instead of scanning click_events for every subscription.
first_click_dates = (
    click_events.groupby(['lead_id', 'campaign_id'], sort=False)['event_date']
    .first()
    .to_dict()
)

# End date per campaign (first row wins if a campaign_id were ever duplicated).
campaign_end_dates = (
    campaigns_df.drop_duplicates('campaign_id')
    .set_index('campaign_id')['end_date']
    .to_dict()
)

# Generate subscribe events in bulk, ensuring each lead has a valid prior click
subscribe_leads = []
subscribe_campaigns = []
subscribe_event_dates = []
subscription_ids = []

for _ in range(num_subscribes):
    lead_id = random.choice(eligible_leads)
    campaign_id = random.choice(lead_clicks[lead_id])

    # The subscription falls between that click and the end of the campaign.
    subscribe_event_date = fake.date_time_between_dates(
        datetime_start=first_click_dates[(lead_id, campaign_id)],
        datetime_end=campaign_end_dates[campaign_id],
    )

    # Append to lists for subscription event
    subscribe_leads.append(lead_id)
    subscribe_campaigns.append(campaign_id)
    subscribe_event_dates.append(subscribe_event_date)
    subscription_ids.append(fake.uuid4())

subscribe_events = pd.DataFrame({
    "event_id": [fake.uuid4() for _ in range(num_subscribes)],
    "campaign_id": subscribe_campaigns,
    "lead_id": subscribe_leads,
    "event_type": ["subscribe"] * num_subscribes,
    "event_date": subscribe_event_dates,
    # NOTE(review): the channel is drawn independently of the originating
    # click's channel; confirm that is intended.
    "channel": [random.choice(channels) for _ in range(num_subscribes)],
    "subscription_id": subscription_ids
})

# Combine click and subscribe events.
# ignore_index=True renumbers the rows so the combined frame has unique index
# labels; without it both inputs contribute labels starting at 0, which makes
# later label-based lookups on events_df ambiguous.
events_df = pd.concat([click_events, subscribe_events], ignore_index=True)

# Save the DataFrame to a CSV file (index is not written, so the file content
# is unaffected by the renumbering above).
# NOTE(review): absolute, machine-specific path.
csv_filename = "/Users/ekin/Documents/Projects/mealkit-delivery/data/events.csv"
events_df.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' has been created with {len(events_df)} synthetic events.")