In [None]:
pip install pytest

In [None]:
pip install pandas

In [None]:
pip install faker

In [7]:
import pandas as pd
from faker import Faker
import random
from datetime import datetime, timedelta
import numpy as np

In [None]:
# Create a Faker instance for generating synthetic data.
# NOTE(review): no seed is set in this cell for Faker or `random`, so the
# generated campaigns presumably differ on every run -- confirm whether
# reproducible output is wanted.
fake = Faker()

# Function to generate random date ranges for campaigns (by default over the last 12 years)
def generate_date_range(lookback='-12y', min_days=10, max_days=90):
    """Pick a random campaign start date and derive its end date.

    Args:
        lookback: Earliest allowed start, forwarded to ``fake.date_between``
            as ``start_date``. The default ``'-12y'`` keeps the original
            12-year window.
        min_days: Shortest campaign duration in days (inclusive).
        max_days: Longest campaign duration in days (inclusive).

    Returns:
        A ``(start_date, end_date)`` tuple; ``end_date`` is ``start_date``
        plus a duration drawn with ``random.randint(min_days, max_days)``.
    """
    start_date = fake.date_between(start_date=lookback, end_date='today')
    # The start may be as late as today, so the end date can fall after today.
    duration = timedelta(days=random.randint(min_days, max_days))
    return start_date, start_date + duration

# Generate synthetic campaigns; the count lives in one place so the loop and
# the final message cannot drift apart.
num_campaigns = 100

campaigns_data = []
for campaign_id in range(1, num_campaigns + 1):
    # The draws below happen in a fixed order: name, dates, budget, channel, audience.
    campaign_name = fake.catch_phrase()
    start_date, end_date = generate_date_range()
    budget = round(random.uniform(1000, 50000))  # whole-number budget
    channel = random.choice(["instagram", "reddit", "paid search"])
    target_audience = random.choice(["families", "couples", "seniors"])

    campaigns_data.append({
        "campaign_id": campaign_id,
        "campaign_name": campaign_name,
        "start_date": start_date,
        "end_date": end_date,
        "budget": budget,
        "target_audience": target_audience,
        "channel": channel,
    })

# Create a DataFrame from the synthetic data (one row per campaign)
campaigns_df = pd.DataFrame(campaigns_data)

# Save the DataFrame to a CSV file
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this directory exists.
csv_filename = "/Users/ekin/Documents/Projects/mealkit-delivery/data/campaigns.csv"
campaigns_df.to_csv(csv_filename, index=False)

# Report the number of rows actually written instead of a hard-coded figure.
print(f"CSV file '{csv_filename}' has been created with {len(campaigns_df)} synthetic campaigns.")

In [22]:
# Faker instance used for the event timestamps and UUIDs generated in this cell
fake = Faker()

# Read the campaigns table back from disk
campaigns_df = pd.read_csv("/Users/ekin/Documents/Projects/mealkit-delivery/data/campaigns.csv")

# CSV stores the campaign dates as text; convert both columns to datetimes
for date_col in ('start_date', 'end_date'):
    campaigns_df[date_col] = pd.to_datetime(campaigns_df[date_col])

# Sizes of the lead pool and of each event type
num_leads, num_clicks, num_subscribes = 8000, 8000, 2000

# Channels an event can be attributed to
# NOTE(review): this list differs from the campaign channel list used when
# campaigns.csv is generated ("instagram", "reddit", "paid search") -- confirm
# that event channels are meant to be independent of the campaign's channel.
channels = ["referral", "social media", "paid search"]

# Generate click events in bulk
click_leads = random.choices(range(1, num_leads+1), k=num_clicks)
click_campaigns = random.choices(campaigns_df['campaign_id'].tolist(), k=num_clicks)
click_channels = random.choices(channels, k=num_clicks)

# Look up each campaign's (start, end) window once instead of filtering
# campaigns_df twice for every click. setdefault keeps the first row seen for a
# campaign_id, matching the `.values[0]` lookup this replaces.
campaign_windows = {}
for camp_id, window_start, window_end in zip(
    campaigns_df['campaign_id'], campaigns_df['start_date'], campaigns_df['end_date']
):
    campaign_windows.setdefault(camp_id, (window_start, window_end))

# Each click gets a timestamp inside the window of the campaign it belongs to
click_event_dates = [
    fake.date_time_between_dates(
        datetime_start=campaign_windows[camp][0],
        datetime_end=campaign_windows[camp][1],
    )
    for camp in click_campaigns
]

click_events = pd.DataFrame({
    "event_id": [fake.uuid4() for _ in range(num_clicks)],
    "campaign_id": click_campaigns,
    "lead_id": click_leads,
    "event_type": ["click"] * num_clicks,
    "event_date": click_event_dates,
    "channel": click_channels,
    "subscription_id": [None] * num_clicks  # No subscription for click events
})

# Keep track of which leads clicked in which campaigns
lead_clicks = click_events.groupby('lead_id')['campaign_id'].apply(list).to_dict()

# Only leads with at least one click can subscribe. Sampling from this list
# replaces the retry-until-hit loop, which never terminated when no lead had
# clicked; the draw is still uniform over the leads that clicked.
eligible_leads = list(lead_clicks)
if num_subscribes > 0 and not eligible_leads:
    raise ValueError("Cannot generate subscribe events: no lead has a click event.")

# Date of the first click row (in frame order) per (lead, campaign) pair,
# matching the `.iloc[0]` row lookup this replaces.
first_click_dates = {}
for clicked_lead, clicked_campaign, clicked_at in zip(
    click_events['lead_id'], click_events['campaign_id'], click_events['event_date']
):
    first_click_dates.setdefault((clicked_lead, clicked_campaign), clicked_at)

# End date per campaign (first row wins), looked up once rather than per subscription
campaign_end_dates = {}
for camp_id, camp_end in zip(campaigns_df['campaign_id'], campaigns_df['end_date']):
    campaign_end_dates.setdefault(camp_id, camp_end)

# Generate subscribe events in bulk, ensuring each lead has a valid prior click
subscribe_leads = []
subscribe_campaigns = []
subscribe_event_dates = []

for _ in range(num_subscribes):
    lead_id = random.choice(eligible_leads)
    campaign_id = random.choice(lead_clicks[lead_id])

    # The subscription happens between the click and the end of the campaign
    subscribe_event_date = fake.date_time_between_dates(
        datetime_start=pd.to_datetime(first_click_dates[(lead_id, campaign_id)]),
        datetime_end=pd.to_datetime(campaign_end_dates[campaign_id]),
    )

    subscribe_leads.append(lead_id)
    subscribe_campaigns.append(campaign_id)
    subscribe_event_dates.append(subscribe_event_date)

# Sequential subscription IDs, one per subscribe event (1..num_subscribes)
subscription_ids = list(range(1, num_subscribes + 1))

subscribe_events = pd.DataFrame({
    "event_id": [fake.uuid4() for _ in range(num_subscribes)],
    "campaign_id": subscribe_campaigns,
    "lead_id": subscribe_leads,
    "event_type": ["subscribe"] * num_subscribes,
    "event_date": subscribe_event_dates,
    # The channel is drawn independently of the channel of the originating click
    "channel": [random.choice(channels) for _ in range(num_subscribes)],
    "subscription_id": subscription_ids
})

# Combine click and subscribe events into one table.
# ignore_index=True renumbers the rows; without it both frames keep their own
# 0-based labels and the combined index contains duplicates.
events_df = pd.concat([click_events, subscribe_events], ignore_index=True)

# Save the DataFrame to a CSV file (the index is not written)
csv_filename = "/Users/ekin/Documents/Projects/mealkit-delivery/data/events.csv"
events_df.to_csv(csv_filename, index=False)

print(f"CSV file '{csv_filename}' has been created with {len(events_df)} synthetic events.")

CSV file '/Users/ekin/Documents/Projects/mealkit-delivery/data/events.csv' has been created with 10000 synthetic events.


In [26]:
import pandas as pd
import random
from faker import Faker

# Faker instance for synthetic data
fake = Faker()

# How many subscriptions the table should contain
num_subscriptions = 2000

# Read the events table; its subscribe rows supply the subscription dates
events_df = pd.read_csv("/Users/ekin/Documents/Projects/mealkit-delivery/data/events.csv")

# Keep only the rows whose event_type is 'subscribe'
is_subscribe = events_df['event_type'] == 'subscribe'
subscription_events_df = events_df.loc[is_subscribe]

# The number of subscribe events must match the number of subscriptions exactly
n_subscribe_events = len(subscription_events_df)
assert n_subscribe_events == num_subscriptions, "Mismatch between number of subscriptions and subscription events."

# Allowed food preferences
food_choices = ["meat", "vegan", "fish", "veggie", "climate"]

# Draw an order count from a triangular distribution that peaks near the low end
def skewed_n_orders():
    """Return an integer order count drawn from ``random.triangular(1, 100, 5)``.

    The draw is truncated with ``int``; the mode of 5 puts most of the mass on
    small values.
    """
    lowest, highest, peak = 1, 100, 5
    sample = random.triangular(lowest, highest, peak)
    return int(sample)

# Build the per-subscription columns.
# Take the IDs from the subscribe events themselves so each row stays tied to
# the event it describes, rather than relying on row order. Click rows carry no
# subscription_id, so the column may come back from CSV as float; cast to int.
subscription_ids = subscription_events_df['subscription_id'].astype(int).tolist()
subscription_dates = pd.to_datetime(subscription_events_df['event_date']).tolist()
n_meals = [random.randint(2, 5) for _ in range(num_subscriptions)]
n_people = [random.randint(2, 4) for _ in range(num_subscriptions)]
# NOTE: this rebinds `food_choices` from the list of options to the list of
# per-subscription picks.
food_choices = [random.choice(food_choices) for _ in range(num_subscriptions)]
n_orders = []
end_dates = []

# Generate end_date and n_orders for each subscription
for start_date in subscription_dates:
    # Randomly decide if this subscription has an end_date or is ongoing
    has_end_date = random.random() < 0.8  # 80% chance the subscription has an end date

    if has_end_date:
        # Duration in days, drawn from a triangular distribution over 30-365 with mode 90
        subscription_duration_days = int(random.triangular(30, 365, 90))

        # Calculate end_date based on the start_date and duration
        end_date = start_date + pd.DateOffset(days=subscription_duration_days)

        # n_orders is drawn uniformly between 1 and a cap of one order per week
        # of subscription (at most 100).
        # NOTE(review): the draw is uniform, not skewed; `skewed_n_orders` is
        # not used here -- confirm which distribution is intended.
        max_possible_orders = min(100, subscription_duration_days // 7)
        n_orders_value = random.randint(1, max_possible_orders) if max_possible_orders > 1 else 1

    else:
        # If no end_date, consider this an ongoing subscription
        end_date = None

        # Ongoing subscriptions get between 1 and 10 orders, regardless of start date
        n_orders_value = random.randint(1, 10)

    # Append to the lists
    n_orders.append(n_orders_value)
    end_dates.append(end_date)

# Assemble the subscriptions table from the per-subscription lists
subscription_columns = {
    "subscription_id": subscription_ids,  # integer IDs
    "subscription_date": subscription_dates,
    "end_date": end_dates,  # None marks an ongoing subscription
    "n_meals": n_meals,
    "n_people": n_people,
    "n_orders": n_orders,
    "food_choice": food_choices,
}
subscriptions_df = pd.DataFrame(subscription_columns)

# Write the table to disk without the index column
subscriptions_csv_filename = "/Users/ekin/Documents/Projects/mealkit-delivery/data/subscriptions.csv"
subscriptions_df.to_csv(subscriptions_csv_filename, index=False)

print(f"CSV file '{subscriptions_csv_filename}' has been created with {len(subscriptions_df)} subscriptions.")

CSV file '/Users/ekin/Documents/Projects/mealkit-delivery/data/subscriptions.csv' has been created with 2000 subscriptions.
