## Synthetic Data Generation for the Use Case 🍭

In [None]:
import pandas as pd
import numpy as np
import random


In [None]:
# Simplified per-region demand profiles. Each entry carries the region's
# baseline consumption and the factor applied to it in a holiday season.
regions = {
    name: {"base_consumption": base, "holiday_multiplier": multiplier}
    for name, base, multiplier in (
        # (region, base_consumption, holiday_multiplier)
        ("Marmara", 12000, 1.8),            # highest baseline, largest holiday uplift
        ("Aegean", 8000, 1.5),
        ("Mediterranean", 10000, 1.6),
        ("Black Sea", 7000, 1.4),
        ("Central Anatolia", 5000, 1.3),
        ("Eastern Anatolia", 4000, 1.2),    # lowest baseline, smallest holiday uplift
        ("Southeast Anatolia", 6000, 1.5),
    )
}


In [None]:
# Function to generate synthetic data with regional variations
def generate_synthetic_data(num_rows=1000, years=range(2014, 2024), *, seed=None, region_profiles=None):
    """Build a DataFrame of randomly generated regional demand rows.

    Every row draws a year, a region and four random features, then derives
    ``Demand`` from the region's base consumption:

    * multiplied by 1.1 when ``Holiday_Promotion`` is 1;
    * multiplied by the region's ``holiday_multiplier`` when
      ``Holiday_Season`` is 1 (even years -- a deliberate simplification);
    * plus Gaussian noise whose standard deviation is 10% of that value.

    ``Population``, ``Income_Level`` and ``Bakery_Count`` are drawn
    independently and do not influence ``Demand``.

    Args:
        num_rows: Number of rows to generate.
        years: Iterable of candidate years. It is materialised once, so
            generators and sets are accepted as well as sequences.
        seed: Optional non-negative integer. When given, all randomness
            comes from private generators seeded with it, so the same seed
            yields the same DataFrame. When ``None``, values are drawn from
            the global ``random`` and ``np.random`` state.
        region_profiles: Optional mapping of region name to a dict with
            ``"base_consumption"`` and ``"holiday_multiplier"`` entries.
            Defaults to the module-level ``regions`` table.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Region``,
        ``Holiday_Promotion``, ``Population``, ``Income_Level``,
        ``Bakery_Count``, ``Demand``, ``Year`` and ``Holiday_Season``.
    """
    profiles = regions if region_profiles is None else region_profiles

    # The stdlib and NumPy generators are independent; seed both (or
    # neither) so that a single ``seed`` fully determines the output.
    if seed is None:
        py_rng, np_rng = random, np.random
    else:
        py_rng, np_rng = random.Random(seed), np.random.default_rng(seed)

    # Loop-invariant candidate pools, built once.
    year_pool = list(years)
    region_names = list(profiles)

    data = []
    for _ in range(num_rows):
        # Randomly select year and region
        year = py_rng.choice(year_pool)
        region = py_rng.choice(region_names)

        # Generate features
        holiday_promotion = py_rng.choice([0, 1])  # 1 means promotion, 0 means no promotion
        population = py_rng.randint(1000000, 15000000)  # Random population size
        income_level = py_rng.randint(1500, 25000)  # Random income level (in Turkish Lira)
        bakery_count = py_rng.randint(100, 5000)  # Random number of bakeries

        profile = profiles[region]
        base_consumption = profile["base_consumption"]

        # 10% increase during promotions
        if holiday_promotion:
            base_consumption *= 1.1

        # Simplified holiday flag: even years count as a holiday season
        holiday_season = 1 if year % 2 == 0 else 0

        # The exponent switches the regional multiplier on (1) or off (0)
        consumption = base_consumption * profile["holiday_multiplier"] ** holiday_season

        # Add random noise (standard deviation = 10% of the value)
        consumption += np_rng.normal(loc=0, scale=consumption * 0.1)

        data.append([region, holiday_promotion, population, income_level, bakery_count, consumption, year, holiday_season])

    # Create DataFrame
    df = pd.DataFrame(data, columns=["Region", "Holiday_Promotion", "Population", "Income_Level", "Bakery_Count", "Demand", "Year", "Holiday_Season"])

    return df

In [None]:
# Build the dataset with the generator's defaults, write it to CSV without
# the index column, and print the first rows as a quick preview.
output_csv = "synthetic_baklava_data_turkey.csv"
df = generate_synthetic_data()
df.to_csv(output_csv, index=False)
print(df.head())