# Synthetic Sales Data Generation: Key Fields 

## Key Fields
1. **Date**: Daily data for 2 years, including holidays or special events (e.g., Diwali, New Year).
2. **Price Per Unit**: Varies with seasonal trends, random fluctuations.
3. **Units Sold**: Depends inversely on price, with added seasonal variations, random noise, and occasional outliers.



In [2]:
import pandas as pd
import numpy as np


Generating dates

In [3]:
start_date = "2023-01-01"
end_date = "2024-12-31"
dates = pd.date_range(start=start_date, end=end_date)

Generating Price Per Unit


In [4]:
base_price = 50 + 10 * np.sin(2 * np.pi * dates.dayofyear / 365)  
price_noise = np.random.uniform(-2, 2, size=len(dates))  
price_per_unit = base_price + price_noise

In [5]:
price_per_unit = np.array(price_per_unit)

In [6]:
price_per_unit

array([50.5466109 , 52.19318172, 49.09703683, 52.60747988, 50.18269429,
       49.14456681, 53.17775999, 52.50294932, 52.49578188, 52.77952832,
       51.86736551, 51.61322494, 53.53373747, 53.38932469, 51.40229952,
       52.69033174, 54.11271946, 51.50793275, 53.60654446, 53.94884209,
       52.75636212, 54.12486023, 54.18785269, 52.16923706, 53.67335794,
       55.88040716, 54.4206664 , 54.78097552, 54.10500933, 56.87849611,
       56.72406302, 56.43848642, 56.92253783, 54.83196744, 55.25242981,
       55.48769068, 54.34664446, 54.95428494, 54.89524219, 56.11214182,
       58.21343912, 57.93189483, 58.33791281, 55.13792818, 57.45391589,
       55.76423174, 58.76718048, 57.56358445, 56.0405609 , 57.81096803,
       56.33183097, 59.20399031, 59.1936362 , 57.06716691, 56.27235073,
       58.46708008, 56.51476772, 57.14602094, 59.83702688, 59.31151524,
       58.02074145, 60.54062177, 59.97731981, 57.56019424, 58.49579439,
       60.53288942, 59.82861448, 60.46740111, 58.58702955, 59.74

Applying discounts during specific periods

In [7]:
discount_periods = [
    ("2023-10-15", "2023-11-15"),  # Diwali
    ("2024-12-20", "2024-12-31"),  # New Year
]
for start, end in discount_periods:
    discount_indices = (dates >= start) & (dates <= end)
    price_per_unit[discount_indices] -= np.random.uniform(2, 5, size=discount_indices.sum())

In [8]:
price_per_unit

array([50.5466109 , 52.19318172, 49.09703683, 52.60747988, 50.18269429,
       49.14456681, 53.17775999, 52.50294932, 52.49578188, 52.77952832,
       51.86736551, 51.61322494, 53.53373747, 53.38932469, 51.40229952,
       52.69033174, 54.11271946, 51.50793275, 53.60654446, 53.94884209,
       52.75636212, 54.12486023, 54.18785269, 52.16923706, 53.67335794,
       55.88040716, 54.4206664 , 54.78097552, 54.10500933, 56.87849611,
       56.72406302, 56.43848642, 56.92253783, 54.83196744, 55.25242981,
       55.48769068, 54.34664446, 54.95428494, 54.89524219, 56.11214182,
       58.21343912, 57.93189483, 58.33791281, 55.13792818, 57.45391589,
       55.76423174, 58.76718048, 57.56358445, 56.0405609 , 57.81096803,
       56.33183097, 59.20399031, 59.1936362 , 57.06716691, 56.27235073,
       58.46708008, 56.51476772, 57.14602094, 59.83702688, 59.31151524,
       58.02074145, 60.54062177, 59.97731981, 57.56019424, 58.49579439,
       60.53288942, 59.82861448, 60.46740111, 58.58702955, 59.74

Generate Units Sold

In [19]:
base_demand = 1000 - 20 * price_per_unit
seasonal_demand = 50 * np.sin(2 * np.pi * dates.dayofyear / 365)  
noise = np.random.normal(0, 10, len(dates))  
units_sold = base_demand + seasonal_demand + noise
units_sold = np.maximum(units_sold, 0)
units_sold = np.round(units_sold).astype(int)

In [20]:
units_sold

Index([  0,   0,   9,   0,   0,  23,   0,   0,   0,   0,
       ...
        86,  41, 104,  86,  41,  95,  61,  49,  67,  69],
      dtype='int64', length=731)

In [21]:
data = pd.DataFrame({
    "date": dates,
    "price_per_unit": price_per_unit,
    "units_sold": units_sold
})

In [22]:
import os

file_path = "./csv_data/synthetic_sales_data.csv"  
os.makedirs(os.path.dirname(file_path), exist_ok=True)  
data.to_csv(file_path, index=False)
print(f"File saved at: {file_path}")

File saved at: ./csv_data/synthetic_sales_data.csv
