## 1. Required Imports

In [33]:
import random
import string
import pandas as pd
from datetime import datetime, timedelta

## 2. Read the existing data from the CSV file

In [34]:

# Load the existing records; later cells draw product rows from this frame.
existing_data = pd.read_csv(filepath_or_buffer='../datasets/existing_data.csv')

## 3. Generate Synthetic data based on the existing data 

In [35]:
total_records = 1000000  # Adjust the total number of records as needed

# One 'OID-' + 6 random digits label per record.
# NOTE(review): labels are drawn independently, so the same order number can
# appear on more than one row -- confirm whether OrderNumber must be unique.
order_numbers = ['OID-' + ''.join(random.choices(string.digits, k=6)) for _ in range(total_records)]

# Every calendar day in the range, formatted as 'YYYY-MM-DD'.
start_date = datetime(2017, 1, 1)
end_date = datetime(2022, 12, 31)
total_days = (end_date - start_date).days + 1  # +1 so end_date itself is included
transaction_dates = [(start_date + timedelta(days=i)).strftime('%Y-%m-%d') for i in range(total_days)]

# Pool of 5000 customer labels that the rows pick from.
customer_ids = ['CID-' + ''.join(random.choices(string.digits, k=6)) for _ in range(5000)]

# Draw all source rows in ONE pandas call (with replacement) instead of calling
# DataFrame.sample() once per record: a per-row sample() inside a loop of
# `total_records` iterations dominates the runtime. Only the three columns that
# are actually used are sampled, which keeps the temporary frame small.
sampled_rows = existing_data[['ProductName', 'Manufacturer', 'Quantity']].sample(
    n=total_records, replace=True
)
sampled_products = sampled_rows['ProductName'].tolist()
sampled_manufacturers = sampled_rows['Manufacturer'].tolist()
sampled_quantities = sampled_rows['Quantity'].tolist()

# Generating synthetic data
data = []
for order_number, product, manufacturer, raw_qty in zip(
        order_numbers, sampled_products, sampled_manufacturers, sampled_quantities):
    # The `random` draws stay in the same per-row order as before
    # (date, customer, product ID).
    transaction_date = random.choice(transaction_dates)
    customer_id = random.choice(customer_ids)

    # Coerce to a plain Python int, exactly as the per-row version did.
    qty = int(raw_qty)

    # Generate a 6-digit product ID
    # NOTE(review): the ID is random per row, so the same ProductName gets
    # different ProductIDs across rows -- confirm this is intended.
    product_id = 'PID-' + ''.join(random.choices(string.digits, k=6))

    data.append([order_number, transaction_date, customer_id, product_id, qty, product, manufacturer])

## 4. Sort the generated data by transaction date

In [36]:
# Order the rows in place by transaction date. The date is the second item of
# each row and is a 'YYYY-MM-DD' string, so string order equals calendar order.
data.sort(key=lambda record: record[1])

## 5. Create a dataframe with the generated data

In [37]:
# Build the table from the generated rows; the column labels are listed in the
# same positional order as the values inside each row.
df = pd.DataFrame(
    data=data,
    columns=[
        'OrderNumber',
        'TransactionDate',
        'CustomerID',
        'ProductID',
        'Quantity',
        'ProductName',
        'Manufacturer',
    ],
)

## 6. Write the dataframe to an Excel file with each year's data in a separate sheet

In [38]:
# Write one worksheet per calendar year. The writer is used as a context
# manager so the workbook is saved and closed when the block exits, even if a
# sheet write raises; the explicit `writer.save()` call was removed in
# pandas 2.0.
with pd.ExcelWriter('../datasets/sales_data_latest.xlsx', engine='xlsxwriter') as writer:
    # The first four characters of the 'YYYY-MM-DD' date string are the year.
    year_groups = df.groupby(df['TransactionDate'].str[:4])
    for year, group in year_groups:
        year_sheet_name = f'{year} sales'
        group.to_excel(writer, sheet_name=year_sheet_name, index=False)