In [6]:
import pandas as pd
import random
import numpy as np
from datetime import timedelta

In [None]:
# Read the source workbook into `data`; the cells below sample from its columns.
file_path = 'online_retail.xlsx'  # Replace with your file path
data = pd.read_excel(io=file_path)

In [None]:
# Define the number of additional records to generate
original_size = len(data)
desired_size = 1_001_200
# Clamp at zero: if the source already holds desired_size rows or more, the
# difference is negative and a negative sample size makes the NumPy sampling
# calls in the cells below raise instead of simply adding nothing.
additional_records = max(desired_size - original_size, 0)

# Seed both random generators used by the cells below (`random` and
# `np.random`) so that Restart Kernel -> Run All reproduces the same rows.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [None]:
def _random_invoice_no():
    """Return one random invoice id: a six-digit number, a dash, then 1..99."""
    return f"{random.randint(500000, 999999)}-{random.randint(1, 99)}"


# Start from an empty frame; each generation step adds one synthetic column.
synthetic_data = pd.DataFrame()

# One independently drawn invoice identifier per synthetic row.
synthetic_data['InvoiceNo'] = [_random_invoice_no() for _ in range(additional_records)]

In [None]:
# Draw one stock code per synthetic row from the original StockCode column.
synthetic_data['StockCode'] = np.random.choice(data['StockCode'], size=additional_records)

# Attach descriptions through a StockCode -> Description lookup built from the
# original rows; when a code occurs more than once, its last description wins.
description_by_code = dict(zip(data['StockCode'], data['Description']))
synthetic_data['Description'] = synthetic_data['StockCode'].map(description_by_code)

# Generate random quantities: negative values model returns, positive values
# model sales. Zero is excluded because a line item with no units is neither.
# Draw from the 69 integers in [-20, 48] and shift the non-negative ones up by
# one, which yields a uniform draw over {-20..-1, 1..49}.
quantities = np.random.randint(-20, 49, additional_records)
quantities[quantities >= 0] += 1
synthetic_data['Quantity'] = quantities

In [None]:
# Generate realistic unit prices: sample an original price and jitter it by up
# to +/-0.50.
base_prices = np.random.choice(data['UnitPrice'], additional_records)
price_noise = np.random.uniform(-0.5, 0.5, additional_records)
# The additive jitter can push any price below 0.50 under zero, so clamp at 0
# first, then round to two decimals to avoid long floating-point tails in the
# exported values.
synthetic_data['UnitPrice'] = np.round(np.clip(base_prices + price_noise, 0, None), 2)

# Randomly generate timestamps within the same range as the original dataset.
start_date = data['InvoiceDate'].min()
end_date = data['InvoiceDate'].max()

# Draw a whole-second offset in [0, span] for every row and add it to the
# earliest timestamp. Working in seconds (rather than whole days) varies the
# time of day instead of repeating start_date's clock time on every row, and
# the vectorised draw replaces a Python-level loop over every record.
span_seconds = int((end_date - start_date).total_seconds())
second_offsets = np.random.randint(0, span_seconds + 1, additional_records)
synthetic_data['InvoiceDate'] = start_date + pd.to_timedelta(second_offsets, unit='s')

In [None]:
# Generate Customer IDs and Countries.
# Pick one source row position per synthetic row and copy BOTH columns from
# that row. Sampling the two columns independently would pair customers with
# arbitrary countries; sampling rows keeps every (CustomerID, Country)
# combination one that actually occurs in the original data.
source_rows = np.random.randint(0, len(data), additional_records)
synthetic_data['CustomerID'] = data['CustomerID'].to_numpy()[source_rows]
synthetic_data['Country'] = data['Country'].to_numpy()[source_rows]

# Stack the synthetic rows beneath the original ones and renumber the index
# from zero across the combined frame.
expanded_data = pd.concat(objs=[data, synthetic_data], axis=0, ignore_index=True)

In [None]:
# Write the combined frame to CSV without the index column, then report the
# final row count.
expanded_data.to_csv(path_or_buf='expanded_online_retail.csv', index=False)

print(f"Dataset expanded to {len(expanded_data)} records and saved as 'expanded_online_retail.csv'")