In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import random
import string

# Set seed for reproducibility
np.random.seed(42)
random.seed(42)

# ======================
# 1. Malaysian Store Data (50 stores)
# ======================
def generate_stores(n_stores=50):
    # Malaysian states and major cities
    malaysia_locations = [
        ("Kuala Lumpur", "Kuala Lumpur"),
        ("Petaling Jaya", "Selangor"),
        ("Shah Alam", "Selangor"),
        ("Subang Jaya", "Selangor"),
        ("Johor Bahru", "Johor"),
        ("Ipoh", "Perak"),
        ("Kuantan", "Pahang"),
        ("Kota Kinabalu", "Sabah"),
        ("Kuching", "Sarawak"),
        ("George Town", "Penang"),
        ("Melaka City", "Melaka"),
        ("Alor Setar", "Kedah"),
        ("Kota Bharu", "Kelantan"),
        ("Seremban", "Negeri Sembilan"),
        ("Kuala Terengganu", "Terengganu")
    ]
    
    # Store name templates (common in MY)
    store_name_prefixes = [
        "MyGrocer", "Kedai Serbaneka", "Toko Serbaneka", "G Mart", "QuickMart",
        "MyMart", "Serbaneka Rakyat", "Kedai 24 Jam", "Lazada Grocer", "Shopee Mart"
    ]
    
    store_codes = [f"MY{str(i).zfill(4)}" for i in range(1, n_stores + 1)]
    store_names = [f"{random.choice(store_name_prefixes)} {i}" for i in range(1, n_stores + 1)]
    
    # Assign locations
    locations = [random.choice(malaysia_locations) for _ in range(n_stores)]
    cities = [loc[0] for loc in locations]
    states = [loc[1] for loc in locations]
    
    # Store open dates (2015â€“2024, realistic for MY retail expansion)
    open_dates = [
        (datetime(2015, 1, 1) + pd.Timedelta(days=random.randint(0, 3650))).strftime("%Y-%m-%d")
        for _ in range(n_stores)
    ]
    
    return pd.DataFrame({
        "StoreCode": store_codes,
        "StoreName": store_names,
        "City": cities,
        "State": states,
        "OpenDate": open_dates
    })

# ======================
# 2. Malaysian Item Data (200 items)
# ======================
def generate_items(n_items=200):
    # Common MY product categories
    categories = [
        "Groceries", "Beverages", "Snacks", "Personal Care", "Household",
        "Frozen Foods", "Dairy", "Baby Care", "Health Supplements", "Local Delicacies"
    ]
    
    # Popular Malaysian & regional brands
    brands = [
        "Mamee", "Maggi", "Dutch Lady", "F&N", "Julie's", "Adabi", "Ayam Brand",
        "Gardenia", "Massimo", "Vitagen", "Safi", "Biokos", "Lifebuoy", "Dettol"
    ]
    
    item_ids = [f"IT{str(i).zfill(5)}" for i in range(1, n_items + 1)]
    barcodes = [''.join(random.choices(string.digits, k=13)) for _ in range(n_items)]
    
    cat_choices = np.random.choice(categories, n_items)
    brand_choices = np.random.choice(brands, n_items)
    
    # Prices in MYR (realistic ranges)
    base_prices = []
    for cat in cat_choices:
        if cat in ["Groceries", "Beverages", "Snacks"]:
            base_prices.append(round(random.uniform(2.0, 25.0), 2))
        elif cat in ["Personal Care", "Household"]:
            base_prices.append(round(random.uniform(5.0, 45.0), 2))
        elif cat in ["Health Supplements", "Local Delicacies"]:
            base_prices.append(round(random.uniform(10.0, 80.0), 2))
        else:
            base_prices.append(round(random.uniform(3.0, 30.0), 2))
    
    cost_prices = np.array(base_prices)
    retail_prices = np.round(cost_prices * np.random.uniform(1.15, 1.6), 2)  # modest markup
    
    item_names = [
        f"{brand} {cat} {random.choice(['Original', 'Spicy', 'Family Pack', 'Mini'])}"
        for brand, cat in zip(brand_choices, cat_choices)
    ]
    
    return pd.DataFrame({
        "ItemID": item_ids,
        "ItemName": item_names,
        "Category": cat_choices,
        "Brand": brand_choices,
        "Barcode": barcodes,
        "CostPrice_MYR": cost_prices,
        "RetailPrice_MYR": retail_prices
    })

# ======================
# 3. SALES DATA: 2M rows (2021â€“2025) â€” Malaysian context
# ======================
def generate_sales_data(stores_df, items_df, n_sales=2_000_000):
    print(f"Generating {n_sales:,} Malaysian retail sales (2021â€“2025)...")
    
    store_codes = stores_df["StoreCode"].values
    item_ids = items_df["ItemID"].values
    retail_prices = items_df["RetailPrice_MYR"].values
    
    store_choices = np.random.choice(store_codes, size=n_sales)
    item_indices = np.random.choice(len(item_ids), size=n_sales)
    item_choices = item_ids[item_indices]
    unit_prices = retail_prices[item_indices]
    
    # Quantities: higher for groceries/snacks (bulk buying common in MY)
    quantities = np.random.choice(
        np.arange(1, 16),  # up to 15 units (e.g., canned drinks, Maggi packs)
        size=n_sales,
        p=[0.35, 0.20, 0.15, 0.10, 0.08, 0.04, 0.03, 0.02, 0.01, 0.005,
           0.005, 0.003, 0.003, 0.002, 0.002]
    )
    
    # Occasional promotions (common during Raya, CNY, Deepavali)
    discounts = np.random.choice([0.0, 0.05, 0.10, 0.15, 0.20], 
                                size=n_sales, 
                                p=[0.65, 0.15, 0.10, 0.07, 0.03])
    total_amount = np.round(unit_prices * quantities * (1 - discounts), 2)
    
    # Sales from 2021-01-01 to 2025-12-31
    date_range = pd.date_range(start="2021-01-01", end="2025-12-31", freq='D')
    transaction_dates = np.random.choice(date_range, size=n_sales)
    
    transaction_ids = np.array([f"TX{str(i).zfill(8)}" for i in range(1, n_sales + 1)])
    
    df = pd.DataFrame({
        "TransactionID": transaction_ids,
        "StoreCode": store_choices,
        "ItemID": item_choices,
        "Quantity": quantities,
        "UnitPrice_MYR": np.round(unit_prices, 2),
        "TotalAmount_MYR": total_amount,
        "TransactionDate": transaction_dates.astype(str)
    })
    
    return df

# ======================
# Main Execution
# ======================
if __name__ == "__main__":
    print("ðŸ‡²ðŸ‡¾ Generating Malaysian retail dummy data (2021â€“2025, 2M sales)...")
    
    stores_df = generate_stores(50)
    items_df = generate_items(200)
    
    # Save dimension tables
    stores_df.to_csv("StoreCode.csv", index=False)
    items_df.to_csv("ItemBarcode.csv", index=False)
    
    # Generate sales
    sales_df = generate_sales_data(stores_df, items_df, 2_000_000)
    print("ðŸ’¾ Writing SalesData.csv (MYR, 5 years)...")
    sales_df.to_csv("SalesData.csv", index=False)
    
    print("\nðŸŽ‰ Done! Malaysian retail dataset ready.")
    print(f"â€¢ Stores: {len(stores_df)} (across MY states)")
    print(f"â€¢ Items: {len(items_df)} (local brands & categories)")
    print(f"â€¢ Sales: {len(sales_df):,} transactions (2021â€“2025, in MYR)")
    
    # Show sample
    print("\nðŸ“‹ Sample Sales Data:")
    print(sales_df.head(3).to_string(index=False))

ðŸ‡²ðŸ‡¾ Generating Malaysian retail dummy data (2021â€“2025, 2M sales)...
Generating 2,000,000 Malaysian retail sales (2021â€“2025)...
ðŸ’¾ Writing SalesData.csv (MYR, 5 years)...

ðŸŽ‰ Done! Malaysian retail dataset ready.
â€¢ Stores: 50 (across MY states)
â€¢ Items: 200 (local brands & categories)
â€¢ Sales: 2,000,000 transactions (2021â€“2025, in MYR)

ðŸ“‹ Sample Sales Data:
TransactionID StoreCode  ItemID  Quantity  UnitPrice_MYR  TotalAmount_MYR               TransactionDate
   TX00000001    MY0034 IT00187         6          42.15           227.61 2023-06-17T00:00:00.000000000
   TX00000002    MY0002 IT00062         5          20.48           102.40 2025-01-15T00:00:00.000000000
   TX00000003    MY0032 IT00033         2          32.39            58.30 2021-05-03T00:00:00.000000000
