# This code generates the fake dataset for this project


### 1. 325 financial files, each with 3-14 tabs; each tab represents one service
### 2. 25 operation files


In [6]:
import pandas as pd
import numpy as np
from faker import Faker
import os
import Config

# A single fixed seed feeds both random sources (Faker and NumPy) so that
# repeated runs draw the same random sequences.
RANDOM_SEED = 42

fake = Faker()
Faker.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# --------------------------------------------------------
# CONFIG
# --------------------------------------------------------
# Output locations: both folders live directly under the raw-data root
# taken from the project's Config module.
BASE_PATH = Config.BASE_RAW
FINANCE_DIR = os.path.join(BASE_PATH, "finance")
BACKEND_DIR = os.path.join(BASE_PATH, "backend")

# Create the output folders up front; exist_ok=True makes re-runs safe.
for _output_dir in (FINANCE_DIR, BACKEND_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Dataset dimensions
TOTAL_AREAS = 325
TOTAL_SERVICES = 14
TOTAL_BACKEND_FILES = 25   # 14 service-based + 11 failsafe

# Service labels ("Service_1" ... "Service_<TOTAL_SERVICES>") and the
# 1-based area ids (1 ... TOTAL_AREAS).
services = [f"Service_{i}" for i in range(1, TOTAL_SERVICES + 1)]
areas = [*range(1, TOTAL_AREAS + 1)]

# --------------------------------------------------------
# 1. FINANCE DATA (325 Excel files with multiple sheets)
# --------------------------------------------------------
for area in areas:
    # Number of services (= sheets) this area contains: 3–14 with the default
    # configuration. Both bounds are derived from the service list so that the
    # sample without replacement below can never request more services than
    # exist, even if TOTAL_SERVICES is lowered.
    n_area_services = np.random.randint(min(3, len(services)), len(services) + 1)
    area_services = np.random.choice(services, size=n_area_services, replace=False)

    # One workbook per area; the context manager saves and closes it on exit.
    file_path = os.path.join(FINANCE_DIR, f"finance_area_{area}.xlsx")
    with pd.ExcelWriter(file_path, engine="xlsxwriter") as writer:

        # One sheet per service, named after the service.
        for service in area_services:
            # randint's upper bound is exclusive, so this yields 50–199 rows.
            n_records = np.random.randint(50, 200)

            # area_id and service_type are scalars; pandas repeats them on
            # every row of the frame.
            df = pd.DataFrame({
                "transaction_id": [fake.uuid4() for _ in range(n_records)],
                "area_id": area,
                "service_type": service,
                "transaction_amount": np.random.uniform(50, 2000, size=n_records).round(2),
                "transaction_date": [
                    fake.date_between(start_date="-2y", end_date="today")
                    for _ in range(n_records)
                ],
            })

            df.to_excel(writer, sheet_name=service, index=False)


# --------------------------------------------------------
# 2. BACKEND OPERATIONAL DATA (25 datasets)
# --------------------------------------------------------

# ---- Primary operational datasets: one CSV per service (14 files)
for service_name in services:
    row_count = 30000

    # Columns are generated in the same order as they appear in the frame, so
    # the seeded random generators are consumed in a fixed sequence.
    record_ids = [fake.uuid4() for _ in range(row_count)]
    area_ids = np.random.choice(areas, size=row_count)
    units = np.random.randint(1, 500, size=row_count)
    costs = np.random.uniform(20, 1000, size=row_count).round(2)
    operation_dates = [
        fake.date_between(start_date="-2y", end_date="today")
        for _ in range(row_count)
    ]

    # service_type is a scalar; pandas repeats it on every row.
    operations = pd.DataFrame({
        "record_id": record_ids,
        "area_id": area_ids,
        "service_type": service_name,
        "units_processed": units,
        "processing_cost": costs,
        "operation_date": operation_dates,
    })

    operations.to_csv(
        os.path.join(BACKEND_DIR, f"backend_operations_{service_name}.csv"),
        index=False,
    )

# ---- Failsafe/redundant operational datasets: the backend files that remain
# ---- after the per-service ones (25 - 14 = 11), numbered from 1
for file_number in range(1, TOTAL_BACKEND_FILES - TOTAL_SERVICES + 1):
    row_count = 3000

    # Columns are generated in the same order as they appear in the frame, so
    # the seeded random generators are consumed in a fixed sequence.
    record_ids = [fake.uuid4() for _ in range(row_count)]
    area_ids = np.random.choice(areas, size=row_count)
    # Unlike the per-service files, each row gets its own random service.
    service_types = np.random.choice(services, size=row_count)
    units = np.random.randint(1, 500, size=row_count)
    costs = np.random.uniform(20, 1000, size=row_count).round(2)
    operation_dates = [
        fake.date_between(start_date="-2y", end_date="today")
        for _ in range(row_count)
    ]

    failsafe_operations = pd.DataFrame({
        "record_id": record_ids,
        "area_id": area_ids,
        "service_type": service_types,
        "units_processed": units,
        "processing_cost": costs,
        "operation_date": operation_dates,
    })

    failsafe_operations.to_csv(
        os.path.join(BACKEND_DIR, f"backend_operations_failsafe_{file_number}.csv"),
        index=False,
    )

# Final status report: a completion message plus the two output folders,
# one item per line.
print(
    "Dataset generation completed successfully.",
    f"Finance files saved to: {FINANCE_DIR}",
    f"Backend files saved to: {BACKEND_DIR}",
    sep="\n",
)


Dataset generation completed successfully.
Finance files saved to: C:\Users\Gauri\Documents\End to End Reporting Project\finance
Backend files saved to: C:\Users\Gauri\Documents\End to End Reporting Project\backend
