In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import random

In [2]:
# Helper functions
def random_date(start, end):
    return start + timedelta(seconds=np.random.randint(0, int((end - start).total_seconds())))

def generate_sample_dates(start_date, end_date, num_records):
    return [random_date(start_date, end_date).strftime('%Y-%m-%d') for _ in range(num_records)]

def generate_time():
    time = datetime.strptime('00:00', '%H:%M') + timedelta(minutes=np.random.randint(0, 1440))
    return time.strftime('%H:%M')

In [3]:
num_records = 1000

In [10]:
# Generate a random number of records between 500 and 1000
num_records = 500

# Function to generate sample dates
def generate_sample_dates(start_date, end_date, n):
    """Generate n random dates between start_date and end_date"""
    date_range = (end_date - start_date).days
    random_days = sorted(random.sample(range(date_range), n))
    return [(start_date + timedelta(days=day)).strftime('%Y-%m-%d') for day in random_days]

# Function to generate random time
def generate_time():
    """Generate a random time in HH:MM:SS format"""
    hours = random.randint(0, 23)
    minutes = random.randint(0, 59)
    seconds = random.randint(0, 59)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

# Dynamically calculate date ranges for the last 3 years from current date
end_date = datetime.now()
start_date = end_date - timedelta(days=3*365)  # Approximately 3 years

print(f"Generating sample data with {num_records} records from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")

# Use context manager to write Excel file
with pd.ExcelWriter('./data/sample_data.xlsx', engine='openpyxl') as writer:
    # 1. Daily DFR
    pd.DataFrame({
        'REPORT_DATE': generate_sample_dates(start_date, end_date, num_records),
        'START_TIME': [generate_time() for _ in range(num_records)],
        'FINISH_TIME': [generate_time() for _ in range(num_records)],
        'ELAPSED_HOURS': np.random.uniform(1, 12, num_records).round(2)
    }).to_excel(writer, sheet_name='Daily DFR', index=False)
    
    # 2. Volume All
    pd.DataFrame({
        'Date': generate_sample_dates(start_date, end_date, num_records),
        'Auth': np.random.randint(1000, 5000, num_records),
        'Txn': np.random.randint(100, 1000, num_records)
    }).to_excel(writer, sheet_name='Volume All', index=False)
    
    # 3. Volume (Subm)-Top Merchants
    # Create monthly dates spanning the 3-year period
    months_range = pd.date_range(start_date, end_date, freq='M')
    # Take only the required number of months (or all if less than num_records)
    months_data = months_range[:min(len(months_range), num_records)]
    
    pd.DataFrame({
        'Month': months_data.strftime('%Y-%m'),
        'Metropolitan (MTA)': np.random.randint(10000, 50000, len(months_data))
    }).to_excel(writer, sheet_name='Volume (Subm)-Top Merchants', index=False)
    
    # For 3-year monthly data sheets
    months = pd.date_range(start_date, end_date, freq='M').strftime('%Y-%m').tolist()
    
    # If months list is longer than num_records, sample from it
    if len(months) > num_records:
        months_sample = random.sample(months, num_records)
    else:
        # Otherwise use all months and repeat if necessary
        months_sample = (months * (num_records // len(months) + 1))[:num_records]
    
    entities = ['Sales', 'Marketing', 'Finance']
    companies = ['Company A', 'Company B', 'Company C']
    reports = ['Monthly Sales', 'Quarterly Financial', 'Marketing Analysis']
    frequencies = ['Monthly', 'Weekly', 'Daily']
    
    common_data = {
        'COMPANY_ID': np.random.randint(1000, 9999, num_records),
        'ENTITY': np.random.choice(entities, num_records),
        'BREAKDOWN_LVL': np.random.choice(['High', 'Medium', 'Low'], num_records),
        'ENTITY_ID': np.random.randint(100, 999, num_records),
        'DB_KEY': np.random.randint(100000, 999999, num_records),
        'COMPANY_NAME': np.random.choice(companies, num_records),
        'REPORT_NAME': np.random.choice(reports, num_records),
        'REPORT_DATE': np.random.choice(months_sample, num_records),
        'FREQUENCY': np.random.choice(frequencies, num_records),
        'START_TIME': [generate_time() for _ in range(num_records)],
        'FINISH_TIME': [generate_time() for _ in range(num_records)],
        'ELAPSED_HOURS': np.random.uniform(1, 24, num_records).round(2)
    }
    
    # Generate sheets
    sheet_names = [
        'Daily Top Web Reports', 'Daily Top GRPT DFR Reports',
        'Top monthly web reports', 'Top monthly Grpt DFR reports (>10 hrs)'
    ]
    
    for name in sheet_names:
        data = common_data.copy()
        if 'BREAKDOWN_LVL' not in name:
            data.pop('BREAKDOWN_LVL')
        df = pd.DataFrame(data)
        if name == 'Top monthly Grpt DFR reports (>10 hrs)':
            df = df[df['ELAPSED_HOURS'] > 10]
        df.to_excel(writer, sheet_name=name, index=False)
    
    # Monthend Duration-Web, Grpt
    # Use the available months, up to num_records
    month_count = min(len(months), num_records)
    
    pd.DataFrame({
        'REPORT_DATE': months[:month_count],
        'START_TIME': [generate_time() for _ in range(month_count)],
        'FINISH_TIME': [generate_time() for _ in range(month_count)],
        'MAX_DURATION_HOURS': np.random.uniform(5, 20, month_count).round(2)
    }).to_excel(writer, sheet_name='Monthend Duration Web GRPT', index=False)
    
    # Daily Web
    pd.DataFrame({
        'REPORT_DATE': generate_sample_dates(start_date, end_date, num_records),
        'START_TIME': [generate_time() for _ in range(num_records)],
        'FINISH_TIME': [generate_time() for _ in range(num_records)],
        'ELAPSED_HOURS': np.random.uniform(1, 12, num_records).round(2)
    }).to_excel(writer, sheet_name='Daily Web', index=False)

print(f"Excel file 'sample_data.xlsx' created successfully with {num_records} records spanning from {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}!")

Generating sample data with 500 records from 2022-05-11 to 2025-05-10
Excel file 'sample_data.xlsx' created successfully with 500 records spanning from 2022-05-11 to 2025-05-10!


  months_range = pd.date_range(start_date, end_date, freq='M')
  months = pd.date_range(start_date, end_date, freq='M').strftime('%Y-%m').tolist()
