COGS DATA

In [1]:
import pandas as pd
import numpy as np
import uuid
from datetime import datetime, timedelta
import random
import os

def load_products_data(products_file="data/products.csv"):
    """
    Read the products table produced by an earlier generation step.

    Parameters:
    - products_file: path of the CSV file holding the products data

    Returns:
    - DataFrame with the products data, or None when the file does not exist
    """
    try:
        products = pd.read_csv(products_file)
    except FileNotFoundError:
        # A missing file is not fatal: tell the user and signal it with None
        for message in (
            f"Note: Products data file {products_file} not found.",
            "COGS will be generated with synthetic product IDs.",
        ):
            print(message)
        return None
    return products

def load_batches_data(batches_file="data/batches.csv"):
    """
    Read the batches table produced by an earlier generation step.

    Parameters:
    - batches_file: path of the CSV file holding the batches data

    Returns:
    - DataFrame with the batches data, or None when the file does not exist
    """
    try:
        batches = pd.read_csv(batches_file)
    except FileNotFoundError:
        # A missing file is not fatal: tell the user and signal it with None
        for message in (
            f"Note: Batches data file {batches_file} not found.",
            "COGS will be generated with synthetic batch IDs.",
        ):
            print(message)
        return None
    return batches

def load_work_orders_data(work_orders_file="data/work_orders.csv"):
    """
    Read the work orders table produced by an earlier generation step.

    Parameters:
    - work_orders_file: path of the CSV file holding the work orders data

    Returns:
    - DataFrame with the work orders data, or None when the file does not exist
    """
    try:
        work_orders = pd.read_csv(work_orders_file)
    except FileNotFoundError:
        # A missing file is not fatal: tell the user and signal it with None
        for message in (
            f"Note: Work orders data file {work_orders_file} not found.",
            "COGS will be generated with synthetic work order IDs.",
        ):
            print(message)
        return None
    return work_orders

def load_costs_data(costs_file="data/costs.csv"):
    """
    Read the costs table produced by an earlier generation step.

    Parameters:
    - costs_file: path of the CSV file holding the costs data

    Returns:
    - DataFrame with the costs data, or None when the file does not exist
    """
    try:
        costs = pd.read_csv(costs_file)
    except FileNotFoundError:
        # A missing file is not fatal: tell the user and signal it with None
        for message in (
            f"Note: Costs data file {costs_file} not found.",
            "COGS will be generated without reference to existing costs.",
        ):
            print(message)
        return None
    return costs

def load_material_consumption_data(material_consumption_file="data/material_consumption.csv"):
    """
    Read the material consumption table produced by an earlier generation step.

    Parameters:
    - material_consumption_file: path of the CSV file holding the material consumption data

    Returns:
    - DataFrame with the material consumption data, or None when the file does not exist
    """
    try:
        consumption = pd.read_csv(material_consumption_file)
    except FileNotFoundError:
        # A missing file is not fatal: tell the user and signal it with None
        for message in (
            f"Note: Material consumption data file {material_consumption_file} not found.",
            "COGS will be generated without reference to existing material consumption.",
        ):
            print(message)
        return None
    return consumption

# Fraction of the target cost, as a (min, max) range, drawn independently for
# each cost component of a record; keyed by product family. Because the six
# draws are independent they do not necessarily add up to 100%.
_COSTING_PROFILES = {
    "Pharmaceutical": {
        "direct_materials": (0.3, 0.5),
        "direct_labor": (0.1, 0.2),
        "manufacturing_overhead": (0.15, 0.25),
        "packaging": (0.05, 0.1),
        "quality": (0.1, 0.2),
        "other": (0.05, 0.1),
    },
    "Food & Beverage": {
        "direct_materials": (0.4, 0.6),
        "direct_labor": (0.1, 0.2),
        "manufacturing_overhead": (0.1, 0.2),
        "packaging": (0.1, 0.15),
        "quality": (0.05, 0.1),
        "other": (0.05, 0.1),
    },
    "Chemical": {
        "direct_materials": (0.5, 0.65),
        "direct_labor": (0.05, 0.15),
        "manufacturing_overhead": (0.15, 0.25),
        "packaging": (0.05, 0.1),
        "quality": (0.05, 0.1),
        "other": (0.05, 0.1),
    },
    "Electronics": {
        "direct_materials": (0.5, 0.7),
        "direct_labor": (0.1, 0.2),
        "manufacturing_overhead": (0.1, 0.2),
        "packaging": (0.03, 0.07),
        "quality": (0.05, 0.1),
        "other": (0.03, 0.1),
    },
    "Automotive": {
        "direct_materials": (0.6, 0.75),
        "direct_labor": (0.1, 0.15),
        "manufacturing_overhead": (0.1, 0.15),
        "packaging": (0.01, 0.05),
        "quality": (0.05, 0.1),
        "other": (0.05, 0.1),
    },
    "Consumer Goods": {
        "direct_materials": (0.4, 0.55),
        "direct_labor": (0.15, 0.25),
        "manufacturing_overhead": (0.1, 0.2),
        "packaging": (0.1, 0.2),
        "quality": (0.05, 0.1),
        "other": (0.05, 0.1),
    },
}

# Family used whenever a product's family is unknown or has no profile.
_DEFAULT_FAMILY = "Consumer Goods"

# (min, max) per-unit base cost used when a product has no base cost of its own.
_BASE_COST_RANGES = {
    "Pharmaceutical": (50, 5000),
    "Food & Beverage": (5, 100),
    "Chemical": (20, 500),
    "Electronics": (50, 2000),
    "Automotive": (100, 3000),
    "Consumer Goods": (10, 300),
}

# (profile key, output column) for every cost component, in output order.
_COST_COMPONENTS = (
    ("direct_materials", "direct_materials_cost"),
    ("direct_labor", "direct_labor_cost"),
    ("manufacturing_overhead", "manufacturing_overhead_cost"),
    ("packaging", "packaging_cost"),
    ("quality", "quality_cost"),
    ("other", "other_cost"),
)

# (min, max) multiplier applied to the base cost for each COGS type.
_COST_MULTIPLIER_RANGES = {
    "Standard": (1.0, 1.0),
    "Actual": (0.9, 1.1),
    "Variance": (0.8, 1.2),
}

# Length in days of each product-level reporting period.
_PERIOD_LENGTH_DAYS = {"Monthly": 30, "Quarterly": 90, "Annual": 365}

_CURRENCIES = ["USD", "EUR", "GBP", "JPY", "CAD"]
_CURRENCY_WEIGHTS = [0.7, 0.1, 0.1, 0.05, 0.05]  # Mostly USD

# Column order of the generated table.
_COGS_COLUMNS = [
    "cogs_id", "product_id", "batch_id", "work_order_id", "period_type",
    "period_start_date", "period_end_date", "cogs_type",
    "direct_materials_cost", "direct_labor_cost", "manufacturing_overhead_cost",
    "packaging_cost", "quality_cost", "other_cost", "total_cogs",
    "units_produced", "cost_per_unit", "currency", "calculation_date", "notes",
]


def _first_value_by_batch(batches_df, column):
    """Map each batch_id to `column`'s value in its first row ({} when unavailable)."""
    if batches_df is None or len(batches_df) == 0 or column not in batches_df.columns:
        return {}
    first_rows = batches_df.drop_duplicates(subset="batch_id", keep="first")
    return dict(zip(first_rows["batch_id"], first_rows[column]))


def _synthetic_base_cost(family):
    """Draw a per-unit base cost from the range of `family` (default family if unknown)."""
    low, high = _BASE_COST_RANGES.get(family, _BASE_COST_RANGES[_DEFAULT_FAMILY])
    return random.uniform(low, high)


def _random_day_in_window(start_time, window_days):
    """Return start_time shifted by a random whole number of days inside the window."""
    # max(..., 0) keeps randint valid when the window is shorter than one day
    return start_time + timedelta(days=random.randint(0, max(window_days - 1, 0)))


def generate_cogs_data(products_df=None, batches_df=None, work_orders_df=None, 
                     costs_df=None, material_consumption_df=None, num_cogs=200,
                     start_time=None, end_time=None, output_file="data/cogs.csv"):
    """
    Generate synthetic data for the COGS (Cost of Goods Sold) table.

    About 70% of the records are batch-level (period type "Batch") and the
    rest are product-level ("Monthly", "Quarterly" or "Annual"). Each record's
    total_cogs is the sum of its six rounded cost components. The result is
    written to output_file (parent directories are created) and returned.

    Parameters:
    - products_df: DataFrame containing products data (optional). Uses the
      'product_id' column and, when present, 'product_family' and 'base_cost'.
    - batches_df: DataFrame containing batches data (optional). Uses the
      'batch_id' column and, when present, 'product_id', 'work_order_id',
      'batch_size', 'actual_start_time' and 'actual_end_time'.
    - work_orders_df: DataFrame containing work orders data (optional). A
      batch's 'work_order_id' is only copied to a record when this is given;
      otherwise work_order_id stays empty.
    - costs_df: DataFrame containing costs data (optional, currently not used)
    - material_consumption_df: DataFrame containing material consumption data
      (optional, currently not used)
    - num_cogs: Number of COGS records to generate
    - start_time: Start time for COGS dates (defaults to 365 days ago)
    - end_time: End time for COGS dates (defaults to current date)
    - output_file: CSV file to save the data

    Returns:
    - DataFrame containing the generated COGS data

    Raises:
    - ValueError: if end_time is earlier than start_time
    """
    # Set default time range if not provided
    if start_time is None:
        start_time = datetime.now() - timedelta(days=365)
    if end_time is None:
        end_time = datetime.now()
    if end_time < start_time:
        raise ValueError("end_time must not be earlier than start_time")
    window_days = (end_time - start_time).days

    family_names = list(_COSTING_PROFILES)

    # Products: take the real IDs/families when available, otherwise invent them
    if products_df is None or len(products_df) == 0:
        print("Generating synthetic product IDs...")
        product_ids = [f"PROD-{uuid.uuid4().hex[:8].upper()}" for _ in range(30)]
        product_families = {pid: random.choice(family_names) for pid in product_ids}
    else:
        product_ids = products_df['product_id'].tolist()
        if 'product_family' in products_df.columns:
            product_families = dict(zip(products_df['product_id'], products_df['product_family']))
        else:
            product_families = {pid: random.choice(family_names) for pid in product_ids}

    # Batches: take the real IDs and product links when available
    if batches_df is None or len(batches_df) == 0:
        print("Generating synthetic batch IDs...")
        batch_ids = [f"BATCH-{uuid.uuid4().hex[:8].upper()}" for _ in range(100)]
        batch_to_product = {bid: random.choice(product_ids) for bid in batch_ids}
    else:
        batch_ids = batches_df['batch_id'].tolist()
        if 'product_id' in batches_df.columns:
            batch_to_product = dict(zip(batches_df['batch_id'], batches_df['product_id']))
        else:
            batch_to_product = {bid: random.choice(product_ids) for bid in batch_ids}

    # Per-batch lookups are built once instead of filtering batches_df per record
    batch_work_order = (
        _first_value_by_batch(batches_df, 'work_order_id') if work_orders_df is not None else {}
    )
    batch_size = _first_value_by_batch(batches_df, 'batch_size')
    # Batch dates are only trusted when both the start and the end column exist
    if (batches_df is not None and 'actual_start_time' in batches_df.columns
            and 'actual_end_time' in batches_df.columns):
        batch_start = _first_value_by_batch(batches_df, 'actual_start_time')
        batch_end = _first_value_by_batch(batches_df, 'actual_end_time')
    else:
        batch_start, batch_end = {}, {}

    # Known per-unit base costs; missing ones are drawn lazily (once per product)
    product_base_costs = {}
    if products_df is not None and len(products_df) > 0 and 'base_cost' in products_df.columns:
        for pid, cost in zip(products_df['product_id'], products_df['base_cost']):
            if pd.notna(cost):
                product_base_costs[pid] = cost

    cogs_types = list(_COST_MULTIPLIER_RANGES)
    product_period_types = list(_PERIOD_LENGTH_DAYS)

    records = []
    for _ in range(num_cogs):
        is_batch_level = random.random() < 0.7  # 70% batch-level, 30% product-level

        if is_batch_level:
            batch_id = random.choice(batch_ids)
            if batch_id in batch_to_product:
                product_id = batch_to_product[batch_id]
            else:
                product_id = random.choice(product_ids)

            linked_work_order = batch_work_order.get(batch_id)
            work_order_id = linked_work_order if pd.notna(linked_work_order) else ""

            period_type = "Batch"

            # Prefer the batch's own dates; fall back to a random span inside
            # the requested window when they are missing or not available
            period_start = period_end = None
            if batch_id in batch_start:
                period_start = pd.to_datetime(batch_start[batch_id])
                period_end = pd.to_datetime(batch_end[batch_id])
            if pd.isna(period_start) or pd.isna(period_end):
                period_start = _random_day_in_window(start_time, window_days)
                period_end = period_start + timedelta(days=random.randint(1, 30))

            # Units produced is the batch size when known
            units_produced = batch_size.get(batch_id)
            if pd.isna(units_produced):
                units_produced = random.randint(50, 10000)
        else:
            # Product-level records are not tied to a batch or work order
            product_id = random.choice(product_ids)
            batch_id = ""
            work_order_id = ""

            period_type = random.choice(product_period_types)
            period_start = _random_day_in_window(start_time, window_days)
            period_end = period_start + timedelta(days=_PERIOD_LENGTH_DAYS[period_type])

            # Product-level volumes are typically higher than a single batch
            units_produced = random.randint(1000, 100000)

        cogs_type = random.choice(cogs_types)

        product_family = product_families.get(product_id, _DEFAULT_FAMILY)
        profile = _COSTING_PROFILES.get(product_family, _COSTING_PROFILES[_DEFAULT_FAMILY])

        # One stable base cost per product, drawn on first use when unknown
        if product_id not in product_base_costs:
            product_base_costs[product_id] = _synthetic_base_cost(product_family)
        base_cost = product_base_costs[product_id]

        low, high = _COST_MULTIPLIER_RANGES[cogs_type]
        target_cost = base_cost * units_produced * random.uniform(low, high)

        # Each component takes an independently drawn share of the target cost
        component_costs = {
            column: round(target_cost * random.uniform(*profile[key]), 2)
            for key, column in _COST_COMPONENTS
        }

        # Sum the rounded components so the stored total matches the stored parts
        total_cogs = round(sum(component_costs.values()), 2)
        cost_per_unit = total_cogs / units_produced if units_produced > 0 else 0

        # Calculation happens a few days after the period closes
        calc_date = period_end + timedelta(days=random.randint(1, 5))

        if random.random() < 0.3:  # 30% chance of having notes
            notes = random.choice([
                f"Standard costing based on {product_family} category averages",
                "Includes material price variance adjustment",
                "Labor costs higher due to overtime",
                "Overhead allocation based on machine hours",
                f"Cost reconciliation for {period_type} period",
                "Adjusted for yield loss",
                "Includes rework costs",
                "Based on actual consumption data",
                "Preliminary calculation pending final QC review",
                "Includes expedited shipping costs"
            ])
        else:
            notes = ""

        record = {
            "cogs_id": f"COGS-{uuid.uuid4().hex[:8].upper()}",
            "product_id": product_id,
            "batch_id": batch_id,
            "work_order_id": work_order_id,
            "period_type": period_type,
            "period_start_date": period_start.strftime("%Y-%m-%d"),
            "period_end_date": period_end.strftime("%Y-%m-%d"),
            "cogs_type": cogs_type,
            "total_cogs": total_cogs,
            "units_produced": units_produced,
            "cost_per_unit": round(cost_per_unit, 2),
            "currency": random.choices(_CURRENCIES, weights=_CURRENCY_WEIGHTS)[0],
            "calculation_date": calc_date.strftime("%Y-%m-%d"),
            "notes": notes,
        }
        record.update(component_costs)
        records.append(record)

    # Create DataFrame (explicit columns keep the order, also for zero records)
    df = pd.DataFrame(records, columns=_COGS_COLUMNS)

    # Ensure the directory exists
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Successfully generated {len(df)} COGS records.")
    print(f"Data saved to {output_file}")

    return df

def display_cogs_statistics(cogs_df, products_df=None, batches_df=None):
    """
    Display basic statistics about the generated COGS data

    All statistics are printed to stdout. The function works on a private
    copy, so cogs_df is left exactly as it was passed in.

    Parameters:
    - cogs_df: DataFrame containing COGS data
    - products_df: DataFrame containing products data (optional). When it has
      'product_id' and 'product_name' columns, product names are shown instead
      of product IDs.
    - batches_df: DataFrame containing batches data (optional, currently not used)
    """
    if cogs_df is None or len(cogs_df) == 0:
        print("No COGS data to analyze.")
        return

    # Helper columns are added to a copy so the caller's DataFrame is not modified
    stats_df = cogs_df.copy()
    record_count = len(stats_df)

    def percent_of(part, whole):
        # Guarded share so an all-zero total prints 0.0% instead of inf/nan
        return part / whole * 100 if whole > 0 else 0

    def print_distribution(column):
        for value, count in stats_df[column].value_counts().items():
            print(f"  {value}: {count} ({percent_of(count, record_count):.1f}%)")

    print("\nCOGS Statistics:")
    print(f"Total COGS records: {record_count}")

    # COGS type distribution
    print("\nCOGS Type Distribution:")
    print_distribution('cogs_type')

    # Period type distribution
    print("\nPeriod Type Distribution:")
    print_distribution('period_type')

    # Convert cost fields to numeric (values that cannot be parsed become NaN)
    numeric_fields = ['direct_materials_cost', 'direct_labor_cost', 'manufacturing_overhead_cost',
                     'packaging_cost', 'quality_cost', 'other_cost', 'total_cogs',
                     'units_produced', 'cost_per_unit']

    for field in numeric_fields:
        stats_df[f'{field}_numeric'] = pd.to_numeric(stats_df[field], errors='coerce')

    total_cost = stats_df['total_cogs_numeric'].sum()

    # Total COGS statistics
    print("\nTotal COGS Statistics:")
    print(f"  Total COGS amount: ${total_cost:,.2f}")
    print(f"  Average COGS per record: ${stats_df['total_cogs_numeric'].mean():,.2f}")
    print(f"  Minimum COGS: ${stats_df['total_cogs_numeric'].min():,.2f}")
    print(f"  Maximum COGS: ${stats_df['total_cogs_numeric'].max():,.2f}")

    # Cost component analysis
    print("\nCost Component Analysis:")
    components = [
        ('Direct Materials', 'direct_materials_cost_numeric'),
        ('Direct Labor', 'direct_labor_cost_numeric'),
        ('Manufacturing Overhead', 'manufacturing_overhead_cost_numeric'),
        ('Packaging', 'packaging_cost_numeric'),
        ('Quality', 'quality_cost_numeric'),
        ('Other', 'other_cost_numeric')
    ]

    for component_name, component_field in components:
        component_sum = stats_df[component_field].sum()
        component_percent = percent_of(component_sum, total_cost)
        print(f"  {component_name}: ${component_sum:,.2f} ({component_percent:.1f}% of total)")

    # Unit cost statistics
    print("\nUnit Cost Statistics:")
    print(f"  Total units produced: {stats_df['units_produced_numeric'].sum():,.0f}")
    print(f"  Average cost per unit: ${stats_df['cost_per_unit_numeric'].mean():,.2f}")
    print(f"  Minimum cost per unit: ${stats_df['cost_per_unit_numeric'].min():,.2f}")
    print(f"  Maximum cost per unit: ${stats_df['cost_per_unit_numeric'].max():,.2f}")

    # Currency distribution
    print("\nCurrency Distribution:")
    print_distribution('currency')

    # Product labels: names when the products table provides them, IDs otherwise
    if (products_df is not None and 'product_id' in products_df.columns
            and 'product_name' in products_df.columns):
        product_names = dict(zip(products_df['product_id'], products_df['product_name']))
    else:
        product_names = {}

    # Product-level analysis
    product_counts = stats_df.groupby('product_id').size().sort_values(ascending=False)

    print("\nTop 10 Products by COGS Record Count:")
    for product_id, count in product_counts.head(10).items():
        label = product_names.get(product_id, product_id)
        print(f"  {label}: {count} records ({percent_of(count, record_count):.1f}%)")

    # Product cost analysis
    product_costs = stats_df.groupby('product_id')['total_cogs_numeric'].sum().sort_values(ascending=False)

    print("\nTop 10 Products by Total COGS:")
    for product_id, cost in product_costs.head(10).items():
        label = product_names.get(product_id, product_id)
        print(f"  {label}: ${cost:,.2f} ({percent_of(cost, total_cost):.1f}% of total)")

    # Batch-level analysis. A missing batch_id may be "" (fresh data) or NaN
    # (after a CSV round trip); neither counts as a batch-level record.
    has_batch = stats_df['batch_id'].notna() & (stats_df['batch_id'] != "")
    batch_cogs = stats_df[has_batch]
    print(f"\nBatch-level COGS: {len(batch_cogs)} records ({percent_of(len(batch_cogs), record_count):.1f}%)")

    if len(batch_cogs) > 0:
        batch_costs = batch_cogs.groupby('batch_id')['total_cogs_numeric'].sum().sort_values(ascending=False)

        print("\nTop 10 Batches by Total COGS:")
        for batch_id, cost in batch_costs.head(10).items():
            print(f"  {batch_id}: ${cost:,.2f}")

    # Time-based analysis: group by the month in which each period ends
    stats_df['month'] = pd.to_datetime(stats_df['period_end_date']).dt.to_period('M')
    monthly_cogs = stats_df.groupby('month')['total_cogs_numeric'].sum()

    print("\nCOGS by Month (latest 6 months):")
    for month, cost in monthly_cogs.tail(6).items():
        print(f"  {month}: ${cost:,.2f}")

    # COGS type comparison
    print("\nCOGS by Type:")
    by_type = stats_df.groupby('cogs_type')
    type_costs = by_type['total_cogs_numeric'].sum()
    type_unit_costs = by_type['cost_per_unit_numeric'].mean()
    for cogs_type, cost in type_costs.items():
        print(f"  {cogs_type}: ${cost:,.2f} ({percent_of(cost, total_cost):.1f}% of total)")

        # Average unit cost by type
        avg_unit_cost = type_unit_costs[cogs_type]
        print(f"    Average cost per unit: ${avg_unit_cost:.2f}")

if __name__ == "__main__":
    # Read every upstream table; each loader returns None when its CSV is missing
    products_df = load_products_data()
    batches_df = load_batches_data()
    work_orders_df = load_work_orders_data()
    costs_df = load_costs_data()
    material_consumption_df = load_material_consumption_data()

    # Build the COGS table from whatever inputs were found and save it
    cogs_df = generate_cogs_data(
        products_df,
        batches_df,
        work_orders_df,
        costs_df,
        material_consumption_df,
        num_cogs=200,
        output_file="data/cogs.csv",
    )

    if cogs_df is not None:
        # Summary statistics first, then a peek at the first rows
        display_cogs_statistics(cogs_df, products_df, batches_df)

        print("\nSample COGS data (first 5 records):")
        print(cogs_df.head())

Successfully generated 200 COGS records.
Data saved to data/cogs.csv

COGS Statistics:
Total COGS records: 200

COGS Type Distribution:
  Standard: 67 (33.5%)
  Variance: 67 (33.5%)
  Actual: 66 (33.0%)

Period Type Distribution:
  Batch: 137 (68.5%)
  Annual: 23 (11.5%)
  Quarterly: 21 (10.5%)
  Monthly: 19 (9.5%)

Total COGS Statistics:
  Total COGS amount: $1,148,455,351.16
  Average COGS per record: $5,742,276.76
  Minimum COGS: $1,596.01
  Maximum COGS: $112,811,648.72

Cost Component Analysis:
  Direct Materials: $608,608,626.41 (53.0% of total)
  Direct Labor: $159,604,116.57 (13.9% of total)
  Manufacturing Overhead: $161,009,642.29 (14.0% of total)
  Packaging: $60,511,026.34 (5.3% of total)
  Quality: $84,949,501.82 (7.4% of total)
  Other: $73,772,437.73 (6.4% of total)

Unit Cost Statistics:
  Total units produced: 3,060,565
  Average cost per unit: $474.71
  Minimum cost per unit: $5.69
  Maximum cost per unit: $1,185.06

Currency Distribution:
  USD: 131 (65.5%)
  EUR: 26

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
