# Generating Inventory Data

In [3]:
import numpy as np
import pandas as pd

In [4]:
items_ace_df = pd.read_excel('./data_original/Ace_Bikes_Data.xlsx', 
                                usecols=['ItemID', 'Price', 'Cost', 'Description', 'CategoryID', 'VendorID'])

items_ace_df

Unnamed: 0,ItemID,Price,Cost,Description,CategoryID,VendorID
0,1.0,499.99,225.0,Ladies-Cruiser Classic-Black,BI,V1
1,2.0,629.99,270.0,Mens-Cruiser Classic-Black,BI,V2
2,3.0,629.99,270.0,Mens-Cruiser Classic-Blue,BI,V2
3,4.0,499.99,225.0,Ladies-Cruiser Classic-Purple,BI,V3
4,5.0,599.99,270.0,Ladies-Cruiser Stretch-Orange,BI,V3
...,...,...,...,...,...,...
3534,,,,,,
3535,,,,,,
3536,,,,,,
3537,,,,,,


plan: 
create new cols: 
- Month (YYYY-MM-01)
- LocationID
- ItemID
- BeginningOnHand
- PurchasedQty (received from vendors)
- SoldQty
- AdjustmentsQty (shrink, damage, count corrections; can be negative)
- EndingOnHand (= BeginningOnHand + PurchasedQty - SoldQty + AdjustmentsQty)

categories included in inventory:  BI, AC, WE, SA
categories not included in inventory: SV, EX (track sales/orders) 

In [5]:
class InventoryGenerator:
    """
    Generates monthly inventory snapshots for retail locations.
    Simulates realistic inventory movements: purchases, sales, adjustments.
    """
    
    def __init__(self, items_df: pd.DataFrame):
        """
        Initialize with items data.
        
        Args:
            items_df: DataFrame with columns [ItemID, CategoryID, ...]
        """
        self.items_df = items_df
        
        # Categories that hold physical inventory
        self.inventory_categories = {"BI", "AC", "WE", "SA"}
        
        # Filter to only stockable items
        self.stockable_items = items_df[
            items_df["CategoryID"].isin(self.inventory_categories)
        ].copy()
        
        # Store opening dates
        self.store_openings = {
            "L01": "2017-01-01",
            "L02": "2018-01-01",
            "L03": "2018-01-01",
            "L04": "2019-01-01",
            "L05": "2019-01-01",
            "L06": "2020-01-01",
            "L07": "2020-01-01",
            "L08": "2020-01-01",
            "L09": "2021-01-01",
            "L10": "2021-01-01",
            "L11": "2021-01-01",
            "L12": "2022-01-01",
            "L13": "2023-01-01",
            "L14": "2024-01-01",
            "L15": "2025-01-01",
        }
        
        # Convert to timestamps
        self.store_start_dates = {
            loc: pd.Timestamp(date) 
            for loc, date in self.store_openings.items()
        }
        
        # Inventory behavior by category
        self.category_params = {
            "BI": {  # Bikes
                "initial_stock": (0, 5),
                "monthly_sales": (0, 3),
                "reorder_point": (0, 2),
                "reorder_qty": (1, 4),
                "reorder_probability": 0.3,  # Less frequent ordering
            },
            "AC": {  # Accessories
                "initial_stock": (10, 80),
                "monthly_sales": (0, 30),
                "reorder_point": (8, 20),
                "reorder_qty": (15, 50),
                "reorder_probability": 0.4,
            },
            "WE": {  # Wearables
                "initial_stock": (10, 80),
                "monthly_sales": (0, 30),
                "reorder_point": (8, 20),
                "reorder_qty": (15, 50),
                "reorder_probability": 0.4,
            },
            "SA": {  # Safety
                "initial_stock": (10, 80),
                "monthly_sales": (0, 30),
                "reorder_point": (8, 20),
                "reorder_qty": (15, 50),
                "reorder_probability": 0.4,
            },
        }
        
        # Track current inventory levels: {(location, item_id): quantity}
        self.current_inventory = {}
    
    def _initialize_inventory(self, location: str, item_id: int, category: str) -> int:
        """
        Set initial inventory when a store first opens or we first track an item.
        
        Returns:
            Initial quantity on hand
        """
        params = self.category_params[category]
        min_qty, max_qty = params["initial_stock"]
        return np.random.randint(min_qty, max_qty + 1)
    
    def _calculate_sales(self, category: str, on_hand: int) -> int:
        """
        Calculate monthly sales. Can't sell more than available inventory.
        
        Returns:
            Units sold this month
        """
        params = self.category_params[category]
        min_sales, max_sales = params["monthly_sales"]
        
        # Random sales within category range
        potential_sales = np.random.randint(min_sales, max_sales + 1)
        
        # Can't sell more than what's available
        return min(potential_sales, on_hand)
    
    def _calculate_purchase(self, category: str, current_stock: int) -> int:
        """
        Determine if we should purchase inventory and how much.
        Purchase when stock falls below reorder point.
        
        Returns:
            Units purchased this month
        """
        params = self.category_params[category]
        
        # Get reorder thresholds
        min_reorder_point, max_reorder_point = params["reorder_point"]
        min_reorder_qty, max_reorder_qty = params["reorder_qty"]
        
        reorder_point = np.random.randint(min_reorder_point, max_reorder_point + 1)
        
        # If below reorder point, purchase
        if current_stock < reorder_point:
            return np.random.randint(min_reorder_qty, max_reorder_qty + 1)
        
        # Otherwise, occasional small top-up
        if np.random.random() < params["reorder_probability"]:
            return np.random.randint(0, max_reorder_qty // 3 + 1)
        
        return 0
    
    def _calculate_adjustment(self) -> int:
        """
        Calculate inventory adjustments (shrink, damage, corrections).
        Mostly zero, occasionally negative, rarely positive.
        
        Returns:
            Adjustment quantity (can be negative)
        """
        # 70% chance of no adjustment
        # 20% chance of small shrink (-1 to -3)
        # 10% chance of correction (+1 to +2)
        
        rand = np.random.random()
        
        if rand < 0.70:
            return 0
        elif rand < 0.90:
            return np.random.randint(-3, 0)  # -3, -2, -1
        else:
            return np.random.randint(1, 3)  # 1, 2
    
    def generate_inventory(
        self, 
        start_date: str = "2017-01-01",
        end_date: str = "2025-12-01"
    ) -> pd.DataFrame:
        """
        Generate complete inventory snapshot data.
        
        Args:
            start_date: First month to generate (YYYY-MM-DD)
            end_date: Last month to generate (YYYY-MM-DD)
        
        Returns:
            DataFrame with monthly inventory snapshots
        """
        # Generate monthly date range
        months = pd.date_range(start=start_date, end=end_date, freq="MS")
        
        all_locations = list(self.store_start_dates.keys())
        
        records = []
        
        print(f"Generating inventory data from {start_date} to {end_date}...")
        print(f"Items: {len(self.stockable_items)}")
        print(f"Locations: {len(all_locations)}")
        print(f"Months: {len(months)}")
        
        # Process each month
        for month_idx, month in enumerate(months):
            if month_idx % 12 == 0:
                print(f"Processing year {month.year}...")
            
            # Process each location
            for location in all_locations:
                
                # Skip if store hasn't opened yet
                if month < self.store_start_dates[location]:
                    continue
                
                # Process each item
                for _, item_row in self.stockable_items.iterrows():
                    item_id = int(item_row["ItemID"])
                    category = item_row["CategoryID"]
                    
                    key = (location, item_id)
                    
                    # Initialize inventory if first time seeing this location + item
                    if key not in self.current_inventory:
                        self.current_inventory[key] = self._initialize_inventory(
                            location, item_id, category
                        )
                    
                    # Get beginning inventory
                    beginning = self.current_inventory[key]
                    
                    # Calculate purchases (before sales, to allow for restocking)
                    purchased = self._calculate_purchase(category, beginning)
                    
                    # Stock after receiving purchases
                    stock_after_purchase = beginning + purchased
                    
                    # Calculate sales (limited by available stock)
                    sold = self._calculate_sales(category, stock_after_purchase)
                    
                    # Calculate adjustments
                    adjustment = self._calculate_adjustment()
                    
                    # Calculate ending inventory
                    ending = stock_after_purchase - sold + adjustment
                    
                    # Ensure ending inventory doesn't go negative
                    if ending < 0:
                        # Reduce adjustment to prevent negative
                        adjustment = adjustment - ending
                        ending = 0
                    
                    # Update state for next month
                    self.current_inventory[key] = ending
                    
                    # Create record
                    records.append({
                        "Month": month.strftime("%Y-%m-01"),
                        "LocationID": location,
                        "ItemID": item_id,
                        "BeginningOnHand": beginning,
                        "PurchasedQty": purchased,
                        "SoldQty": sold,
                        "AdjustmentsQty": adjustment,
                        "EndingOnHand": ending,
                    })
        
        print(f"Generated {len(records):,} inventory records")
        
        return pd.DataFrame(records)

In [None]:
# Load items data
items_df = 

# Display first few rows
print("Items data loaded:")
print(f"Total items: {len(items_df)}")

items_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Items.csv'

In [None]:
# check category distribution
print("Category distribution:")
print(items_df['CategoryID'].value_counts().sort_index())

# Inventory vs non-inventory items
inventory_categories = {"BI", "AC", "WE", "SA"}
inventory_items = items_df[items_df['CategoryID'].isin(inventory_categories)]
print(f"\nInventory items (BI, AC, WE, SA): {len(inventory_items)}")
print(f"Non-inventory items (SV, EX): {len(items_df) - len(inventory_items)}")

In [None]:
# Create generator
generator = InventoryGenerator(items_df)

# Generate inventory snapshots
inventory_df = generator.generate_inventory(
    start_date="2017-01-01",
    end_date="2025-12-01"
)

print("\n✓ Inventory data generated successfully!")

In [None]:
# First records
inventory_df.head(10)

In [None]:
# Last records
inventory_df.head(10)

In [None]:
# data summary & validation
print("=" * 60)
print("INVENTORY DATA SUMMARY")
print("=" * 60)

print(f"\nTotal records: {len(inventory_df):,}")
print(f"Date range: {inventory_df['Month'].min()} to {inventory_df['Month'].max()}")

print("\n" + "-" * 60)
print("Records by Location:")
print("-" * 60)
print(inventory_df.groupby('LocationID').size().sort_index())

print("\n" + "-" * 60)
print("Records by Category:")
print("-" * 60)
print(inventory_df.groupby('CategoryID').size())

In [None]:
# Validate the ending inventory formula
inventory_df['Calculated_Ending'] = (
    inventory_df['BeginningOnHand'] + 
    inventory_df['PurchasedQty'] - 
    inventory_df['SoldQty'] + 
    inventory_df['AdjustmentsQty']
)

# Check if calculations match
matches = (inventory_df['EndingOnHand'] == inventory_df['Calculated_Ending']).all()

if matches:
    print("✓ Inventory calculations validated successfully!")
    print("  Formula: EndingOnHand = BeginningOnHand + PurchasedQty - SoldQty + AdjustmentsQty")
else:
    print("✗ ERROR: Inventory calculation mismatch detected!")
    mismatches = inventory_df[inventory_df['EndingOnHand'] != inventory_df['Calculated_Ending']]
    print(f"  Found {len(mismatches)} mismatched records")

# Drop the validation column
inventory_df = inventory_df.drop(columns=['Calculated_Ending'])

In [None]:
# Numeric column statistics
print("Statistical summary of inventory quantities:")
print("\n")
inventory_df[['BeginningOnHand', 'PurchasedQty', 'SoldQty', 'AdjustmentsQty', 'EndingOnHand']].describe()

In [None]:
# Check for any negative ending inventory (should be zero)
negative_inventory = inventory_df[inventory_df['EndingOnHand'] < 0]

if len(negative_inventory) == 0:
    print("✓ No negative inventory found - all values are valid!")
else:
    print(f"✗ WARNING: Found {len(negative_inventory)} records with negative ending inventory")
    print(negative_inventory.head())

In [None]:
# Show sample records for each category
for category in ['BI', 'AC', 'WE', 'SA']:
    print(f"\n{'='*60}")
    print(f"Sample records for {category}:")
    print('='*60)
    sample = inventory_df[inventory_df['CategoryID'] == category].head(3)
    display(sample)

In [None]:
print("Data Quality Checks:")
print("=" * 60)

# Check for null values
null_counts = inventory_df.isnull().sum()
if null_counts.sum() == 0:
    print("✓ No null values found")
else:
    print("✗ Null values found:")
    print(null_counts[null_counts > 0])

# Check for duplicate records (same month, location, item)
duplicates = inventory_df.duplicated(subset=['Month', 'LocationID', 'ItemID']).sum()
if duplicates == 0:
    print("✓ No duplicate records found")
else:
    print(f"✗ Found {duplicates} duplicate records")

# Verify date format
try:
    pd.to_datetime(inventory_df['Month'])
    print("✓ All dates are valid")
except:
    print("✗ Some dates are invalid")

# Check if all stores respect their opening dates
store_openings = {
    "L01": "2017-01-01", "L02": "2018-01-01", "L03": "2018-01-01",
    "L04": "2019-01-01", "L05": "2019-01-01", "L06": "2020-01-01",
    "L07": "2020-01-01", "L08": "2020-01-01", "L09": "2021-01-01",
    "L10": "2021-01-01", "L11": "2021-01-01", "L12": "2022-01-01",
    "L13": "2023-01-01", "L14": "2024-01-01", "L15": "2025-01-01",
}

violations = 0
for loc, opening_date in store_openings.items():
    early_records = inventory_df[
        (inventory_df['LocationID'] == loc) & 
        (inventory_df['Month'] < opening_date)
    ]
    if len(early_records) > 0:
        violations += len(early_records)

if violations == 0:
    print("✓ All stores respect their opening dates")
else:
    print(f"✗ Found {violations} records before store opening dates")

print("\n" + "=" * 60)
print("✓ All quality checks complete!")