# Generating Inventory Data

In [1]:
import numpy as np
import pandas as pd

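Plan: take the original LineItemSales and add Date and LocationID from OrderInfo. Do that here, concatenate the result with LineItemSales_NEW, then change the start year in the function to 2017 and let the threshold be the starting stock for each location–item pair (TODO: confirm the threshold choice; the original note was left unfinished).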

In [3]:
# Original line items carry no date or location; both live on the order record.
lineitemsorig_df = pd.read_csv('./data_original/LineItemSales.csv')
orders_df = pd.read_excel('./data_original/OrderInfo.xlsx')

# Only the order date and location are needed on each line item.
order_small = orders_df[['OrderID', 'Date', 'LocationID']].copy()

lineitems_with_date_df = lineitemsorig_df.merge(
    order_small,
    on='OrderID',
    how='left',
    validate='many_to_one'  # many line items per order; one matching order row
)

# A left merge leaves NaN in Date/LocationID for any line item whose OrderID has
# no matching order row, so fail loudly instead of relying on a visual check.
missing_order_info = lineitems_with_date_df[['Date', 'LocationID']].isna().any(axis=1).sum()
assert missing_order_info == 0, f"{missing_order_info} line items have no matching order row"

# Per-column null counts for reference (only the merged-in columns are asserted above).
lineitems_with_date_df.isna().sum()

LineItemID        0
OrderID           0
ItemID            0
Qty               0
DiscountID    76495
Date              0
LocationID        0
dtype: int64

In [4]:
# Newer line items already include Date and LocationID, so no order merge is needed.
lineitemsnew_df = pd.read_csv(
    'data_new/LineItemSales_NEW.csv',
)
lineitemsnew_df

Unnamed: 0,LineItemID,OrderID,ItemID,Qty,DiscountID,Date,LocationID
0,80142,29374,13,1,,2022-01-01,L03
1,80143,29374,50,1,,2022-01-01,L03
2,80144,29374,45,1,,2022-01-01,L03
3,80145,29374,29,1,,2022-01-01,L03
4,80146,29611,51,1,D1,2022-01-01,L15
...,...,...,...,...,...,...,...
9559,89701,32256,31,1,,2025-12-31,L02
9560,89702,32256,44,1,,2025-12-31,L02
9561,89703,32032,47,1,,2025-12-31,L13
9562,89704,32032,16,1,,2025-12-31,L13


In [5]:
# Stack the original (date-enriched) and new line items into one frame with a fresh index.
lineitem_full_df = pd.concat(
    [lineitems_with_date_df, lineitemsnew_df],
    ignore_index=True,
)

# Row counts before and after, to confirm nothing was dropped or duplicated.
print(f"Old rows: {len(lineitems_with_date_df)}")
print(f"New rows: {len(lineitemsnew_df)}")
print(f"Combined rows: {len(lineitem_full_df)}")

Old rows: 80142
New rows: 9564
Combined rows: 89706


categories included in inventory:  BI, AC, WE, SA
categories not included in inventory: SV, EX (track sales/orders) 

I need to create a dataframe with these columns 
- Month (YYYY-MM-01)
- LocationID
- ItemID
- BeginningOnHand
- PurchasedQty (received from vendors)
- SoldQty
- AdjustmentsQty (shrink, damage, count corrections; can be negative)


Cover all years between y1 and y2 (e.g., 2020-2025)
For each month, I need rows for every location that exists at that time and for every item.
Cover only the locations available in each month (some stores opened later, e.g., in 2024).
Here is the establishment year for each location:
L01    2017
L02    2018
L03    2018
L04    2019
L05    2019
L06    2020
L07    2020
L08    2020
L09    2021
L10    2021
L11    2021
L12    2022
L13    2023
L14    2024
L15    2025

PurchasedQty should usually “top up” if stock is low. Assume a good value and purchase up to threshold if stock falls below 20% of threshold. 

SoldQty can be obtained by grouping the line-item dataframe by LocationID, ItemID, and date and summing Qty. The line-item dataframe looks like this:

LineItemID,OrderID,ItemID,Qty,DiscountID,Date,LocationID
80142,29374,13,1,,2022-01-01,L03
80143,29374,50,1,,2022-01-01,L03
80144,29374,45,1,,2022-01-01,L03
80145,29374,29,1,,2022-01-01,L03
80146,29611,51,1,D1,2022-01-01,L15
80147,29611,4,1,,2022-01-01,L15
80148,29611,12,1,,2022-01-01,L15
80149,28952,55,1,,2022-01-01,L08
80150,28952,52,1,,2022-01-01,L08
80151,28952,18,1,D3,2022-01-01,L08
80152,29521,6,1,,2022-01-02,L13
80153,28947,55,1,D3,2022-01-02,L11
80154,28947,54,1,,2022-01-02,L11
80155,28947,27,1,,2022-01-02,L11
80156,28947,35,1,,2022-01-02,L11
80157,28873,47,1,,2022-01-02,L05


ItemID ranges from 1 to 64.

In [6]:

# Year each location was established, keyed by LocationID.
location_years = dict(
    L01=2017,
    L02=2018,
    L03=2018,
    L04=2019,
    L05=2019,
    L06=2020,
    L07=2020,
    L08=2020,
    L09=2021,
    L10=2021,
    L11=2021,
    L12=2022,
    L13=2023,
    L14=2024,
    L15=2025,
)

In [7]:
def generate_inventory_dataframe(line_items_df, y1=2017, y2=2025, num_items=64,
                                 location_open_years=None):
    """
    Generate a monthly inventory dataframe with simulated stock movements.

    One row is produced per (month, open location, item). SoldQty comes from the
    line items; BeginningOnHand, PurchasedQty and AdjustmentsQty are simulated
    with a fixed seed, so repeated calls with the same inputs return the same table.

    Parameters:
    - line_items_df: DataFrame with columns [LineItemID, OrderID, ItemID, Qty, Date, LocationID].
      It is not modified.
    - y1: Start year (inclusive, January)
    - y2: End year (inclusive, December)
    - num_items: Total number of items (ItemID runs from 1 to num_items)
    - location_open_years: Optional dict mapping LocationID -> establishment year.
      Defaults to the notebook-level `location_years` mapping.

    Returns:
    - DataFrame with columns [Month, LocationID, ItemID, BeginningOnHand,
      PurchasedQty, SoldQty, AdjustmentsQty], sorted by LocationID, ItemID, Month.
      Month is a 'YYYY-MM-01' string; the remaining numeric columns are ints.

    Notes:
    - A location is included from January of its establishment year onward.
    - Sales recorded for a location before its establishment year (or outside
      y1..y2) have no matching row and are therefore not represented.
    """
    output_columns = ['Month', 'LocationID', 'ItemID', 'BeginningOnHand',
                      'PurchasedQty', 'SoldQty', 'AdjustmentsQty']

    if location_open_years is None:
        # Fall back to the mapping defined in an earlier notebook cell.
        location_open_years = location_years

    # Aggregate on a narrow copy so the caller's dataframe is left untouched.
    sales = line_items_df[['Date', 'LocationID', 'ItemID', 'Qty']].copy()
    sales['Month'] = pd.to_datetime(sales['Date']).dt.to_period('M').dt.to_timestamp()
    sales_summary = (
        sales.groupby(['Month', 'LocationID', 'ItemID'])['Qty']
        .sum()
        .reset_index(name='SoldQty')
    )

    # One record per month-start, per location already established that year, per item.
    date_range = pd.date_range(start=f'{y1}-01-01', end=f'{y2}-12-01', freq='MS')
    records = [
        {'Month': month_start, 'LocationID': location, 'ItemID': item_id}
        for month_start in date_range
        for location, est_year in location_open_years.items()
        if est_year <= month_start.year
        for item_id in range(1, num_items + 1)
    ]
    if not records:
        # Nothing is open in the requested range: return an empty, well-formed table.
        return pd.DataFrame(columns=output_columns)

    inventory_df = pd.DataFrame(records).merge(
        sales_summary,
        on=['Month', 'LocationID', 'ItemID'],
        how='left',
    )
    inventory_df['SoldQty'] = inventory_df['SoldQty'].fillna(0).astype(int)

    # Sorting makes each location-item history contiguous and chronological,
    # which lets the simulation below carry state forward in a single pass.
    inventory_df = inventory_df.sort_values(['LocationID', 'ItemID', 'Month']).reset_index(drop=True)

    # A private RandomState seeded with 42 draws the same stream as the legacy
    # global generator would after np.random.seed(42), without reseeding the
    # global generator for the rest of the notebook.
    rng = np.random.RandomState(42)

    # Per-item stock threshold (shared across locations), drawn from [30, 100).
    item_thresholds = {item_id: rng.randint(30, 100) for item_id in range(1, num_items + 1)}

    # Adjustments are small and skewed negative (shrink/damage outweigh corrections).
    adjustment_values = [-5, -4, -3, -2, -1, 0, 1, 2]
    adjustment_probs = [0.1, 0.15, 0.2, 0.25, 0.15, 0.05, 0.05, 0.05]

    locations = inventory_df['LocationID'].to_numpy()
    item_ids = inventory_df['ItemID'].to_numpy()
    sold = inventory_df['SoldQty'].to_numpy()

    n_rows = len(inventory_df)
    beginning = np.zeros(n_rows, dtype=np.int64)
    purchased = np.zeros(n_rows, dtype=np.int64)
    adjustments = np.zeros(n_rows, dtype=np.int64)

    prev_key = None
    prev_ending = 0
    for i in range(n_rows):
        key = (locations[i], item_ids[i])
        threshold = item_thresholds[int(item_ids[i])]

        if key != prev_key:
            # First month for this location-item pair: start fully stocked.
            begin = threshold
        else:
            # Carry over last month's ending stock; it cannot go below zero.
            begin = max(0, prev_ending)

        # Top up to the threshold when stock has fallen below 20% of it.
        purchase = threshold - begin if begin < threshold * 0.2 else 0

        # 15% chance of an adjustment in any given month.
        adjustment = 0
        if rng.random_sample() < 0.15:
            adjustment = rng.choice(adjustment_values, p=adjustment_probs)

        beginning[i] = begin
        purchased[i] = purchase
        adjustments[i] = adjustment

        prev_ending = begin + purchase - sold[i] + adjustment
        prev_key = key

    inventory_df['BeginningOnHand'] = beginning
    inventory_df['PurchasedQty'] = purchased
    inventory_df['AdjustmentsQty'] = adjustments

    # Convert Month to string format YYYY-MM-01
    inventory_df['Month'] = inventory_df['Month'].dt.strftime('%Y-%m-%d')

    # Fix the column order and make every numeric column a plain int
    inventory_df = inventory_df[output_columns].astype({
        'ItemID': int,
        'BeginningOnHand': int,
        'PurchasedQty': int,
        'SoldQty': int,
        'AdjustmentsQty': int,
    })

    return inventory_df


# Build the full inventory table from the combined line items
if __name__ == "__main__":
    # Pass a copy so the combined line-item frame from the earlier cell stays as loaded
    line_items_df = lineitem_full_df.copy()

    # Generate inventory dataframe
    inventory_df = generate_inventory_dataframe(line_items_df, y1=2017, y2=2025, num_items=64)

    # Display sample
    print("Inventory DataFrame Sample:")
    print(inventory_df.head(20))
    print(f"\nTotal rows: {len(inventory_df)}")
    print("\nDataFrame info:")
    # info() writes its report itself and returns None, so it must not be wrapped in print()
    inventory_df.info()

    # Totals per location as a quick sanity check on the simulated movements
    print("\n\nSummary by Location:")
    print(inventory_df.groupby('LocationID').agg({
        'SoldQty': 'sum',
        'PurchasedQty': 'sum',
        'AdjustmentsQty': 'sum'
    }))
    
    

Inventory DataFrame Sample:
         Month LocationID  ItemID  BeginningOnHand  PurchasedQty  SoldQty  \
0   2020-01-01        L01       1               81             0        2   
1   2020-02-01        L01       1               79             0        1   
2   2020-03-01        L01       1               78             0        3   
3   2020-04-01        L01       1               75             0        7   
4   2020-05-01        L01       1               68             0        2   
5   2020-06-01        L01       1               66             0        3   
6   2020-07-01        L01       1               63             0        9   
7   2020-08-01        L01       1               54             0        7   
8   2020-09-01        L01       1               46             0        3   
9   2020-10-01        L01       1               45             0        1   
10  2020-11-01        L01       1               44             0        2   
11  2020-12-01        L01       1               

In [8]:
# Save to CSV; the confirmation message reports the path that was actually written
inventory_output_path = './data_new/FULL_Inventory.csv'
inventory_df.to_csv(inventory_output_path, index=False)
print(f"\n\nInventory dataframe saved to {inventory_output_path}")



Inventory dataframe saved to inventory_dataframe.csv
