# Generate Line Item Sales

In [1]:
import pandas as pd
import numpy as np

In [2]:
lineitem_df = pd.read_csv('data_original/LineItemSales.csv')
lineitem_df.head()

Unnamed: 0,LineItemID,OrderID,ItemID,Qty,DiscountID
0,1,1,20,1,D3
1,2,1,2,1,
2,3,1,53,1,
3,4,2,18,1,
4,5,2,41,1,


In [3]:
lineitem_df.columns.to_list()

['LineItemID', 'OrderID', 'ItemID', 'Qty', 'DiscountID']

In [4]:
orders_df = pd.read_excel('data_new/OrderInfo_NEW.xlsx')
orders_df

Unnamed: 0,CustomerID,LocationID,Date,Time,EmployeeID,OrderID
0,57520,L03,2022-01-01,10:27:31,20,29374
1,57757,L15,2022-01-01,14:58:33,56,29611
2,57098,L08,2022-01-01,15:31:06,45,28952
3,57667,L13,2022-01-02,18:13:23,57,29521
4,57093,L11,2022-01-02,16:22:58,54,28947
...,...,...,...,...,...,...
3834,60256,L02,2025-12-31,11:20:47,8,32110
3835,59781,L07,2025-12-31,14:50:01,40,31635
3836,59857,L11,2025-12-31,12:06:26,65,31711
3837,60402,L02,2025-12-31,17:03:27,79,32256


In [5]:
# FOr each orderID, need to generate 1-3 line item sales rows. 
# Each line item sales row will have ItemID, Quantity, OrderID, lineItemID, Discount id

# ItemID = between 1-61 (54-61 will be much less common)
# Quantity = For each item, quantity can be between 1-4. But for itemIDs 1-21, quantity should strictly be 1
# OrderID is the orderId for which we are generating the line item rows rn. 
# LineItem ID (grows in seqience from 1
# DiscountID: D1 2%, D2 2%, D3 1% and remaining 95% of the time, no value

In [6]:
np.random.seed(1234)

# Initialize list to store line items
line_item_id = int(lineitem_df["LineItemID"].max())
line_item_id

80142

In [7]:
line_items = []
for _, order in orders_df.iterrows():
    order_id = order['OrderID']
    date = order['Date']
    location = order['LocationID']
    
    # Generate 1-3 line items per order
    num_line_items = np.random.randint(1, 5)
    
    # Track items already added to this order to avoid duplicates
    items_in_order = set()
    
    for _ in range(num_line_items):
        # Generate ItemID with weighted probability
        # Items 1-53 are common, 54-61 are much less common
        if np.random.random() < 0.90:  # 90% chance for items 1-53
            item_id = np.random.randint(1, 54)
        else:  # 10% chance for items 54-61
            item_id = np.random.randint(54, 62)
        
        # Avoid duplicate items in the same order
        while item_id in items_in_order:
            if np.random.random() < 0.90:
                item_id = np.random.randint(1, 54)
            else:
                item_id = np.random.randint(54, 62)
        
        items_in_order.add(item_id)
        
        # Generate Quantity based on ItemID
        if 1 <= item_id <= 21:
            quantity = 1  # Strictly 1 for items 1-21
        else:
            if np.random.random() < 0.98:
                quantity = 1
            else:    
                quantity = np.random.randint(2, 5)  # 1-4 for other items
        
        # Generate DiscountID with specified probabilities
        discount_rand = np.random.random()
        if discount_rand < 0.02:  # 2% chance
            discount_id = 'D1'
        elif discount_rand < 0.04:  # 2% chance
            discount_id = 'D2'
        elif discount_rand < 0.05:  # 1% chance
            discount_id = 'D3'
        else:  # 95% chance
            discount_id = None
        
        # Create line item
        line_items.append({
            'LineItemID': line_item_id,
            'OrderID': order_id,
            'ItemID': item_id,
            'Qty': quantity,
            'DiscountID': discount_id,
            'Date': date,
            'LocationID': location
        })

        
        line_item_id += 1

# Create DataFrame
line_items_df = pd.DataFrame(line_items)

# Display results
print(f"Generated {len(line_items_df)} line items for {len(orders_df)} orders")
print("\nFirst 20 line items:")
print(line_items_df.head(20))

# Save to CSV
line_items_df.to_csv('line_items.csv', index=False)
print("\nLine items saved to 'line_items.csv'")

# Show summary statistics
print("\nSummary Statistics:")
print(f"Total line items: {len(line_items_df)}")
print(f"Average line items per order: {len(line_items_df)/len(orders_df):.2f}")
print(f"\nItemID distribution:")
print(f"  Items 1-53: {len(line_items_df[line_items_df['ItemID'] <= 53])}")
print(f"  Items 54-61: {len(line_items_df[line_items_df['ItemID'] >= 54])}")
print(f"\nDiscount distribution:")
print(line_items_df['DiscountID'].value_counts(dropna=False))

Generated 9564 line items for 3839 orders

First 20 line items:
    LineItemID  OrderID  ItemID  Qty DiscountID       Date LocationID
0        80142    29374      13    1       None 2022-01-01        L03
1        80143    29374      50    1       None 2022-01-01        L03
2        80144    29374      45    1       None 2022-01-01        L03
3        80145    29374      29    1       None 2022-01-01        L03
4        80146    29611      51    1         D1 2022-01-01        L15
5        80147    29611       4    1       None 2022-01-01        L15
6        80148    29611      12    1       None 2022-01-01        L15
7        80149    28952      55    1       None 2022-01-01        L08
8        80150    28952      52    1       None 2022-01-01        L08
9        80151    28952      18    1         D3 2022-01-01        L08
10       80152    29521       6    1       None 2022-01-02        L13
11       80153    28947      55    1         D3 2022-01-02        L11
12       80154    28947   

In [8]:
line_items_df.drop_duplicates(subset=["LineItemID", "OrderID"]) \
    .to_csv('./data_new/LineItemSales_NEW.csv', index=False)