In [1]:
import pandas as pd
# Load the raw grocery inventory/sales CSV (path is relative to the working directory).
df = pd.read_csv("Grocery_Inventory_and_Sales_Dataset.csv")
# Preview the first rows to confirm the columns loaded as expected.
df.head()



Unnamed: 0,Product_ID,Product_Name,Catagory,Supplier_ID,Supplier_Name,Stock_Quantity,Reorder_Level,Reorder_Quantity,Unit_Price,Date_Received,Last_Order_Date,Expiration_Date,Warehouse_Location,Sales_Volume,Inventory_Turnover_Rate,Status
0,29-205-1132,Sushi Rice,Grains & Pulses,38-037-1699,Jaxnation,22,72,70,$4.50,8/16/2024,6/29/2024,9/19/2024,48 Del Sol Trail,32,19,Discontinued
1,40-681-9981,Arabica Coffee,Beverages,54-470-2479,Feedmix,45,77,2,$20.00,11/1/2024,5/29/2024,5/8/2024,36 3rd Place,85,1,Discontinued
2,06-955-3428,Black Rice,Grains & Pulses,54-031-2945,Vinder,30,38,83,$6.00,8/3/2024,6/10/2024,9/22/2024,3296 Walton Court,31,34,Backordered
3,71-594-6552,Long Grain Rice,Grains & Pulses,63-492-7603,Brightbean,12,59,62,$1.50,12/8/2024,2/19/2025,4/17/2024,3 Westerfield Crossing,95,99,Active
4,57-437-1828,Plum,Fruits & Vegetables,54-226-4308,Topicstorm,37,30,74,$4.00,7/3/2024,10/11/2024,10/5/2024,15068 Scoville Court,62,25,Backordered


In [2]:
date_columns = ["Date_Received", "Last_Order_Date", "Expiration_Date"]

# Convert every date column to datetime; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
parsed_dates = {
    name: pd.to_datetime(df[name], errors='coerce') for name in date_columns
}
for name, parsed in parsed_dates.items():
    df[name] = parsed

df[["Date_Received", "Expiration_Date"]].head()


Unnamed: 0,Date_Received,Expiration_Date
0,2024-08-16,2024-09-19
1,2024-11-01,2024-05-08
2,2024-08-03,2024-09-22
3,2024-12-08,2024-04-17
4,2024-07-03,2024-10-05


In [3]:
# Signed number of whole days from the received date to the expiration date.
# The value is negative when the expiration date precedes the received date.
received_on = df["Date_Received"]
expires_on = df["Expiration_Date"]
df["days_to_expiry"] = expires_on.sub(received_on).dt.days
df["days_to_expiry"].describe()


count    990.000000
mean       0.185859
std      150.125993
min     -365.000000
25%     -106.750000
50%       -1.500000
75%      100.750000
max      344.000000
Name: days_to_expiry, dtype: float64

In [4]:
# Strip the currency symbol and thousands separators, then cast to float.
# The pattern is a raw string: in an ordinary string literal "\$" is an invalid
# escape sequence, which Python flags with a DeprecationWarning/SyntaxWarning.
df["Unit_Price"] = df["Unit_Price"].replace(r'[\$,]', '', regex=True).astype(float)
df["Unit_Price"].head()


0     4.5
1    20.0
2     6.0
3     1.5
4     4.0
Name: Unit_Price, dtype: float64

In [5]:
numerical_cols = [
    "Stock_Quantity",
    "Reorder_Level",
    "Reorder_Quantity",
    "Unit_Price",
    "Sales_Volume",
    "Inventory_Turnover_Rate",
    "days_to_expiry",
]

# Median-impute only the numeric columns that actually contain missing values.
numeric_with_gaps = [name for name in numerical_cols if df[name].isna().any()]
for name in numeric_with_gaps:
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

# Confirm that no missing values remain in the numeric columns.
df[numerical_cols].isna().sum()


Stock_Quantity             0
Reorder_Level              0
Reorder_Quantity           0
Unit_Price                 0
Sales_Volume               0
Inventory_Turnover_Rate    0
days_to_expiry             0
dtype: int64

In [6]:
categorical_cols = ["Status", "Catagory", "Product_Name", "Warehouse_Location"]

# Fill gaps in text columns with the most frequent value (first mode) of that column.
text_with_gaps = [name for name in categorical_cols if df[name].isna().any()]
for name in text_with_gaps:
    most_common = df[name].mode()[0]
    df[name] = df[name].fillna(most_common)

# Confirm that no missing values remain in the categorical columns.
df[categorical_cols].isna().sum()


Status                0
Catagory              0
Product_Name          0
Warehouse_Location    0
dtype: int64

In [7]:
# Add an integer category code next to each original text column
# (in the preview below: Active=0, Backordered=1, Discontinued=2).
for source_col in ("Status", "Catagory"):
    as_category = df[source_col].astype("category")
    df[f"{source_col}_encoded"] = as_category.cat.codes

df[["Status", "Status_encoded", "Catagory", "Catagory_encoded"]].head()


Unnamed: 0,Status,Status_encoded,Catagory,Catagory_encoded
0,Discontinued,2,Grains & Pulses,4
1,Discontinued,2,Beverages,1
2,Backordered,1,Grains & Pulses,4
3,Active,0,Grains & Pulses,4
4,Backordered,1,Fruits & Vegetables,3


In [8]:
# Identifier, free-text and raw date columns, plus the text columns that now
# have encoded counterparts, are excluded from the modelling frame.
drop_cols = [
    "Product_ID",
    "Product_Name",
    "Supplier_ID",
    "Supplier_Name",
    "Warehouse_Location",
    "Date_Received",
    "Expiration_Date",
    "Last_Order_Date",
    "Status",
    "Catagory",
]

features_only = df.drop(columns=drop_cols)

# Keep only rows whose expiration date is on or after the received date.
not_expired_on_arrival = features_only["days_to_expiry"] >= 0
df_clean = features_only.loc[not_expired_on_arrival]

df_clean["days_to_expiry"].min()


0

In [9]:
def remove_outliers_iqr(df, column):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where the quartiles are computed from ``df[column]`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows with their original index
        labels. Rows where ``column`` is NaN fail both comparisons and are
        therefore dropped.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    within_fences = (values >= lower) & (values <= upper)
    # Return an explicit copy so callers can add or modify columns on the
    # result without pandas raising SettingWithCopyWarning.
    return df.loc[within_fences].copy()

outlier_columns = [
    "Unit_Price",
    "Sales_Volume",
    "Inventory_Turnover_Rate",
    "days_to_expiry",
]

# Filter sequentially: each column's fences are computed on the rows that
# survived the filters for the columns before it.
for column_name in outlier_columns:
    df_clean = remove_outliers_iqr(df=df_clean, column=column_name)

df_clean[outlier_columns].describe()


Unnamed: 0,Unit_Price,Sales_Volume,Inventory_Turnover_Rate,days_to_expiry
count,462.0,462.0,462.0,462.0
mean,4.493398,59.352814,50.712121,120.352814
std,2.619244,22.726302,28.953202,87.08561
min,0.2,20.0,1.0,0.0
25%,2.5,41.0,24.25,49.0
50%,4.0,58.0,52.0,98.0
75%,6.0,80.0,75.0,189.0
max,12.0,100.0,100.0,344.0


In [10]:
# Report the remaining missing values per column, then persist the cleaned
# frame to CSV without the index column.
remaining_nulls = df_clean.isna().sum()
print(remaining_nulls)
df_clean.to_csv("final_cleaned_data.csv", index=False)


Stock_Quantity             0
Reorder_Level              0
Reorder_Quantity           0
Unit_Price                 0
Sales_Volume               0
Inventory_Turnover_Rate    0
days_to_expiry             0
Status_encoded             0
Catagory_encoded           0
dtype: int64


In [11]:
# Feature engineering code starts here

In [12]:
# Function to return discount factor based on days to expiry
def expiry_discount(days_left):
    """Map the number of days left to a price multiplier.

    Tiers are checked from the shortest window upwards and the first one that
    contains ``days_left`` wins:

    * up to 3 days   -> 0.5  (50% discount)
    * up to 7 days   -> 0.7  (30% discount)
    * up to 14 days  -> 0.85 (15% discount)
    * up to 30 days  -> 0.95 (5% discount)
    * anything else  -> 1.0  (no discount)
    """
    discount_tiers = (
        (3, 0.5),
        (7, 0.7),
        (14, 0.85),
        (30, 0.95),
    )
    for max_days, factor in discount_tiers:
        if days_left <= max_days:
            return factor
    # No tier matched: the product is not close enough to expiry for a discount.
    return 1.0
# Function to calculate dynamic price based on expiry and sales performance
def dynamic_final_price(row, median_sales=None):
    """Return the discounted price for a single product row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'days_to_expiry', 'Sales_Volume' and 'Unit_Price'.
    median_sales : float, optional
        Sales-volume threshold separating fast sellers from slow sellers.
        When omitted, the median of ``df_clean['Sales_Volume']`` is looked up
        on the global frame (the original behaviour); pass it explicitly to
        avoid recomputing that median for every row.

    Returns
    -------
    float
        ``Unit_Price`` multiplied by the adjusted expiry factor, which never
        exceeds 1.0.
    """
    if median_sales is None:
        median_sales = df_clean['Sales_Volume'].median()

    # Step 1: Get expiry factor
    expiry_factor = expiry_discount(row['days_to_expiry'])

    # Step 2: Adjust based on Sales Volume
    if row['Sales_Volume'] > median_sales:
        expiry_factor *= 1.1  # Product is selling fast, reduce discount
    else:
        expiry_factor *= 0.9  # Product is slow-selling, increase discount

    expiry_factor = min(expiry_factor, 1.0)  # cap it at 1.0 (never increase price above original)

    # Step 3: Calculate final predicted price
    return row['Unit_Price'] * expiry_factor


# The median does not change while rows are being priced, so compute it once
# and hand it to every call instead of recomputing it per row.
median_sales_volume = df_clean['Sales_Volume'].median()

# df_clean is the result of boolean filtering; work on an explicit copy so that
# adding a column does not trigger SettingWithCopyWarning.
df_clean = df_clean.copy()
df_clean['Final_Price'] = df_clean.apply(
    dynamic_final_price, axis=1, median_sales=median_sales_volume
)

# NOTE(review): this is a second IQR pass over columns that were already
# filtered in an earlier cell. Quartiles are recomputed on the reduced data, so
# further rows can be removed here; kept as-is to preserve the saved dataset —
# confirm whether it is intentional.
for col in outlier_columns:
    df_clean = remove_outliers_iqr(df_clean, col)

# Preview goes last so the notebook actually renders it.
df_clean[['Unit_Price', 'days_to_expiry', 'Sales_Volume', 'Final_Price']].head(10)


In [13]:
# Save the feature-engineered dataset (now including Final_Price) to CSV,
# without writing the index column.
df_clean.to_csv("Grocery_Feature_Engineered.csv", index=False)


In [14]:
# Feature engineering is done; the data was saved to Grocery_Feature_Engineered.csv

In [15]:
# NOTE(review): ad-hoc check of the kernel's working directory, which is where
# the relative CSV paths in this notebook are read from and written to. The
# saved output exposes a local user path; consider clearing it before sharing
# and moving this import to the top import cell.
import os
os.getcwd()


'C:\\Users\\Diya arora\\Desktop'