In [2]:
import os
import sys
import warnings
import pandas as pd
import numpy as np

# Ignore UserWarning instances attributed to modules whose name matches "pandas".
# NOTE(review): the output of the load cell below still shows a warning trace
# from pd.read_sql, so this filter may not match that warning - confirm.
warnings.filterwarnings(action="ignore", category=UserWarning, module="pandas")

# Project layout: the local `tasks` package is imported from <PROJECT_ROOT>/src,
# so that directory has to be on sys.path before the project import runs.
PROJECT_ROOT = r"D:\demand_forecasting_system"
SRC_DIR = os.path.join(PROJECT_ROOT, "src")

# Append only once so re-running this cell does not keep growing sys.path.
if sys.path.count(SRC_DIR) == 0:
    sys.path.append(SRC_DIR)

from tasks.extract_mssql import fetch_table_data

In [3]:
# Load the product table through the project's fetch helper
# (imported from tasks.extract_mssql).
df_products = fetch_table_data('blinkit_products')

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
df_products.info()
print(df_products.head())

  df = pd.read_sql(query, conn)
2025-10-21 21:08:34,697 | INFO | data_pipeline | Fetched 268 rows from table 'blinkit_products' (DB)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   product_id         268 non-null    int64  
 1   product_name       268 non-null    object 
 2   category           268 non-null    object 
 3   brand              268 non-null    object 
 4   price              268 non-null    float64
 5   mrp                268 non-null    float64
 6   margin_percentage  268 non-null    float64
 7   shelf_life_days    268 non-null    int64  
 8   min_stock_level    268 non-null    int64  
 9   max_stock_level    268 non-null    int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 21.1+ KB
None
   product_id product_name             category                    brand  \
0        4452   Baby Wipes            Baby Care             Morar-Mistry   
1        6405    Baby Food            Baby Care            Kashyap-Reddy   
2        9436   Toothpa

In [4]:
# Columns carried into the cleaned product table, in output order.
usable_cols = [
    'product_id',
    'product_name',
    'category',
    'brand',
    'price',
    'mrp',
    'margin_percentage',
    'shelf_life_days',
    'min_stock_level',
    'max_stock_level',
]

# Restrict to those columns and take an independent copy for the cleaning steps.
# A missing column raises KeyError here rather than being silently skipped.
df_products = df_products.loc[:, usable_cols].copy()


Remove duplicates & clean column names

In [5]:
# Trim leading/trailing whitespace from every column label.
df_products.columns = df_products.columns.str.strip()

# Keep the first row seen for each product_id and discard later repeats.
df_products = df_products.drop_duplicates(subset=['product_id'], keep='first')


Handle missing values

In [6]:
# Text columns get the placeholder 'Unknown'; numeric columns get 0.
cat_cols = ['product_name', 'category', 'brand']
num_cols = ['price', 'mrp', 'margin_percentage', 'shelf_life_days', 'min_stock_level', 'max_stock_level']

# Apply each group's default in turn (text columns first, then numeric).
for cols, fill_value in ((cat_cols, 'Unknown'), (num_cols, 0)):
    df_products[cols] = df_products[cols].fillna(fill_value)


Convert datatypes

In [7]:
# Target dtype per column: identifiers and day/stock counts as int,
# monetary and margin values as float.
dtype_by_column = {
    'product_id': int,
    'price': float,
    'mrp': float,
    'margin_percentage': float,
    'shelf_life_days': int,
    'min_stock_level': int,
    'max_stock_level': int,
}

# One astype call converts every listed column; other columns are left as-is.
df_products = df_products.astype(dtype_by_column)


Derived columns

In [8]:
# Per-unit gap between MRP and selling price, stored as 'profit_per_unit'.
# NOTE(review): this is mrp - price and does not use margin_percentage -
# confirm that the column name matches the intended meaning.
price_gap = df_products['mrp'] - df_products['price']
df_products['profit_per_unit'] = price_gap

# Gap as a share of MRP. Rows with mrp == 0 divide by zero and yield +inf,
# -inf (when price > mrp) or NaN (0 / 0); all three are mapped to 0. The
# previous replace([np.inf, np.nan], 0) let -inf through into the output.
# NOTE(review): the value is a 0-1 fraction (not multiplied by 100) despite the
# '_percentage' name - confirm what downstream consumers expect.
df_products['discount_percentage'] = (
    (price_gap / df_products['mrp'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)


Final checks

In [9]:
# Sanity report on the cleaned frame, printed in this order:
# null count per column, dtype per column, then (rows, columns).
for report in (df_products.isnull().sum(), df_products.dtypes, df_products.shape):
    print(report)


product_id             0
product_name           0
category               0
brand                  0
price                  0
mrp                    0
margin_percentage      0
shelf_life_days        0
min_stock_level        0
max_stock_level        0
profit_per_unit        0
discount_percentage    0
dtype: int64
product_id               int64
product_name            object
category                object
brand                   object
price                  float64
mrp                    float64
margin_percentage      float64
shelf_life_days          int64
min_stock_level          int64
max_stock_level          int64
profit_per_unit        float64
discount_percentage    float64
dtype: object
(268, 12)


Save cleaned file

In [12]:
# Build the output location from PROJECT_ROOT (defined in the configuration
# cell) instead of repeating the project root as a second hard-coded literal.
# `os` is already imported in the first cell, so no re-import is needed here.
save_path = os.path.join(PROJECT_ROOT, "data", "processed", "blinkit_products_clean.csv")

# Create the target directory if it does not exist yet, then write the cleaned
# table without the DataFrame index.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df_products.to_csv(save_path, index=False)
print(f" Products data cleaned and saved to: {save_path}")


 Products data cleaned and saved to: D:\demand_forecasting_system\data\processed\blinkit_products_clean.csv
