In [8]:
import os
import sys
import warnings
import pandas as pd
import numpy as np

warnings.filterwarnings("ignore", category=UserWarning, module="pandas")

PROJECT_ROOT = r"D:\demand_forecasting_system"
SRC_DIR = os.path.join(PROJECT_ROOT, "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from tasks.extract_mssql import fetch_table_data

results = fetch_table_data("blinkit_customers")   

  df = pd.read_sql(query, conn)
2025-10-21 21:05:35,245 | INFO | data_pipeline | Fetched 2500 rows from table 'blinkit_customers' (DB)


Data Cleaning

In [2]:
df_customers = pd.DataFrame(results)
print(df_customers.head())


   customer_id customer_name                       email         phone  \
0        31813  Indrajit Pau   wakeetasehgal@example.com  911785834174   
1        31826     Ekiya Pau           fbedi@example.com  919364120886   
2        61020  Onkar Suresh  rudradeshpande@example.com  918421031341   
3        75482   Jeet Gandhi        amrita05@example.net  916683687947   
4       119099   Zansi Parsa          neha08@example.com  912032907631   

                               address              area  pincode  \
0    66, Vora Chowk\nMaheshtala 376030  Sultan Pur Majra   666762   
1  H.No. 336\nLad Zila, Hajipur-409871         Chinsurah   201602   
2      88/40, Bora Zila, Phusro-161515           Kurnool   345635   
3          95/861, Jaggi, Sikar-343285             Korba   298093   
4   H.No. 40, Gopal Ganj, Bidar 854350            Kavali   692517   

  registration_date customer_segment  total_orders  avg_order_value  
0        2024-04-10          Regular            18      1230.750000  


In [3]:
df_customers.columns

Index(['customer_id', 'customer_name', 'email', 'phone', 'address', 'area',
       'pincode', 'registration_date', 'customer_segment', 'total_orders',
       'avg_order_value'],
      dtype='object')

Keep Only Useful Columns

In [5]:
keep_cols = [
    "customer_id", "area", "pincode", 
    "registration_date", "customer_segment", 
    "total_orders", "avg_order_value"
]
df_customers = df_customers[keep_cols].copy()


In [6]:
df_customers.columns

Index(['customer_id', 'area', 'pincode', 'registration_date',
       'customer_segment', 'total_orders', 'avg_order_value'],
      dtype='object')

Clean Missing Values

In [7]:
# Fill missing areas with 'Unknown'
df_customers["area"] = df_customers["area"].fillna("Unknown")

# Fill missing segment with 'Regular'
df_customers["customer_segment"] = df_customers["customer_segment"].fillna("Regular")

# Replace missing numeric values with median or 0
df_customers["total_orders"] = df_customers["total_orders"].fillna(df_customers["total_orders"].median())
df_customers["avg_order_value"] = df_customers["avg_order_value"].fillna(df_customers["avg_order_value"].median())

# Convert registration_date to datetime
df_customers["registration_date"] = pd.to_datetime(df_customers["registration_date"], errors="coerce")


Validate & Fix Data Types

In [8]:
df_customers = df_customers.astype({
    "customer_id": "int64",
    "pincode": "Int64",
    "total_orders": "int64",
    "avg_order_value": "float64"
})


Remove Duplicates

In [9]:
df_customers.drop_duplicates(subset="customer_id", inplace=True)


Final Verification

In [10]:
print(df_customers.isnull().sum())
print(df_customers.dtypes)
print(df_customers.shape)


customer_id          0
area                 0
pincode              0
registration_date    0
customer_segment     0
total_orders         0
avg_order_value      0
dtype: int64
customer_id                   int64
area                         object
pincode                       Int64
registration_date    datetime64[ns]
customer_segment             object
total_orders                  int64
avg_order_value             float64
dtype: object
(2500, 7)


In [12]:
save_path = r"D:\demand_forecasting_system\data\processed\blinkit_customers_clean.csv"
import os
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df_customers.to_csv(save_path, index=False)
print(f" Customers data cleaned and saved to: {save_path}")


 Customers data cleaned and saved to: D:\demand_forecasting_system\data\processed\blinkit_customers_clean.csv
