In [1]:
import os
import sys
import warnings
import pandas as pd
import numpy as np

# Silence UserWarnings attributed to pandas so cell output stays readable.
# NOTE(review): the read_sql warning still appears in the fetch cell's output,
# so this module-based filter may not match it — confirm whether a
# message-based filter is wanted instead.
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")

# Project root. Overridable through the DEMAND_FORECASTING_ROOT environment
# variable so the notebook can run on machines without this drive layout;
# falls back to the original location when the variable is unset or empty.
PROJECT_ROOT = os.environ.get("DEMAND_FORECASTING_ROOT") or r"D:\demand_forecasting_system"
SRC_DIR = os.path.join(PROJECT_ROOT, "src")

# Make the project's `src` packages importable; the membership check keeps
# sys.path free of duplicates when this cell is re-run.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from tasks.extract_mssql import fetch_table_data

In [2]:
# Load the raw marketing-performance table via the project's extract helper.
df_marketing = fetch_table_data("blinkit_marketing_performance")

  df = pd.read_sql(query, conn)
2025-10-21 21:07:41,403 | INFO | data_pipeline | Fetched 5400 rows from table 'blinkit_marketing_performance' (DB)


Select only usable columns

In [4]:
# Columns kept for the cleaned marketing dataset; everything else is dropped.
usable_cols = [
    'campaign_id', 'campaign_name', 'date',
    'channel', 'impressions', 'clicks', 'conversions', 'spend', 'revenue_generated'
]

# Normalise header whitespace BEFORE selecting: a padded label such as
# ' spend' would otherwise make the selection below fail with a KeyError.
df_marketing = df_marketing.rename(
    columns=lambda label: label.strip() if isinstance(label, str) else label
)

# Fail with an explicit message (same exception type as a plain selection)
# when the source table does not provide every expected column.
missing_cols = [col for col in usable_cols if col not in df_marketing.columns]
if missing_cols:
    raise KeyError(f"Source table is missing expected columns: {missing_cols}")

# .copy() detaches the selection so later column assignments do not act on a
# view of the fetched frame.
df_marketing = df_marketing[usable_cols].copy()


In [15]:
# Per-column count of missing values (displayed as the cell result).
df_marketing.isna().sum()


campaign_id          0
campaign_name        0
date                 0
channel              0
impressions          0
clicks               0
conversions          0
spend                0
revenue_generated    0
dtype: int64

Handle duplicates & column whitespace

In [5]:
# Trim stray whitespace from the header labels, then keep only the first row
# encountered for each (campaign_id, date) pair.
trimmed_labels = df_marketing.columns.str.strip()
df_marketing = (
    df_marketing
    .set_axis(trimmed_labels, axis=1)
    .drop_duplicates(subset=['campaign_id', 'date'])
)


Handle missing values and parse dates

In [6]:
# Measure columns: a missing value is replaced with 0.
numeric_cols = ['impressions', 'clicks', 'conversions', 'spend', 'revenue_generated']
zero_filled = df_marketing[numeric_cols].fillna(0)
df_marketing[numeric_cols] = zero_filled

# Label columns: a missing value is replaced with the placeholder 'Unknown'.
for label_col in ('campaign_name', 'channel'):
    df_marketing[label_col] = df_marketing[label_col].fillna('Unknown')

# Parse the date column, reading ambiguous values day-first.
# NOTE(review): errors='coerce' turns unparseable values into NaT and nothing
# in this cell fills or drops them — confirm that is acceptable downstream.
parsed_dates = pd.to_datetime(df_marketing['date'], errors='coerce', dayfirst=True)
df_marketing['date'] = parsed_dates


Fix datatypes

In [7]:
# Cast in one pass: the campaign identifier to int, every measure column to float.
target_dtypes = {'campaign_id': int}
target_dtypes.update({measure: float for measure in numeric_cols})
df_marketing = df_marketing.astype(target_dtypes)


Create derived metrics

In [16]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, mapping undefined results to 0.

    Division by zero yields +inf, -inf or NaN depending on the numerator's
    sign; all three (and any NaN already present) are replaced with 0 so the
    derived metric columns contain only finite numbers.
    """
    ratio = numerator / denominator
    # Fold both infinities into NaN first, then zero-fill in a single step.
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


# Click-through rate: clicks per impression.
df_marketing['ctr'] = _safe_ratio(df_marketing['clicks'], df_marketing['impressions'])

# Conversion rate: conversions per click.
df_marketing['conversion_rate'] = _safe_ratio(df_marketing['conversions'], df_marketing['clicks'])

# ROI: net return (revenue minus spend) per unit of spend.
df_marketing['roi'] = _safe_ratio(
    df_marketing['revenue_generated'] - df_marketing['spend'],
    df_marketing['spend'],
)


In [17]:
# Post-cleaning sanity report: null counts, column dtypes, then (rows, columns).
print(
    df_marketing.isna().sum(),
    df_marketing.dtypes,
    df_marketing.shape,
    sep="\n",
)


campaign_id          0
campaign_name        0
date                 0
channel              0
impressions          0
clicks               0
conversions          0
spend                0
revenue_generated    0
ctr                  0
conversion_rate      0
roi                  0
dtype: int64
campaign_id                   int64
campaign_name                object
date                 datetime64[ns]
channel                      object
impressions                 float64
clicks                      float64
conversions                 float64
spend                       float64
revenue_generated           float64
ctr                         float64
conversion_rate             float64
roi                         float64
dtype: object
(5400, 12)


In [18]:
# Build the destination from PROJECT_ROOT (set in the first cell) instead of
# repeating the absolute path, so the two cannot drift apart. `os` is already
# imported in the first cell, so the redundant re-import is dropped.
save_path = os.path.join(PROJECT_ROOT, "data", "processed", "blinkit_marketing_clean.csv")

# Create the output folder on first run; exist_ok keeps re-runs from failing.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# index=False: the row index is not written to the CSV.
df_marketing.to_csv(save_path, index=False)
print(f"✅ Marketing data cleaned and saved to: {save_path}")


✅ Marketing data cleaned and saved to: D:\demand_forecasting_system\data\processed\blinkit_marketing_clean.csv
