# In [None]:
# 01_data_cleaning.ipynb
# ---
# Marketing Analytics Dashboard - Data Cleaning & Transformation (Local data version)

# Step 1: Import libraries
import pandas as pd
import numpy as np
import os

# Step 2: Define file paths
# Both locations are relative paths, resolved against the current working
# directory at run time.
raw_path = '../data/raw/marketing_campaign_dataset.csv'
processed_path = '../data/processed/cleaned_marketing_data.csv'

# Make sure the folder holding each file exists before anything is read or
# written; exist_ok=True makes the call a no-op when the folder is already there.
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# Step 3: Load dataset
# Prefer the local raw file. Only when it is absent is the sample CSV fetched
# from GitHub, and a copy is written to raw_path for later runs.
if os.path.exists(raw_path):
    df = pd.read_csv(raw_path)
    print(f"Loaded dataset from {raw_path}")
else:
    url = "https://raw.githubusercontent.com/datablist/sample-csv-files/main/files/marketing_campaign.csv"
    df = pd.read_csv(url)
    df.to_csv(raw_path, index=False)
    print(f"Downloaded and saved raw dataset to {raw_path}")

# Quick look at what was loaded: dimensions, first rows, and column names.
print("Initial data shape:", df.shape)
print(df.head())

print("\nColumns in dataset:", list(df.columns), sep="\n")


# Step 4: Inspect columns
# Expected columns based on provided schema
expected_cols = [
    'Company', 'Campaign_Type', 'Target_Audience', 'Duration', 'Channels_Used',
    'Conversion_Rate', 'Acquisition_Cost', 'ROI', 'Location', 'Language',
    'Clicks', 'Impressions', 'Engagement_Score', 'Customer_Segment', 'Date',
]

# Ensure expected columns exist. A missing column is only reported here;
# it does not stop the script at this point.
missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    print(f"Warning: Missing expected columns: {missing_cols}")


# Step 5: Data cleaning
# - Handle missing values
# - Normalize text columns
# - Convert date and numeric columns
df.drop_duplicates(inplace=True)
df.fillna({'Conversion_Rate': 0, 'ROI': 0, 'Clicks': 0, 'Impressions': 0, 'Engagement_Score': 0}, inplace=True)


# Text columns: trim surrounding whitespace and title-case every value.
text_columns = ['Company', 'Campaign_Type', 'Target_Audience', 'Channels_Used', 'Location', 'Language', 'Customer_Segment']
for col in text_columns:
    if col in df.columns:
        normalized = df[col].astype(str).str.strip().str.title()
        # astype(str) renders a missing value as the literal text 'nan', which
        # title() would turn into a fake 'Nan' category. Keep the normalized
        # text only where the original value was present, so missing entries
        # stay missing.
        df[col] = normalized.where(df[col].notna())


# Dates: values that cannot be parsed become NaT instead of raising.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')


# Numeric columns: values that cannot be parsed become NaN and are then set to 0.
# NOTE(review): a formatted value such as '$1,000.00' or '30 days' is not
# parseable by pd.to_numeric and would therefore end up as 0 here - confirm
# the raw format of Duration / Acquisition_Cost before relying on them.
numeric_cols = ['Duration', 'Conversion_Rate', 'Acquisition_Cost', 'ROI', 'Clicks', 'Impressions', 'Engagement_Score']
for col in numeric_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)


# Step 6: Derive additional KPIs
# Each ratio is kept only on rows whose denominator is strictly positive;
# every other row gets 0 instead of the result of a division by zero.

# CTR = Clicks / Impressions
df['CTR'] = df['Clicks'].div(df['Impressions']).where(df['Impressions'] > 0, 0)

# CPC = Acquisition_Cost / Clicks
df['CPC'] = df['Acquisition_Cost'].div(df['Clicks']).where(df['Clicks'] > 0, 0)

# Efficiency_Index = ROI * Conversion_Rate / Acquisition_Cost
df['Efficiency_Index'] = (
    df['ROI']
    .mul(df['Conversion_Rate'])
    .div(df['Acquisition_Cost'])
    .where(df['Acquisition_Cost'] > 0, 0)
)


# Step 7: Summary statistics
# describe() is run over every column; only the KPI columns are printed.
kpi_columns = ['Clicks', 'Impressions', 'Conversion_Rate', 'ROI', 'CTR', 'CPC', 'Efficiency_Index']
summary = df.describe(include='all')
print("\nSummary of KPIs:")
print(summary.loc[:, kpi_columns])


# Step 8: Export cleaned data
# Write the cleaned frame to processed_path without the index column.
df.to_csv(path_or_buf=processed_path, index=False)
print()
print(f"Cleaned dataset saved to {processed_path}")


# Step 9: Quick visualization (optional)
# Only the import is guarded: if matplotlib is unavailable the plot is skipped.
# The plotting calls live in the else branch so that an unrelated ImportError
# raised while drawing is not misreported as "Matplotlib not installed".
try:
    import matplotlib.pyplot as plt
except ImportError:
    print("Matplotlib not installed; skipping plot.")
else:
    # Mean ROI per campaign type, highest first, drawn as a bar chart.
    mean_roi_by_type = df.groupby('Campaign_Type')['ROI'].mean().sort_values(ascending=False)
    mean_roi_by_type.plot(kind='bar', title='Average ROI by Campaign Type')
    plt.xlabel('Campaign Type')
    plt.ylabel('Average ROI')
    plt.tight_layout()
    plt.show()
