In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import os
from utils import clean_text_columns, convert_numeric, compute_kpis

# Step 2: Define file paths
# Both locations are relative paths, so they resolve against the current working directory.
raw_path = '../data/raw/marketing_campaign_dataset.csv'
processed_path = '../data/processed/cleaned_marketing_data.csv'

# Make sure the folder holding each file exists (no-op when it is already there)
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# Step 3: Load dataset
# Read the raw CSV when it is present; otherwise stop with an actionable message.
if os.path.exists(raw_path):
    df = pd.read_csv(raw_path)
else:
    raise FileNotFoundError(
        f"The dataset was not found at {raw_path}. Please place your marketing_campaign_dataset.csv file in the data/raw folder."
    )

# Echo the source, dimensions, first rows and column names as a quick sanity check
print("Loaded dataset from:", raw_path)
print("Initial data shape:", df.shape)
print(df.head())
print("\nColumns in dataset:")
print(df.columns.tolist())

# Expected columns based on provided schema
expected_cols = [
    'Company',
    'Campaign_Type',
    'Target_Audience',
    'Duration',
    'Channels_Used',
    'Conversion_Rate',
    'Acquisition_Cost',
    'ROI',
    'Location',
    'Language',
    'Clicks',
    'Impressions',
    'Engagement_Score',
    'Customer_Segment',
    'Date',
]

# Step 4: Inspect columns
# Gather every schema column that the loaded frame lacks. This is only a
# warning: processing continues even when columns are missing.
missing_cols = [name for name in expected_cols if name not in df.columns]
if missing_cols:
    print(f"Warning: Missing expected columns: {missing_cols}")

# Step 5: Data cleaning
# Drop fully duplicated rows, then replace missing values with 0 in the
# metric columns listed below (other columns keep their missing values).
df = df.drop_duplicates()
df = df.fillna(
    dict.fromkeys(
        ['Conversion_Rate', 'ROI', 'Clicks', 'Impressions', 'Engagement_Score'],
        0,
    )
)

# Define column groups
# Columns handed to the text-cleaning helper
text_columns = [
    'Company',
    'Campaign_Type',
    'Target_Audience',
    'Channels_Used',
    'Location',
    'Language',
    'Customer_Segment',
]

# Columns handed to the numeric-conversion helper
numeric_cols = [
    'Duration',
    'Conversion_Rate',
    'Acquisition_Cost',
    'ROI',
    'Clicks',
    'Impressions',
    'Engagement_Score',
]

# Apply utility functions
# These helpers come from the project's utils module; each takes the frame and
# returns the frame to continue with. NOTE(review): their exact behaviour is
# defined in utils and is not visible here.
df = clean_text_columns(df, text_columns)
df = convert_numeric(df, numeric_cols)
df = compute_kpis(df)

# Parse the date column when present; errors='coerce' turns values that
# cannot be parsed into NaT instead of raising.
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Step 6: Summary statistics
# describe(include='all') summarises every column; only the KPI columns are
# printed. NOTE(review): CTR, CPC and Efficiency_Index are not in the raw
# schema and are presumably added by compute_kpis -- confirm in utils.
summary = df.describe(include='all')
print("\nSummary of KPIs:")
print(summary[['Clicks', 'Impressions', 'Conversion_Rate', 'ROI', 'CTR', 'CPC', 'Efficiency_Index']])

# Step 7: Export cleaned data
# index=False keeps the DataFrame index out of the written CSV.
df.to_csv(processed_path, index=False)
print(f"\nCleaned dataset saved to {processed_path}")

# Step 8: Quick visualization (optional)
# matplotlib is imported lazily so the script still completes without it;
# in that case the plot is skipped with a message.
try:
    import matplotlib.pyplot as plt

    # Mean ROI per campaign type, highest first, drawn as a bar chart
    df.groupby('Campaign_Type')['ROI'].mean().sort_values(ascending=False).plot(
        kind='bar', title='Average ROI by Campaign Type'
    )
    plt.xlabel('Campaign Type')
    plt.ylabel('Average ROI')
    plt.tight_layout()
    plt.show()
except ImportError:
    print("Matplotlib not installed; skipping plot.")
