# Data Cleaning Workflow

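This notebook loads raw sales data for a small or medium-sized enterprise (SME), performs basic cleaning, engineers profitability metrics (sales, cost, profit, and profit margin, with cost estimated from an assumed 70% cost ratio), and saves the processed dataset.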

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw sales CSV into a DataFrame and preview its first rows
raw_path = '../data/raw/sales_data.csv'
df = pd.read_csv(filepath_or_buffer=raw_path)
df.head(n=5)

In [None]:
# Normalise headers: trim surrounding whitespace, lower-case, spaces -> underscores
df.columns = df.columns.map(lambda label: label.strip().lower().replace(' ', '_'))

# Discard rows that exactly repeat an earlier row
df = df.drop_duplicates()

# Key numeric columns, handled one at a time (columns absent from the file are
# skipped): missing entries become 0, the values are then coerced to numbers,
# and anything that fails to parse (NaN after coercion) is replaced with 0 too.
for name in ('quantity', 'unit_price', 'discount'):
    if name not in df.columns:
        continue
    filled = df[name].fillna(0)
    df[name] = pd.to_numeric(filled, errors='coerce').fillna(0)

In [None]:
# Add derived financial columns
# Cost is not read from the data; it is estimated as a fixed share of sales.
COST_RATIO = 0.70  # assumed 70% cost ratio

# The cleaning step treats these columns as optional, but the calculations
# below cannot run without them, so fail early with a readable message.
missing_cols = [c for c in ('quantity', 'unit_price') if c not in df.columns]
if missing_cols:
    raise KeyError(f'Cannot derive financial columns; missing: {missing_cols}')

# NOTE(review): 'discount' is cleaned earlier but not applied here -- confirm
# whether sales should be net of discount.
df['sales'] = df['quantity'] * df['unit_price']
df['cost'] = df['sales'] * COST_RATIO
df['profit'] = df['sales'] - df['cost']

# Margin as a percentage of sales. Rows whose sales are not positive get 0,
# which avoids dividing by zero.
# NOTE(review): negative sales (e.g. returns) are also reported as 0 -- confirm
# this is intended.
df['profit_margin'] = np.where(df['sales'] > 0, (df['profit'] / df['sales']) * 100, 0)

df[['sales', 'cost', 'profit', 'profit_margin']].head()

In [None]:
# Save cleaned data
from pathlib import Path

processed_path = '../data/processed/sales_data_cleaned.csv'

# to_csv does not create missing folders, so make sure the output directory
# exists before writing (no error if it is already there).
Path(processed_path).parent.mkdir(parents=True, exist_ok=True)

# index=False keeps the DataFrame's row index out of the file
df.to_csv(processed_path, index=False)
print(f'Cleaned dataset saved to: {processed_path}')