# Data Cleaning Notebook

This notebook cleans the raw datasets (`Cars.csv`, `Customers.csv`, `Sales.csv`) using high-performance **Polars** and prepares them for further analysis or machine learning.

In [3]:
import polars as pl
import numpy as np
import re
from pathlib import Path

from src.utils.data_manager import get_data_path

## 1. Load Data
Loading raw files from `data/raw/`.

In [4]:
# Resolve the input and output directories through the project helper.
raw_dir = get_data_path("raw")
out_dir = get_data_path("cleaned")

print(f"Raw Path: {raw_dir}")
print(f"Output Path: {out_dir}")

# Read the three raw tables; the names are bound in the same order as the files.
cars_raw, customers_raw, sales_raw = (
    pl.read_csv(raw_dir / csv_name)
    for csv_name in ("Cars.csv", "Customers.csv", "Sales.csv")
)

# Quick sanity check: (rows, columns) of each table.
print(f"Cars shape: {cars_raw.shape}")
print(f"Customers shape: {customers_raw.shape}")
print(f"Sales shape: {sales_raw.shape}")

## 2. Clean Cars Data
-   **Dynamic Brand Imputation**: Mapping `Model -> Brand`.
-   **Transmission Standardization**.
-   **Type Casting**.

In [5]:
# Lower-cased spellings seen in the raw data for each transmission type
# ("manaul" is a known typo in the source, kept on purpose).
AUTOMATIC_ALIASES = ["automatic", "auto"]
MANUAL_ALIASES = ["manual", "m", "manaul"]

# A brand counts as "known" only when it is neither null nor an empty string.
brand_known = pl.col("Brand").is_not_null() & (pl.col("Brand") != "")

# 1. Build a Model -> Brand lookup with exactly ONE row per model.
#    A plain `.unique()` on (Model, Brand) keeps several rows for a model that
#    appears under more than one brand spelling, and the left join below would
#    then duplicate cars. Take the most frequent brand instead; ties are broken
#    alphabetically so the result is deterministic.
brand_map = (
    cars_raw
    .filter(brand_known)
    .group_by("Model")
    .agg(pl.col("Brand").mode().sort().first().alias("Brand_mapped"))
)

# 2. Standardize Transmission with native expressions (no per-row Python call).
#    Missing values stay null, exactly as with the previous `map_elements`
#    (which skips nulls); any other unrecognised value becomes "Unknown".
transmission_norm = pl.col("Transmission").str.strip_chars().str.to_lowercase()
transmission_clean = (
    pl.when(transmission_norm.is_in(AUTOMATIC_ALIASES)).then(pl.lit("Automatic"))
    .when(transmission_norm.is_in(MANUAL_ALIASES)).then(pl.lit("Manual"))
    .when(transmission_norm.is_not_null()).then(pl.lit("Unknown"))
    .alias("Transmission")
)

# 3. Join to fill missing brands and fix column types
cars_cleaned = (
    cars_raw
    .join(brand_map, on="Model", how="left")
    .with_columns([
        # Blank brands are turned into null first so that coalesce can
        # replace them with the looked-up brand as well.
        pl.coalesce([
            pl.when(brand_known).then(pl.col("Brand")),
            pl.col("Brand_mapped"),
        ]).alias("Brand"),
        transmission_clean,
        # Clean numeric types (unparseable values become null)
        pl.col("Price").cast(pl.Float64, strict=False),
        pl.col("Quantity_In_Stock").cast(pl.Int64, strict=False),
    ])
    # The explicit select also discards the helper column "Brand_mapped".
    .select([
        "Car_ID", "Brand", "Model", "Year", "Color", "Engine_Type",
        "Transmission", "Price", "Quantity_In_Stock", "Status"
    ])
)

# The lookup has one row per model, so the join must not change the row count.
assert cars_cleaned.height == cars_raw.height, "brand join changed the number of car rows"

cars_cleaned.head()

## 3. Clean Customers Data
-   **Remove typos** (special characters) from names.
-   **Standardize Casing**.
-   **Clean Emails**.

In [6]:
def _clean_name(column: str) -> pl.Expr:
    """Return an expression that cleans the name column `column` in place.

    Removes every character that is not a letter or a space, trims the
    whitespace left at the ends, and title-cases the result.

    `\\p{L}` matches letters in any script, so accented names such as "José"
    keep all their letters (an ASCII-only class like `a-zA-Z` would drop them).
    """
    return (
        pl.col(column)
        .str.replace_all(r"[^\p{L} ]", "")
        .str.strip_chars()
        .str.to_titlecase()
    )


customers_cleaned = (
    customers_raw
    .with_columns([
        # Remove non-letter characters except spaces from names
        _clean_name("First Name"),
        _clean_name("Last Name"),
        # Lowercase email and strip whitespace
        pl.col("Email").str.to_lowercase().str.strip_chars(),
        # Titlecase other string columns
        pl.col("City").str.to_titlecase(),
        pl.col("Job Role").str.to_titlecase()
    ])
)

customers_cleaned.head()

## 4. Clean Sales Data
-   **Date Parsing**.
-   **Foreign Key Standardization**.

In [7]:
# Each keyword names the column it overwrites, so the column order of
# `sales_raw` is kept as-is.
sales_cleaned = sales_raw.with_columns(
    # Parse day-month-year strings; values that do not match become null.
    Sale_Date=pl.col("Sale_Date").str.to_date("%d-%m-%Y", strict=False),
    # Trim the lookup keys so they line up with the other tables.
    Customer_ID=pl.col("Customer_ID").str.strip_chars(),
    Car_ID=pl.col("Car_ID").str.strip_chars(),
    # Numeric sale price; values that cannot be cast become null.
    Sale_Price=pl.col("Sale_Price").cast(pl.Float64, strict=False),
)

sales_cleaned.head()

## 5. Export Cleaned Datasets
Saving the results as `*_cleaned.csv` files in `data/cleaned/`.

In [8]:
# `write_csv` does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
Path(out_dir).mkdir(parents=True, exist_ok=True)

# Write each cleaned table next to the others in the output folder.
cars_cleaned.write_csv(out_dir / "Cars_cleaned.csv")
customers_cleaned.write_csv(out_dir / "Customers_cleaned.csv")
sales_cleaned.write_csv(out_dir / "Sales_cleaned.csv")

print("All cleaned datasets saved successfully!")