# Data Cleaning Notebook

This notebook cleans the raw datasets (`Cars.csv`, `Customers.csv`, `Sales.csv`) using high-performance **Polars** and prepares them for further analysis or machine learning.

In [3]:
import polars as pl
import numpy as np
import re
from pathlib import Path

## 1. Load Data
Loading raw files from `data/raw/`.

In [4]:
# Input and output locations, relative to this notebook.
raw_dir = Path("../../data/raw")
out_dir = Path("../../data/cleaned")

# Read the three raw tables in one pass; the order of the file names matches
# the order of the target variables.
cars_raw, customers_raw, sales_raw = (
    pl.read_csv(raw_dir / filename)
    for filename in ("Cars.csv", "Customers.csv", "Sales.csv")
)

# Quick sanity check of what was loaded: (rows, columns) per table.
print(
    f"Cars shape: {cars_raw.shape}",
    f"Customers shape: {customers_raw.shape}",
    f"Sales shape: {sales_raw.shape}",
    sep="\n",
)

Cars shape: (501, 14)
Customers shape: (2000, 11)
Sales shape: (10000, 8)


## 2. Clean Cars Data
-   **Dynamic Brand Imputation**: Mapping `Model -> Brand`.
-   **Transmission Standardization**.
-   **Type Casting**.

In [5]:
# Brand with blank / whitespace-only strings turned into null, so that they are
# treated as missing both when building the lookup and when imputing.
brand_or_null = (
    pl.when(pl.col("Brand").str.strip_chars() == "")
    .then(pl.lit(None, dtype=pl.String))
    .otherwise(pl.col("Brand"))
)

# 1. Create Model -> Brand mapping from rows that already have a usable brand
brand_map = (
    cars_raw
    .select(pl.col("Model"), brand_or_null.alias("Brand"))
    .drop_nulls()
    # Keep exactly one brand per model. With several (Model, Brand) pairs for
    # the same model, the left join below would duplicate car rows.
    .unique(subset="Model", keep="first", maintain_order=True)
)

# Transmission normalised with native expressions instead of a Python lambda.
# Missing values stay null (map_elements skipped nulls too); any other
# unrecognised spelling becomes "Unknown".
transmission_norm = pl.col("Transmission").str.strip_chars().str.to_lowercase()
transmission_clean = (
    pl.when(transmission_norm.is_null())
    .then(pl.lit(None, dtype=pl.String))
    .when(transmission_norm.is_in(["automatic", "auto"]))
    .then(pl.lit("Automatic"))
    .when(transmission_norm.is_in(["manual", "m", "manaul"]))
    .then(pl.lit("Manual"))
    .otherwise(pl.lit("Unknown"))
)

# 2. Join to fill missing brands
cars_cleaned = (
    cars_raw
    .join(brand_map, on="Model", how="left", suffix="_mapped")
    .with_columns([
        # Prefer the row's own brand; fall back to the brand seen for the model
        pl.coalesce([brand_or_null, pl.col("Brand_mapped")]).alias("Brand"),
        # Standardize Transmission
        transmission_clean.alias("Transmission"),
        # Clean numeric types (unparseable values become null)
        pl.col("Price").cast(pl.Float64, strict=False),
        pl.col("Quantity_In_Stock").cast(pl.Int64, strict=False)
    ])
    .drop("Brand_mapped")
)

# Imputation must never add or drop cars.
assert cars_cleaned.height == cars_raw.height, "brand join changed the number of car rows"

cars_cleaned.head()

Car_ID,Brand,Model,Year,Color,Engine_Type,Transmission,Price,Quantity_In_Stock,Status,Unnamed: 10_level_0,_duplicated_0,_duplicated_1,_duplicated_2
str,str,str,i64,str,str,str,f64,i64,str,str,str,str,str
"""C0001""","""Toyota""","""Camry""",2023,"""Red""","""Petrol""","""Automatic""",80338.15,6,"""Available""",,,,
"""C0002""","""Tesla""","""Model 3""",2019,"""Red""","""Electric""","""Manual""",26437.73,16,"""Available""",,,,
"""C0003""","""Nissan""","""Qashqai""",2018,"""Blue""","""Electric""","""Automatic""",50158.13,20,"""Available""",,,,
"""C0004""","""Hyundai""","""Sonata""",2025,"""Red""","""Hybrid""","""Automatic""",33026.14,3,"""Available""",,,,
"""C0005""","""Toyota""","""RAV4""",2016,"""White""","""Hybrid""","""Manual""",79672.9,9,"""Reserved""",,,,


In [13]:
# Keep only the real Cars columns. The preview above shows extra trailing
# columns after `Status` that are empty in the displayed rows.
#
# The trim is applied to the in-memory frame rather than to
# data/cleaned/Cars_cleaned.csv: that file is only written by the export cell
# at the end of the notebook, so reading it here fails (or picks up a stale
# file) on a fresh Restart & Run All, and the later export would overwrite the
# trimmed file with the untrimmed frame anyway.
CAR_COLUMNS = [
    'Car_ID', 'Brand', 'Model', 'Year', 'Color',
    'Engine_Type', 'Transmission', 'Price', 'Quantity_In_Stock', 'Status',
]

# Selecting a fixed column list is idempotent, so re-running this cell is safe.
cars_cleaned = cars_cleaned.select(CAR_COLUMNS)
print("Done! Trailing columns removed.")


Done! Trailing columns removed.


## 3. Clean Customers Data
-   **Remove typos** (special characters) from names.
-   **Standardize Casing**.
-   **Clean Emails**.

In [6]:
customers_cleaned = (
    customers_raw
    .with_columns([
        # Names: drop everything that is not a letter or a space, trim the
        # result, then title-case it. `\p{L}` matches letters in any script, so
        # accented characters (e.g. "José") are kept instead of being deleted
        # by an ASCII-only class.
        # NOTE(review): hyphens and apostrophes are still removed, as before
        # ("O'Brien" -> "Obrien"); confirm whether they should be preserved.
        pl.col("First Name", "Last Name")
        .str.replace_all(r"[^\p{L} ]", "")
        .str.strip_chars()
        .str.to_titlecase(),
        # Lowercase email and strip whitespace
        pl.col("Email").str.to_lowercase().str.strip_chars(),
        # Titlecase other string columns
        pl.col("City").str.to_titlecase(),
        pl.col("Job Role").str.to_titlecase()
    ])
)

customers_cleaned.head()

Customer_ID,First Name,Last Name,Gender,Age,Job Role,Phone,Email,City,State,Region
str,str,str,str,i64,str,str,str,str,str,str
"""CU0001""","""Jill""","""Snyder""","""Male""",49,"""Network Engineer""","""1-202-347-6622""","""timothy92@yahoo.com""","""Oakterrace""","""Alabama""","""South"""
"""CU0002""","""Nicholas""","""Foster""","""Male""",23,"""Librarian""","""1-752-982-6354""","""cochrancarlos@berry.info""","""West Clearterrace""","""Alaska""","""West"""
"""CU0003""","""Courtney""","""Robbins""","""Male""",60,"""Surgeon""","""1-774-292-7255""","""donna01@yahoo.com""","""Mount Shorecreek""","""Arizona""","""West"""
"""CU0004""","""Blake""","""Barry""","""Male""",67,"""Inventory Manager""","""1-553-463-9137""","""sandra08@yahoo.com""","""North Shoreheights""","""Arkansas""","""South"""
"""CU0005""","""Claudia""","""Hardin""","""Female""",31,"""Librarian""","""1-218-426-3548""","""caitlindavis@bradley.org""","""Saint Oakestates""","""California""","""West"""


## 4. Clean Sales Data
-   **Date Parsing**.
-   **Foreign Key Standardization**.

In [7]:
sales_cleaned = (
    sales_raw
    .with_columns([
        # Date format conversion (day-month-year); values that do not match
        # become null because strict=False
        pl.col("Sale_Date").str.to_date("%d-%m-%Y", strict=False),
        # Ensure consistency in lookup keys
        pl.col("Customer_ID").str.strip_chars(),
        pl.col("Car_ID").str.strip_chars(),
        # Sale Price (unparseable values become null)
        pl.col("Sale_Price").cast(pl.Float64, strict=False)
    ])
)

# strict=False hides parse failures, so report how many values were present in
# the raw data but ended up null after conversion.
for column in ("Sale_Date", "Sale_Price"):
    lost = (sales_raw[column].is_not_null() & sales_cleaned[column].is_null()).sum()
    if lost:
        print(f"Warning: {lost} {column} value(s) could not be parsed and were set to null")

sales_cleaned.head()

Sale_ID,Customer_ID,Car_ID,Sale_Date,Quantity,Sale_Price,Payment_Method,Salesperson
str,str,str,date,i64,f64,str,str
"""S00001""","""CU1241""","""C0214""",2025-03-28,3,73293.19,"""Installment""","""Ashley Ramos"""
"""S00002""","""CU0100""","""C0202""",2024-02-12,3,32681.2,"""Cash""","""Pamela Blair"""
"""S00003""","""CU1690""","""C0228""",2023-02-26,2,53530.92,"""Credit""","""Sergio Lee"""
"""S00004""","""CU0534""","""C0231""",2024-06-21,1,89816.61,"""Cash""","""Mary Johnston"""
"""S00005""","""CU1153""","""C0071""",2023-12-21,2,77590.86,"""Installment""","""Ricardo Garcia"""


## 5. Export Cleaned Datasets
Saving each cleaned table as `<Name>_cleaned.csv` in `data/cleaned/`.

In [8]:
# Make sure the output folder exists so the writes below do not fail on a
# fresh checkout where data/cleaned/ has not been created yet.
out_dir.mkdir(parents=True, exist_ok=True)

# One CSV per cleaned table.
cars_cleaned.write_csv(out_dir / "Cars_cleaned.csv")
customers_cleaned.write_csv(out_dir / "Customers_cleaned.csv")
sales_cleaned.write_csv(out_dir / "Sales_cleaned.csv")

print("All cleaned datasets saved successfully!")

All cleaned datasets saved successfully!
