In [2]:
import duckdb
import time
import os
import psutil
import pandas as pd

# ========== Helpers ==========
def _format_number(value):
    """Return *value* formatted to two decimals, or 'N/A' when it is None."""
    return "N/A" if value is None else f"{value:.2f}"


# ========== Start Performance Tracking ==========
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# wall-clock adjustments the way a time.time() difference can.
start_time = time.perf_counter()
process = psutil.Process(os.getpid())
# Priming call: cpu_percent(interval=None) reports utilisation since the
# previous call, so this only resets the baseline; its value is discarded.
psutil.cpu_percent(interval=None)
memory_start = process.memory_info().rss / (1024 * 1024)  # in MB

# ========== Connect to DuckDB and Execute Optimized Query ==========
# The context manager closes the connection even if the query raises.
with duckdb.connect(database=':memory:') as conn:
    # Single query to compute all metrics. The inner SELECT tags every row
    # with the size of its region group; the outer SELECT aggregates and picks
    # the region with the largest group (ties broken alphabetically so the
    # result is deterministic).
    result = conn.execute("""
        WITH car_data AS (
            SELECT
                TRY_CAST("price (myr)" AS DOUBLE) AS price,
                TRY_CAST(mileage AS DOUBLE) AS mileage,
                TRIM(region) AS region
            FROM read_csv('cleaned_data.csv',
                         header=true,
                         auto_detect=true,
                         ignore_errors=true)
            WHERE "price (myr)" IS NOT NULL
              AND mileage IS NOT NULL
              AND region IS NOT NULL
        )
        SELECT
            COUNT(*) AS total_rows,
            AVG(price) AS mean_price,
            AVG(mileage) AS mean_mileage,
            FIRST(region ORDER BY region_count DESC, region) AS most_common_region
        FROM (
            SELECT
                *,
                COUNT(*) OVER (PARTITION BY region) AS region_count
            FROM car_data
        )
    """).fetchone()

    # ========== End Performance Tracking ==========
    # Sampled before the connection is closed so the figures cover the query
    # itself rather than the teardown.
    end_time = time.perf_counter()
    # System-wide CPU utilisation (%) since the priming call above.
    cpu_end = psutil.cpu_percent(interval=None)
    memory_end = process.memory_info().rss / (1024 * 1024)  # in MB

# mean_price / mean_mileage / most_common_region are None when no rows
# survive the filters (SQL aggregates over an empty set yield NULL).
total_rows, mean_price, mean_mileage, most_common_region = result

execution_time = end_time - start_time
memory_used = memory_end - memory_start
# Guard against a zero-length interval instead of raising ZeroDivisionError.
throughput = total_rows / execution_time if execution_time > 0 else float("nan")

# ========== Prepare Metrics for Saving ==========
metrics = {
    "Total Rows": [total_rows],
    "Total Processing Time (seconds)": [execution_time],
    "CPU Usage (%)": [cpu_end],
    "Memory Usage (MB)": [memory_used],
    "Throughput (records/second)": [throughput],
    "Mean Price (MYR)": [mean_price],
    "Mean Mileage (km)": [mean_mileage],
    "Most Common Region": [most_common_region],
}

# Convert metrics to DataFrame and save as CSV
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('duckdb_optimized.csv', index=False)

# ========== Print Metrics ==========
print("\n🚀 DuckDB Optimization Complete")
print(f"📄 Total Rows: {total_rows}")
print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
print(f"🧠 CPU Usage: {cpu_end:.2f}%")
print(f"💾 Memory Usage: {memory_used:.2f} MB")
print(f"⚡ Throughput: {throughput:.2f} records/second")
print(f"💸 Mean Price (MYR): RM {_format_number(mean_price)}")
print(f"🛣️ Mean Mileage: {_format_number(mean_mileage)} km")
print(f"📍 Most Common Region: {most_common_region}")


🚀 DuckDB Optimization Complete
📄 Total Rows: 175545
🕒 Total Processing Time: 0.57 seconds
🧠 CPU Usage: 43.90%
💾 Memory Usage: 80.97 MB
⚡ Throughput: 309677.71 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03 km
📍 Most Common Region: selangor
