# Polars

In [1]:
!pip install polars psutil



In [2]:
import polars as pl
import time
import psutil
import os
import json

def run_polars_optimization():
    start_time = time.time()
    process = psutil.Process(os.getpid())
    cpu_start = psutil.cpu_percent(interval=None)
    memory_start = process.memory_info().rss / (1024 * 1024)  # MB

    # Read CSV with error handling for problematic columns
    df = pl.read_csv(
        "cleaned_data.csv",
        try_parse_dates=True,
        ignore_errors=True,  # Skip rows with parsing errors
        null_values=["-", "NA", "N/A", ""]  # Treat these as null values
    )

    # Alternatively, specify dtypes for problematic columns
    # df = pl.read_csv(
    #     "cleaned_data.csv",
    #     dtypes={
    #         "seating capacity": pl.Utf8  # Read as string first
    #     }
    # )

    # If needed, you can then clean the problematic column
    # df = df.with_columns(
    #     pl.col("seating capacity")
    #     .str.extract(r"(\d+)")  # Extract numbers
    #     .cast(pl.Int64)  # Convert to integer
    # )

    row_count = df.shape[0]
    mean_price = df["price (myr)"].mean()
    mean_mileage = df["mileage"].mean()

    # Handle mode safely in case all values are null
    region_mode = df["region"].mode()
    most_common_region = region_mode[0] if len(region_mode) > 0 else "N/A"

    end_time = time.time()
    execution_time = end_time - start_time
    cpu_end = psutil.cpu_percent(interval=None)
    memory_end = process.memory_info().rss / (1024 * 1024)  # MB

    metrics = {
        "Optimization Stage": "Polars Optimization",
        "Total Rows": row_count,
        "Total Processing Time (seconds)": execution_time,
        "CPU Usage (%)": cpu_end - cpu_start,
        "Memory Usage (MB)": memory_end - memory_start,
        "Throughput (records/second)": row_count / execution_time,
        "Data Processed (rows)": row_count,
        "Time Taken (seconds)": execution_time,
        "Records per second": row_count / execution_time
    }

    print("\n✅ Polars Optimization Complete")
    print(f"📄 Total Rows: {row_count}")
    print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
    print(f"🧠 CPU Usage: {cpu_end - cpu_start:.2f}%")
    print(f"💾 Memory Usage: {memory_end - memory_start:.2f} MB")
    print(f"⚡ Throughput: {row_count / execution_time:.2f} records/second")
    print(f"💸 Mean Price (MYR): RM {mean_price:.2f}")
    print(f"🛣️ Mean Mileage: {mean_mileage:.2f}km")
    print(f"📍 Most Common Region: {most_common_region}")

    return metrics

if __name__ == "__main__":
    metrics = run_polars_optimization()
    # Save metrics to a temporary JSON file
    with open("polars_metrics.json", "w") as f:
        json.dump(metrics, f)


✅ Polars Optimization Complete
📄 Total Rows: 175545
🕒 Total Processing Time: 0.61 seconds
🧠 CPU Usage: 14.80%
💾 Memory Usage: 129.63 MB
⚡ Throughput: 289688.39 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03km
📍 Most Common Region: selangor


# Pandas

In [3]:
import time
import os
import psutil
import pandas as pd
import json

def run_pandas_optimization():
    # ========== Warm-up Run ==========
    _ = pd.read_csv('cleaned_data.csv').head(100)  # Initialize libraries

    # ========== Start Performance Tracking ==========
    start_time = time.time()
    process = psutil.Process(os.getpid())

    # Stable CPU measurement
    cpu_readings = [psutil.cpu_percent(interval=0.1) for _ in range(3)]
    cpu_start = sum(cpu_readings) / len(cpu_readings)

    memory_start = process.memory_info().rss / (1024 * 1024)  # in MB

    # ========== Read and Process Data ==========
    df = pd.read_csv('cleaned_data.csv', encoding='utf-8')

    # Data cleaning
    df.dropna(subset=['price (myr)', 'mileage', 'region'], inplace=True)
    df['price (myr)'] = pd.to_numeric(df['price (myr)'], errors='coerce')
    df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')
    df = df.dropna(subset=['price (myr)', 'mileage'])

    # ========== Compute Metrics ==========
    total_rows = len(df)
    mean_price = df['price (myr)'].mean()
    mean_mileage = df['mileage'].mean()
    most_common_region = df['region'].mode()[0] if not df['region'].empty else "N/A"

    # ========== End Performance Tracking ==========
    end_time = time.time()
    cpu_end = psutil.cpu_percent(interval=0.1)
    memory_end = process.memory_info().rss / (1024 * 1024)
    execution_time = end_time - start_time

    # ========== Prepare Results ==========
    metrics = {
        "Optimization Stage": "Pandas Optimization",
            "Total Rows": total_rows,
            "Total Processing Time (seconds)": execution_time,
            "CPU Usage (%)": cpu_end - cpu_start,
            "Memory Usage (MB)": memory_end - memory_start,
            "Throughput (records/second)": total_rows / execution_time,
            "Data Processed (rows)": total_rows,
            "Time Taken (seconds)": execution_time,
            "Records per second": total_rows / execution_time
    }

    # ========== Print Results ==========
    print("\n🐍 Pandas Optimization Complete")
    print(f"📄 Total Rows: {total_rows}")
    print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
    print(f"🧠 CPU Usage: {cpu_end - cpu_start:.2f}%")
    print(f"💾 Memory Usage: {memory_end - memory_start:.2f} MB")
    print(f"⚡ Throughput: {total_rows / execution_time:.2f} records/second")
    print(f"💸 Mean Price (MYR): RM {mean_price:.2f}")
    print(f"🛣️ Mean Mileage: {mean_mileage:.2f} km")
    print(f"📍 Most Common Region: {most_common_region}")

    return metrics

if __name__ == "__main__":
    metrics = run_pandas_optimization()
    with open("pandas_metrics.json", "w") as f:
        json.dump(metrics, f)
    print("\n✅ Metrics saved to pandas_metrics.json")


🐍 Pandas Optimization Complete
📄 Total Rows: 175545
🕒 Total Processing Time: 3.44 seconds
🧠 CPU Usage: 3.60%
💾 Memory Usage: 125.80 MB
⚡ Throughput: 51003.49 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03 km
📍 Most Common Region: selangor

✅ Metrics saved to pandas_metrics.json


# DuckDB

In [6]:
import duckdb
import time
import os
import psutil
import json

def run_duckdb_optimization():
    # ========== Start Performance Tracking ==========
    start_time = time.time()
    process = psutil.Process(os.getpid())
    cpu_start = psutil.cpu_percent(interval=None)
    memory_start = process.memory_info().rss / (1024 * 1024)  # in MB

    # ========== Connect to DuckDB ==========
    conn = duckdb.connect(database=':memory:')

    try:
        # ========== Execute Optimized Query ==========
        result = conn.execute("""
            WITH car_data AS (
                SELECT
                    TRY_CAST("price (myr)" AS DOUBLE) AS price,
                    TRY_CAST(mileage AS DOUBLE) AS mileage,
                    TRIM(region) AS region
                FROM read_csv('cleaned_data.csv',
                             header=true,
                             auto_detect=true,
                             ignore_errors=true)
                WHERE "price (myr)" IS NOT NULL
                  AND mileage IS NOT NULL
                  AND region IS NOT NULL
            )
            SELECT
                COUNT(*) AS total_rows,
                AVG(price) AS mean_price,
                AVG(mileage) AS mean_mileage,
                FIRST(region ORDER BY region_count DESC) AS most_common_region
            FROM (
                SELECT
                    *,
                    COUNT(*) OVER (PARTITION BY region) AS region_count
                FROM car_data
            )
        """).fetchone()

        total_rows = result[0]
        mean_price = result[1]
        mean_mileage = result[2]
        most_common_region = result[3]

        # ========== End Performance Tracking ==========
        end_time = time.time()
        cpu_end = psutil.cpu_percent(interval=None)
        memory_end = process.memory_info().rss / (1024 * 1024)  # in MB
        execution_time = end_time - start_time

        # Prepare metrics dictionary
        metrics = {
            "Optimization Stage": "DuckDB Optimization",
            "Total Rows": total_rows,
            "Total Processing Time (seconds)": execution_time,
            "CPU Usage (%)": cpu_end - cpu_start,
            "Memory Usage (MB)": memory_end - memory_start,
            "Throughput (records/second)": total_rows / execution_time,
            "Data Processed (rows)": total_rows,
            "Time Taken (seconds)": execution_time,
            "Records per second": total_rows / execution_time
        }

        # ========== Print Metrics ==========
        print("\n🚀 DuckDB (FireDucks) - After Optimization")
        print(f"📄 Total Rows: {total_rows}")
        print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
        print(f"🧠 CPU Usage: {cpu_end - cpu_start:.2f}%")
        print(f"💾 Memory Usage: {memory_end - memory_start:.2f} MB")
        print(f"⚡ Throughput: {total_rows / execution_time:.2f} records/second")
        print(f"💸 Mean Price (MYR): RM {mean_price:.2f}")
        print(f"🛣️ Mean Mileage: {mean_mileage:.2f} km")
        print(f"📍 Most Common Region: {most_common_region}")

        return metrics

    finally:
        conn.close()

if __name__ == "__main__":
    metrics = run_duckdb_optimization()
    # Save metrics to a temporary JSON file
    with open("duckdb_metrics.json", "w") as f:
        json.dump(metrics, f)


🚀 DuckDB (FireDucks) - After Optimization
📄 Total Rows: 175545
🕒 Total Processing Time: 0.43 seconds
🧠 CPU Usage: 46.20%
💾 Memory Usage: 81.64 MB
⚡ Throughput: 406446.43 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03 km
📍 Most Common Region: selangor


In [7]:
import pandas as pd
import json
from pathlib import Path

def load_and_clean_metrics():
    # Initialize empty DataFrame with the correct columns
    columns = [
        'Optimization Stage',
        'Total Rows',
        'Total Processing Time (seconds)',
        'CPU Usage (%)',
        'Memory Usage (MB)',
        'Throughput (records/second)',
        'Data Processed (rows)',
        'Time Taken (seconds)',
        'Records per second'
    ]
    df = pd.DataFrame(columns=columns)

    # Load metrics from JSON files if they exist
    json_files = {
        'Polars' : 'polars_metrics.json',
        'Pandas': 'pandas_metrics.json',
        'DuckDB': 'duckdb_metrics.json'
    }

    for stage, file in json_files.items():
        if Path(file).exists():
            with open(file) as f:
                metrics = json.load(f)
                # Convert to DataFrame and append
                temp_df = pd.DataFrame([metrics])
                df = pd.concat([df, temp_df], ignore_index=True)

    if not df.empty:
        # Round all numeric columns to 2 decimals
        numeric_cols = df.select_dtypes(include=['float64']).columns
        df[numeric_cols] = df[numeric_cols].round(2)

        # Fix negative memory values (take absolute value)
        df['Memory Usage (MB)'] = df['Memory Usage (MB)'].abs()

        # Format throughput and records per second without commas
        df['Throughput (records/second)'] = df['Throughput (records/second)'].apply(lambda x: f"{float(x):.2f}")
        df['Records per second'] = df['Records per second'].apply(lambda x: f"{float(x):.2f}")

        # Save cleaned data
        df.to_csv("performance_after.csv", index=False)
        print("Successfully saved cleaned metrics to performance_after.csv")
        return df
    else:
        print("No JSON metrics files found (pandas_metrics.json, duckdb_metrics.json)")
        return None

if __name__ == "__main__":
    result_df = load_and_clean_metrics()
    if result_df is not None:
        print("\nCleaned Performance Metrics:")
        print(result_df)

Successfully saved cleaned metrics to performance_after.csv

Cleaned Performance Metrics:
    Optimization Stage Total Rows  Total Processing Time (seconds)  \
0  Polars Optimization     175545                             0.61   
1  Pandas Optimization     175545                             3.44   
2  DuckDB Optimization     175545                             0.43   

   CPU Usage (%)  Memory Usage (MB) Throughput (records/second)  \
0           14.8             129.63                   289688.39   
1            3.6             125.80                    51003.49   
2           46.2              81.64                   406446.43   

  Data Processed (rows)  Time Taken (seconds) Records per second  
0                175545                  0.61          289688.39  
1                175545                  3.44           51003.49  
2                175545                  0.43          406446.43  


  df = pd.concat([df, temp_df], ignore_index=True)
