In [None]:
!pip install polars psutil



In [None]:
import polars as pl
import time
import psutil
import os
import pandas as pd

def run_polars_optimization(
    input_path: str = "cleaned_data.csv",
    output_path: str = "polars_optimized.csv",
) -> dict:
    """Benchmark a Polars CSV load plus a few summary statistics.

    Reads ``input_path`` with Polars, computes the row count, the mean of the
    "price (myr)" and "mileage" columns and the most frequent "region",
    then writes the collected performance metrics to ``output_path`` as a
    one-row CSV and prints a human-readable summary.

    Args:
        input_path: CSV file to load. Defaults to the original hard-coded
            "cleaned_data.csv".
        output_path: Destination of the metrics CSV. Defaults to the original
            hard-coded "polars_optimized.csv".

    Returns:
        The metrics as a dict mapping column name to a single-element list
        (the same structure that is written to ``output_path``).
    """

    def format_stat(value):
        """Render a numeric statistic with two decimals, or "N/A" if missing."""
        # Polars returns None for the mean of an empty/all-null column, and
        # formatting None with ":.2f" would raise TypeError.
        return f"{value:.2f}" if isinstance(value, (int, float)) else "N/A"

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot go backwards or collapse to zero on coarse wall clocks.
    start_time = time.perf_counter()
    process = psutil.Process(os.getpid())
    # Prime psutil's CPU sampler. With interval=None each call reports the
    # utilisation since the previous call, and the first reading is meant to
    # be discarded -- so it must not be subtracted from the later one.
    psutil.cpu_percent(interval=None)
    memory_start = process.memory_info().rss / (1024 * 1024)  # MB

    df = pl.read_csv(
        input_path,
        try_parse_dates=True,
        ignore_errors=True,  # Keep reading when some values fail to parse
        null_values=["-", "NA", "N/A", ""],  # Treat these as null values
    )

    row_count = df.shape[0]
    mean_price = df["price (myr)"].mean()
    mean_mileage = df["mileage"].mean()

    # mode() may return several values on a tie; the first one is reported.
    # Fall back to "N/A" when there is no mode or the reported mode is null.
    region_mode = df["region"].mode()
    most_common_region = region_mode[0] if len(region_mode) > 0 else None
    if most_common_region is None:
        most_common_region = "N/A"

    execution_time = time.perf_counter() - start_time
    # System-wide CPU utilisation since the priming call above.
    cpu_usage = psutil.cpu_percent(interval=None)
    memory_end = process.memory_info().rss / (1024 * 1024)  # MB
    memory_delta = memory_end - memory_start

    # Guard the division so a zero-length interval cannot raise.
    throughput = row_count / execution_time if execution_time > 0 else float("inf")

    # Some keys intentionally duplicate each other; they are kept so the CSV
    # layout stays compatible with existing consumers.
    metrics = {
        "Optimization Stage": ["Polars Optimization"],
        "Total Rows": [row_count],
        "Total Processing Time (seconds)": [execution_time],
        "CPU Usage (%)": [cpu_usage],
        "Memory Usage (MB)": [memory_delta],
        "Throughput (records/second)": [throughput],
        "Data Processed (rows)": [row_count],
        "Time Taken (seconds)": [execution_time],
        "Records per second": [throughput],
    }

    # Convert metrics to a DataFrame and save as CSV
    metrics_df = pd.DataFrame(metrics)
    metrics_df.to_csv(output_path, index=False)

    print("\n✅ Polars Optimization Complete")
    print(f"📄 Total Rows: {row_count}")
    print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
    print(f"🧠 CPU Usage: {cpu_usage:.2f}%")
    print(f"💾 Memory Usage: {memory_delta:.2f} MB")
    print(f"⚡ Throughput: {throughput:.2f} records/second")
    print(f"💸 Mean Price (MYR): RM {format_stat(mean_price)}")
    print(f"🛣️ Mean Mileage: {format_stat(mean_mileage)}km")
    print(f"📍 Most Common Region: {most_common_region}")

    return metrics

# Run the benchmark when this file is executed as a script (or as a notebook
# cell, where __name__ is also "__main__"); the returned metrics dict is kept
# in the module-level name `metrics`.
if __name__ == "__main__":
    metrics = run_polars_optimization()


✅ Polars Optimization Complete
📄 Total Rows: 175545
🕒 Total Processing Time: 0.38 seconds
🧠 CPU Usage: 25.70%
💾 Memory Usage: 130.30 MB
⚡ Throughput: 466429.09 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03km
📍 Most Common Region: selangor
