In [None]:
import polars as pl
import time
import psutil
import os

# Clean the scraped item list with Polars and report simple performance
# metrics (wall time, CPU, resident memory, throughput) for the cleaning and
# writing steps.

# Load dataset. Loading happens before the timer starts, so it is deliberately
# excluded from the reported processing time.
df = pl.read_csv("updated_item_list.csv")

# Start performance tracking.
process = psutil.Process(os.getpid())
# Prime the CPU counter: with interval=None psutil reports utilisation since
# the previous call, so this first (meaningless) reading is discarded and the
# reading taken after the pipeline covers exactly the work in between.
process.cpu_percent(interval=None)
# perf_counter() is monotonic and high-resolution; time.time() can jump if the
# system clock is adjusted while the pipeline is running.
start_time = time.perf_counter()

# Drop fully duplicated rows.
# NOTE(review): unique() does not promise to keep the input row order unless
# maintain_order=True is passed — confirm whether output order matters.
df = df.unique()

# Fill nulls with default values
df = df.with_columns([
    pl.col("location").fill_null("Unknown"),
    pl.col("product_type").fill_null("Unknown"),
])

# Normalise the text columns:
#   * price        -> first run of digits (optional decimal part) as Float64,
#                     rounded to 2 places; rows with no digits become null.
#   * product_name -> every run of non-ASCII characters removed.
# NOTE(review): the price pattern stops at the first non-digit, so a value such
# as "1,200" would be read as 1.0 — verify the source has no thousands
# separators.
df = df.with_columns([
    pl.col("price").str.extract(r"(\d+(?:\.\d+)?)").cast(pl.Float64).round(2).alias("price"),
    pl.col("product_name").str.replace_all(r"[^\x00-\x7F]+", "").alias("product_name"),
])

# Keep only the output columns, in their final order.
df = df.select([
    "product_name", "price", "location", "link", "product_type"
])

# Rename columns to their display names.
df = df.rename({
    "product_name": "Product Name",
    "price": "Price",
    "location": "Location",
    "link": "Link",
    "product_type": "Product Type"
})

# Save cleaned CSV
df.write_csv("Item_list_cleaned_optimized.csv")

# End performance tracking
end_time = time.perf_counter()
# Non-blocking read: CPU utilisation of this process since the priming call
# above. It can exceed 100% when the work is spread across several cores.
cpu_percent = process.cpu_percent(interval=None)
elapsed_time = end_time - start_time
# Resident set size at this moment (not the peak during the run), in MiB.
memory_usage_mb = process.memory_info().rss / 1024 ** 2
# Rows written per second; guard against a zero-length interval.
throughput = df.shape[0] / elapsed_time if elapsed_time > 0 else float("inf")

# Log performance
print("\n--- Optimized Performance Metrics (Polars) ---")
print(f"Total processing time: {elapsed_time:.2f} seconds")
print(f"CPU usage: {cpu_percent}%")
print(f"Memory usage: {memory_usage_mb:.2f} MB")
print(f"Throughput: {throughput:.2f} records/second")



--- Optimized Performance Metrics (Polars) ---
Total processing time: 0.57 seconds
CPU usage: 1.0%
Memory usage: 337.42 MB
Throughput: 359721.64 records/second
