In [None]:
import dask.dataframe as dd
import re
import time
import psutil
import os

# Load dataset with Dask.
# NOTE(review): Dask dataframes are built lazily, so the bulk of the CSV is
# presumably read only when .compute() is called further down - confirm
# against the Dask version in use.
df = dd.read_csv("updated_item_list.csv")

# Matches either a comma-grouped number ("1,299.00") or a plain one ("12.5").
# The (?!\d) guard stops "1,2345" from being misread as the grouped "1,234".
_PRICE_PATTERN = re.compile(
    r"\d{1,3}(?:,\d{3})+(?!\d)(?:\.\d+)?|\d+(?:\.\d+)?"
)


def extract_price(x):
    """Return the lowest price found in *x* as a float, or None.

    *x* is converted with ``str()`` first, so missing values (``None``, NaN)
    are accepted and simply yield ``None`` because they contain no digits.

    Every number in the text is considered and the smallest one is returned,
    so a range such as ``"RM 20.00 - RM 10.50"`` gives ``10.5`` regardless of
    the order in which the bounds are written. Thousands separators are
    understood: ``"1,299.00"`` is parsed as ``1299.0`` rather than ``1.0``.
    """
    prices = [
        float(match.replace(",", ""))
        for match in _PRICE_PATTERN.findall(str(x))
    ]
    return min(prices) if prices else None

# Clean product_name (remove non-ASCII characters)
def clean_name(x):
    """Return the text form of *x* with every non-ASCII character removed."""
    text = str(x)
    # Encoding with errors="ignore" discards anything outside the ASCII range;
    # decoding turns the surviving bytes back into a str.
    ascii_bytes = text.encode("ascii", errors="ignore")
    return ascii_bytes.decode("ascii")

# Start performance timer. perf_counter() is a monotonic clock, so the elapsed
# time cannot be distorted by wall-clock adjustments during the run.
start_time = time.perf_counter()
process = psutil.Process(os.getpid())
# Prime the CPU counter: with interval=None psutil reports utilisation since
# the previous call, and the very first call returns a meaningless 0.0.
process.cpu_percent(interval=None)

# Drop duplicates
df = df.drop_duplicates()

# Fill nulls
df['location'] = df['location'].fillna('Unknown')
df['product_type'] = df['product_type'].fillna('Unknown')

# Apply extraction using map and specify meta
df['cleaned_price'] = df['price'].map(extract_price, meta=('cleaned_price', 'f8'))

df['product_name'] = df['product_name'].map(clean_name, meta=('product_name', 'str'))

# Rename columns
df = df.rename(columns={
    'product_name': 'Product Name',
    'cleaned_price': 'Price',
    'location': 'Location',
    'link': 'Link',
    'product_type': 'Product Type'
})

# Drop the original 'price' column
df = df.drop('price', axis=1)

# Compute the result and round the price
result = df.compute()
result['Price'] = result['Price'].round(2)

# Save to CSV
result.to_csv("Item_list_cleaned_dask.csv", index=False, float_format='%.2f')

# Performance metrics
end_time = time.perf_counter()
elapsed_time = end_time - start_time
# Non-blocking read: CPU utilisation accumulated since the priming call above,
# i.e. over the pipeline itself. Sampling with interval=1 at this point would
# only measure the idle second after the work finished and report ~0%.
# The value can exceed 100% when several threads run on different cores.
cpu_percent = process.cpu_percent(interval=None)
# Resident set size at this moment, not the peak reached during the run.
memory_usage_mb = process.memory_info().rss / 1024 ** 2
throughput = result.shape[0] / elapsed_time

print("\n--- Dask Optimized Performance ---")
print(f"Total processing time: {elapsed_time:.2f} seconds")
print(f"CPU usage: {cpu_percent}%")
print(f"Memory usage: {memory_usage_mb:.2f} MB")
print(f"Throughput: {throughput:.2f} records/second")



--- Dask Optimized Performance ---
Total processing time: 3.81 seconds
CPU usage: 0.0%
Memory usage: 368.32 MB
Throughput: 25994.40 records/second
