In [None]:
import pandas as pd
import re
import time
import psutil
import os

# Load the raw item listing from a CSV file in the current working directory.
df = pd.read_csv("updated_item_list.csv")

# Remember the (rows, columns) shape of the untouched data; the row count is
# read again later to report "Rows before" next to the cleaned row count.
initial_shape = df.shape

In [None]:
# Number of rows that are exact repeats of an earlier row
duplicate_row_total = df.duplicated().sum()
print("Duplicated data:", duplicate_row_total)

# Missing-value tally for every column (isna is the same check as isnull)
null_counts = df.isna().sum()
print("Null values by column:\n", null_counts)

Duplicated data: 7
Null values by column:
 product_name       0
link               0
price              0
location        2947
timestamp          0
product_type     432
dtype: int64


In [None]:
# Clean price column and extract lowest numeric value
def extract_lowest_price(price_str):
    """Return the smallest numeric amount found in a raw price value.

    Parameters
    ----------
    price_str : str or scalar
        Raw price text, possibly holding a range such as "$20.00 - $10.50".
        Non-string scalars are converted with ``str()`` before parsing.

    Returns
    -------
    float or None
        The lowest number found, or ``None`` when the value is null/NaN or
        contains no digits.
    """
    if pd.isna(price_str):
        return None

    # The first alternative keeps thousands-grouped amounts ("1,299.50")
    # together as one token; otherwise they would be read as 1 and 299.50.
    # The second alternative is the plain integer/decimal form.
    tokens = re.findall(
        r"\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?",
        str(price_str),
    )
    if not tokens:
        return None

    # Take the minimum rather than the first match, so a range written
    # high-to-low still yields the lowest price.
    return min(float(token.replace(",", "")) for token in tokens)

In [None]:
# Start performance timer and process monitor
start_time = time.time()
process = psutil.Process(os.getpid())
# Prime the CPU counter: with interval=None psutil reports utilisation since
# the previous call, so this call marks the start of the measured window
# (its own return value is meaningless and is discarded).
process.cpu_percent(interval=None)

# Build the cleaned frame in one chain. Each step returns a new frame, which
# avoids chained in-place calls such as df[col].fillna(..., inplace=True);
# those act on a temporary copy and stop working in pandas 3.0.
df = (
    df
    # Drop duplicate rows
    .drop_duplicates()
    # Fill in null
    .fillna({'location': 'Unknown', 'product_type': 'Unknown'})
    .assign(
        # Standardize price format: lowest amount in the text, 2 decimals
        cleaned_price=lambda frame: frame['price'].apply(extract_lowest_price).round(2),
        # Clean unreadable (non-ASCII) characters from product_name
        product_name=lambda frame: frame['product_name']
        .astype(str)
        .str.replace(r"[^\x00-\x7F]+", '', regex=True),
    )
    # The raw price text is superseded by cleaned_price
    .drop(columns=['price'])
    .rename(columns={
        'product_name': 'Product Name',
        'cleaned_price': 'Price',
        'location': 'Location',
        'link': 'Link',
        'product_type': 'Product Type'
    })
)

# Save cleaned data
df.to_csv("Item_list_cleaned.csv", index=False, float_format='%.2f')

# Log performance metrics
end_time = time.time()
elapsed_time = end_time - start_time
# Utilisation since the priming call above, i.e. over the processing itself
# rather than over an idle second after it. May exceed 100% when the process
# used more than one core.
cpu_percent = process.cpu_percent(interval=None)
memory_usage_mb = process.memory_info().rss / 1024 ** 2
# Guard against a zero-length interval on coarse clocks or tiny inputs
throughput = df.shape[0] / elapsed_time if elapsed_time > 0 else float('inf')

print("\n--- Performance Metrics ---")
print(f"Total processing time: {elapsed_time:.2f} seconds")
print(f"CPU usage: {cpu_percent}%")
print(f"Memory usage: {memory_usage_mb:.2f} MB")
print(f"Throughput: {throughput:.2f} records/second")
print(f"Rows before: {initial_shape[0]}, Rows after: {df.shape[0]}")

# Offer the file as a browser download when running on Google Colab; in any
# other environment the CSV is already on disk, so just say where it is.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; cleaned file saved as Item_list_cleaned.csv")
else:
    files.download("Item_list_cleaned.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna('Unknown', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['location'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) 


--- Performance Metrics ---
Total processing time: 3.81 seconds
CPU usage: 2.0%
Memory usage: 241.00 MB
Throughput: 53992.78 records/second
Rows before: 205784, Rows after: 205777


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>