In [9]:
import time
import os
import psutil
import pandas as pd

# Benchmark a pandas load/clean/aggregate pass over 'cleaned_data.csv'.
#
# Outputs:
#   * pandas_optimized_data.csv - the cleaned DataFrame
#   * pandas_optimized.csv      - a one-row table of performance metrics
#
# The two outputs are written to separate files: previously both were written
# to 'pandas_optimized.csv', so the metrics table silently overwrote the
# processed data.
PROCESSED_DATA_PATH = 'pandas_optimized_data.csv'
METRICS_PATH = 'pandas_optimized.csv'

# ========== Start Performance Tracking ==========
# perf_counter() is monotonic and high-resolution, so it is the right clock
# for measuring an elapsed interval (time.time() can jump if the wall clock
# is adjusted).
start_time = time.perf_counter()
process = psutil.Process(os.getpid())
# With interval=None, cpu_percent() reports utilisation since the *previous*
# call. This first call only establishes the baseline; its return value is
# not a measurement of this workload and must not be used in arithmetic.
cpu_start = psutil.cpu_percent(interval=None)
memory_start = process.memory_info().rss / (1024 * 1024)  # in MB

# ========== Read CSV with Pandas ==========
df = pd.read_csv('cleaned_data.csv', encoding='utf-8')

# Drop rows with missing data in the columns used below.
df = df.dropna(subset=['price (myr)', 'mileage', 'region'])

# Convert columns to numeric types; unparseable values become NaN.
df['price (myr)'] = pd.to_numeric(df['price (myr)'], errors='coerce')
df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')

# Filter out rows where the numeric conversion failed.
df = df.dropna(subset=['price (myr)', 'mileage'])

# ========== Compute Metrics ==========
total_rows = len(df)
mean_price = df['price (myr)'].mean()
mean_mileage = df['mileage'].mean()
most_common_region = df['region'].mode()[0] if not df['region'].empty else "N/A"

# ========== End Performance Tracking ==========
end_time = time.perf_counter()
# Second call: system-wide CPU utilisation since the baseline call above.
cpu_end = psutil.cpu_percent(interval=None)
memory_end = process.memory_info().rss / (1024 * 1024)  # in MB
execution_time = end_time - start_time

# cpu_end already covers exactly the measured window, so it is the usage
# figure; subtracting the baseline reading would produce a meaningless value.
cpu_usage = cpu_end
memory_usage = memory_end - memory_start  # RSS growth of this process, in MB
# Guard against a zero-length interval so the division cannot raise.
throughput = total_rows / execution_time if execution_time > 0 else 0.0

# ========== Save Processed DataFrame as CSV ==========
df.to_csv(PROCESSED_DATA_PATH, index=False)

# ========== Prepare Metrics for CSV ==========
metrics = {
    "Optimization Stage": "Pandas Optimization",
    "Total Rows": total_rows,
    "Total Processing Time (seconds)": execution_time,
    "CPU Usage (%)": cpu_usage,
    "Memory Usage (MB)": memory_usage,
    "Throughput (records/second)": throughput,
    "Data Processed (rows)": total_rows,
    "Time Taken (seconds)": execution_time,
    "Records per second": throughput,
}

# Convert metrics to a one-row DataFrame and save as CSV.
metrics_df = pd.DataFrame([metrics])
metrics_df.to_csv(METRICS_PATH, index=False)

# ========== Print Metrics ==========
print("\n🐍 Pandas Optimization Complete")
print(f"📄 Total Rows: {total_rows}")
print(f"🕒 Total Processing Time: {execution_time:.2f} seconds")
print(f"🧠 CPU Usage: {cpu_usage:.2f}%")
print(f"💾 Memory Usage: {memory_usage:.2f} MB")
print(f"⚡ Throughput: {throughput:.2f} records/second")
print(f"💸 Mean Price (MYR): RM {mean_price:.2f}")
print(f"🛣️ Mean Mileage: {mean_mileage:.2f} km")
print(f"📍 Most Common Region: {most_common_region}")

print(f"\n✅ Processed data saved to {PROCESSED_DATA_PATH}")
print("\n✅ Metrics saved to pandas_optimized.csv")


🐍 Pandas Optimization Complete
📄 Total Rows: 175545
🕒 Total Processing Time: 2.16 seconds
🧠 CPU Usage: 70.60%
💾 Memory Usage: 86.04 MB
⚡ Throughput: 81291.59 records/second
💸 Mean Price (MYR): RM 208327.84
🛣️ Mean Mileage: 56206.03 km
📍 Most Common Region: selangor

✅ Metrics saved to pandas_optimized.csv
