**Install Required Packages**

In [None]:
!pip install supabase

Collecting supabase
  Downloading supabase-2.15.1-py3-none-any.whl.metadata (11 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.12.0-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<1.1,>0.19 (from supabase)
  Downloading postgrest-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<2.5.0,>=2.4.0 (from supabase)
  Downloading realtime-2.4.3-py3-none-any.whl.metadata (6.7 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.3-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.4-py3-none-any.whl.metadata (1.2 kB)
Collecting pytest-mock<4.0.0,>=3.14.0 (from gotrue<3.0.0,>=2.11.0->supabase)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest<1.1,>0.19->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting aiohttp<4.0.0,>=3.11.18 (from realtime<2.5.0,>=2.4.0-

**Import Libraries**

In [None]:
import os
import time
from collections import defaultdict

import numpy as np
import psutil
from joblib import Parallel, delayed, effective_n_jobs
from prettytable import PrettyTable
from supabase import create_client, Client

**Initialize Supabase Client**

In [None]:
# Connection settings for the Supabase project.
# SUPABASE_URL / SUPABASE_KEY from the environment take precedence so credentials
# can be supplied without editing the notebook. The literals below are kept only
# as a backward-compatible fallback.
# NOTE(review): the fallback key's JWT payload declares role "anon"; confirm that
# row-level security is enabled before sharing this notebook, or drop the literal.
url = os.environ.get("SUPABASE_URL", "https://ugjwigpcopmtjgylopwf.supabase.co")
key = os.environ.get(
    "SUPABASE_KEY",
    "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InVnandpZ3Bjb3BtdGpneWxvcHdmIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDU4MjgxMjIsImV4cCI6MjA2MTQwNDEyMn0.oFcP1wCt1upByqTU8NgD4FpJUdv9I8sG1ECWMX1wz8I",
)

# Module-level client used by fetch_car_data().
supabase: Client = create_client(url, key)

**Define Data Fetching Function**

In [None]:
def fetch_car_data(table_name="cars_clean", batch_size=1000):
    """Download every row of a Supabase table, one page at a time.

    Uses the module-level ``supabase`` client and keeps requesting pages until
    an empty page is returned.

    Args:
        table_name: Name of the table to read. Defaults to "cars_clean".
        batch_size: Maximum number of rows requested per page.

    Returns:
        A list containing the rows of every page, in the order received.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    all_rows = []
    offset = 0

    # NOTE(review): no explicit ordering is requested, so paging relies on the
    # server returning rows in a stable order between requests - confirm.
    while True:
        page = (
            supabase.table(table_name)
            .select("*")
            .range(offset, offset + batch_size - 1)
            .execute()
        )
        rows = page.data
        if not rows:
            break

        all_rows.extend(rows)
        # Advance by what was actually received: if the server caps a page
        # below batch_size, stepping by batch_size would silently skip rows.
        offset += len(rows)
        print(f"Fetched {len(all_rows)} rows so far...")

    print(f"✅ Done. Total rows fetched: {len(all_rows)}")
    return all_rows

**💰 Query 1: Most Expensive Car in Each Location & Record Query Performance**

In [None]:
# Query 1: Most Expensive Car by Location
def query_most_expensive_car_by_location(data, n_jobs=-1):
    """Find the highest-priced car per location and measure the query.

    The records are split into chunks, each chunk is reduced to a per-location
    maximum by a joblib worker, and the partial results are merged.

    Args:
        data: Sequence of dict-like car records. The keys "c_location",
            "c_price", "c_name" and "id" are read.
        n_jobs: joblib-style worker count. Negative values such as the
            default -1 are resolved with ``effective_n_jobs``.

    Returns:
        A tuple ``(table, query_time, cpu_percent, memory_percent, throughput)``:
        a PrettyTable with one row per location, elapsed seconds, system CPU
        percent sampled across the query, system memory percent in use, and
        records processed per second.
    """
    # joblib accepts negative n_jobs, but np.array_split requires a positive
    # section count, so resolve the value before chunking.
    n_chunks = max(1, effective_n_jobs(n_jobs))

    # Prime psutil so the reading taken after the query covers the query itself
    # rather than an idle second afterwards (very short runs give coarse values).
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()

    # Use numpy for balanced chunking
    chunks = np.array_split(data, n_chunks)

    def process_chunk(chunk):
        """Return {location: most expensive car info} for one chunk."""
        best_in_chunk = {}
        for car in chunk:
            location = car.get("c_location")
            price = car.get("c_price")

            # Skip invalid or noisy location values
            if not location or location in ("Used", "c_location"):
                continue

            # Skip missing or non-numeric prices (None fails this check too)
            if not isinstance(price, (int, float)):
                continue

            current = best_in_chunk.get(location)
            if current is None or price > current["price"]:
                best_in_chunk[location] = {
                    "price": price,
                    "c_name": car.get("c_name"),
                    "id": car.get("id")
                }
        return best_in_chunk

    results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(chunk) for chunk in chunks)

    # Merge the per-chunk winners, keeping the higher price for each location
    most_expensive_cars = {}
    for result in results:
        for location, car_info in result.items():
            current = most_expensive_cars.get(location)
            if current is None or car_info["price"] > current["price"]:
                most_expensive_cars[location] = car_info

    query_time = time.perf_counter() - start_time
    cpu_percent = psutil.cpu_percent(interval=None)
    memory_percent = psutil.virtual_memory().percent
    throughput = len(data) / query_time if query_time > 0 else 0

    table = PrettyTable()
    table.field_names = ["ID", "Location", "Car Name", "Price"]
    for location, car_info in most_expensive_cars.items():
        table.add_row([car_info["id"], location, car_info["c_name"], car_info["price"]])

    return table, query_time, cpu_percent, memory_percent, throughput

**📆 Query 2: Total Number of Cars Available by Year & Record Query Performance**
*(Shows the first 5 years in ascending order)*

In [None]:
# Query 2: Total Cars per Year
def query_total_cars_per_year(data, n_jobs=-1):
    """Count records per "c_year" value and measure the query.

    The records are split into chunks, each chunk is counted by a joblib
    worker, and the per-chunk counts are summed.

    Args:
        data: Sequence of dict-like car records. The key "c_year" is read;
            records with a missing or falsy year are ignored.
        n_jobs: joblib-style worker count. Negative values such as the
            default -1 are resolved with ``effective_n_jobs``.

    Returns:
        A tuple ``(table, query_time, cpu_percent, memory_percent, throughput)``.
        The PrettyTable lists the first five years in ascending order with
        their counts; the remaining items are elapsed seconds, system CPU
        percent sampled across the query, system memory percent in use, and
        records processed per second.
    """
    # joblib accepts negative n_jobs, but np.array_split requires a positive
    # section count, so resolve the value before chunking.
    n_chunks = max(1, effective_n_jobs(n_jobs))

    # Prime psutil so the reading taken after the query covers the query itself
    # rather than an idle second afterwards (very short runs give coarse values).
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()

    # Use numpy for balanced chunking
    chunks = np.array_split(data, n_chunks)

    def process_chunk(chunk):
        """Return {year: count} for one chunk of records."""
        year_count = defaultdict(int)
        for item in chunk:
            year = item.get('c_year')
            if year:
                year_count[year] += 1
        return year_count

    results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(chunk) for chunk in chunks)

    # Sum the per-chunk counts
    combined = defaultdict(int)
    for result in results:
        for year, count in result.items():
            combined[year] += count

    query_time = time.perf_counter() - start_time
    cpu_percent = psutil.cpu_percent(interval=None)
    memory_percent = psutil.virtual_memory().percent
    throughput = len(data) / query_time if query_time > 0 else 0

    table = PrettyTable()
    table.field_names = ["Year", "Count"]
    # Only the five lowest years are displayed
    for year, count in sorted(combined.items(), key=lambda x: x[0])[:5]:
        table.add_row([year, count])

    return table, query_time, cpu_percent, memory_percent, throughput

**🚗 Query 3: Average Price of Cars by Engine Size (Grouped by 500cc) & Record Query Performance**
*(Shows Top 5)*

In [None]:
# Query 3: Average Price by Engine Size (Grouped by 500cc Intervals, Top 5)
def query_average_price_by_engine_size(data, n_jobs=-1):
    """Average "c_price" per 500-unit "c_engine" bucket and measure the query.

    Each record's engine value is floored to a multiple of 500, prices are
    collected per bucket in joblib workers, and the buckets are averaged.

    Args:
        data: Sequence of dict-like car records. The keys "c_engine" and
            "c_price" are read; records where the engine is falsy or either
            value is not an int/float are ignored.
        n_jobs: joblib-style worker count. Negative values such as the
            default -1 are resolved with ``effective_n_jobs``.

    Returns:
        A tuple ``(table, query_time, cpu_percent, memory_percent, throughput)``.
        The PrettyTable holds the five buckets with the highest average price,
        highest first; the remaining items are elapsed seconds, system CPU
        percent sampled across the query, system memory percent in use, and
        records processed per second.
    """
    interval = 500  # Group size in cc

    # joblib accepts negative n_jobs, but np.array_split requires a positive
    # section count, so resolve the value before chunking.
    n_chunks = max(1, effective_n_jobs(n_jobs))

    # Prime psutil so the reading taken after the query covers the query itself
    # rather than an idle second afterwards (very short runs give coarse values).
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()

    # Split the data into chunks for parallel processing
    chunks = np.array_split(data, n_chunks)

    def process_chunk(chunk):
        """Return {bucket lower bound: [prices]} for one chunk of records."""
        engine_price_map = defaultdict(list)
        for item in chunk:
            engine = item.get('c_engine')
            price = item.get('c_price')
            if engine and isinstance(engine, (int, float)) and isinstance(price, (int, float)):
                # Group by engine size intervals (500cc)
                engine_group = int(engine // interval * interval)
                engine_price_map[engine_group].append(price)
        return engine_price_map

    # Parallel processing
    results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(chunk) for chunk in chunks)

    # Combine results
    combined = defaultdict(list)
    for result in results:
        for engine_group, prices in result.items():
            combined[engine_group].extend(prices)

    # Compute average prices
    average_prices = {
        group: sum(prices) / len(prices)
        for group, prices in combined.items()
        if prices
    }

    # Sort by average price descending and keep top 5
    top5_avg_prices = sorted(average_prices.items(), key=lambda x: x[1], reverse=True)[:5]

    query_time = time.perf_counter() - start_time
    cpu_percent = psutil.cpu_percent(interval=None)
    memory_percent = psutil.virtual_memory().percent
    throughput = len(data) / query_time if query_time > 0 else 0

    # Prepare table
    table = PrettyTable()
    table.field_names = ["Engine Size Group (cc)", "Average Price"]
    for group, avg_price in top5_avg_prices:
        table.add_row([f"{group}cc", f"${avg_price:,.2f}"])

    return table, query_time, cpu_percent, memory_percent, throughput

**📍 Query 4: Total Number of Cars by Location & Record Query Performance**

In [None]:
# Query 4: Total Cars by Location
def query_total_cars_by_location(data, n_jobs=-1):
    """Count records per "c_location" value and measure the query.

    The records are split into chunks, each chunk is counted by a joblib
    worker, and the per-chunk counts are summed.

    Args:
        data: Sequence of dict-like car records. The key "c_location" is read;
            falsy locations and the values "Used" and "c_location" are ignored.
        n_jobs: joblib-style worker count. Negative values such as the
            default -1 are resolved with ``effective_n_jobs``.

    Returns:
        A tuple ``(table, query_time, cpu_percent, memory_percent, throughput)``.
        The PrettyTable lists every location alphabetically with its count;
        the remaining items are elapsed seconds, system CPU percent sampled
        across the query, system memory percent in use, and records processed
        per second.
    """
    # joblib accepts negative n_jobs, but np.array_split requires a positive
    # section count, so resolve the value before chunking.
    n_chunks = max(1, effective_n_jobs(n_jobs))

    # Prime psutil so the reading taken after the query covers the query itself
    # rather than an idle second afterwards (very short runs give coarse values).
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()

    # Use numpy to split data into chunks for parallel processing
    chunks = np.array_split(data, n_chunks)

    def process_chunk(chunk):
        """Return {location: count} for one chunk of records."""
        location_counts = defaultdict(int)
        for item in chunk:
            location = item.get('c_location')
            # Filter: exclude None, "Used", and "c_location"
            if location and location not in ("Used", "c_location"):
                location_counts[location] += 1
        return location_counts

    # Run parallel processing
    results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(chunk) for chunk in chunks)

    # Combine results
    combined = defaultdict(int)
    for result in results:
        for location, count in result.items():
            combined[location] += count

    # Sort locations alphabetically
    sorted_locations = sorted(combined.items())

    query_time = time.perf_counter() - start_time
    cpu_percent = psutil.cpu_percent(interval=None)
    memory_percent = psutil.virtual_memory().percent
    throughput = len(data) / query_time if query_time > 0 else 0

    # Create output table
    table = PrettyTable()
    table.field_names = ["Location", "Count"]
    for location, count in sorted_locations:
        table.add_row([location, count])

    return table, query_time, cpu_percent, memory_percent, throughput

**🛣️ Query 5: Average Minimum Mileage Grouped by Condition & Record Query Performance**

In [None]:
# Query 5: Average Minimum Mileage by Condition
def query_avg_min_mileage_by_condition(data, n_jobs=-1):
    """Average "c_mileage_min" per "c_condition" value and measure the query.

    Each joblib worker accumulates a running sum and count per condition for
    its chunk; the partial sums and counts are then merged and divided.

    Args:
        data: Sequence of dict-like car records. The keys "c_condition" and
            "c_mileage_min" are read; records with a falsy condition or a
            mileage that is not an int/float are ignored.
        n_jobs: joblib-style worker count. Negative values such as the
            default -1 are resolved with ``effective_n_jobs``.

    Returns:
        A tuple ``(table, query_time, cpu_percent, memory_percent, throughput)``.
        The PrettyTable has one row per condition with the average formatted
        to two decimals; the remaining items are elapsed seconds, system CPU
        percent sampled across the query, system memory percent in use, and
        records processed per second.
    """
    # joblib accepts negative n_jobs, but np.array_split requires a positive
    # section count, so resolve the value before chunking.
    n_chunks = max(1, effective_n_jobs(n_jobs))

    # Prime psutil so the reading taken after the query covers the query itself
    # rather than an idle second afterwards (very short runs give coarse values).
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()

    # Use numpy for balanced chunking
    chunks = np.array_split(data, n_chunks)

    def process_chunk(chunk):
        """Return ({condition: mileage sum}, {condition: count}) for one chunk."""
        mileage_sums = defaultdict(float)
        mileage_counts = defaultdict(int)
        for car in chunk:
            condition = car.get("c_condition")
            mileage = car.get("c_mileage_min")
            if condition and isinstance(mileage, (int, float)):
                mileage_sums[condition] += mileage
                mileage_counts[condition] += 1
        return mileage_sums, mileage_counts

    results = Parallel(n_jobs=n_jobs)(delayed(process_chunk)(chunk) for chunk in chunks)

    # Merge the partial sums and counts from every chunk
    total_sums = defaultdict(float)
    total_counts = defaultdict(int)
    for mileage_sums, mileage_counts in results:
        for condition in mileage_sums:
            total_sums[condition] += mileage_sums[condition]
            total_counts[condition] += mileage_counts[condition]

    # A condition only appears in total_sums after at least one record was
    # counted for it, so the divisor is never zero.
    avg_min_mileage_by_condition = {
        condition: total_sums[condition] / total_counts[condition]
        for condition in total_sums
    }

    query_time = time.perf_counter() - start_time
    cpu_percent = psutil.cpu_percent(interval=None)
    memory_percent = psutil.virtual_memory().percent
    throughput = len(data) / query_time if query_time > 0 else 0

    table = PrettyTable()
    table.field_names = ["Condition", "Avg Min Mileage"]
    for condition, mileage in avg_min_mileage_by_condition.items():
        table.add_row([condition, f"{mileage:.2f}"])

    return table, query_time, cpu_percent, memory_percent, throughput

**Define Function to Run All Queries**

In [None]:
# Run all queries
def run_all_queries(data, n_jobs=4):
    """Execute the five benchmark queries in order and print their reports.

    For each query the title, the result table and one line of performance
    figures (time, CPU, memory, throughput) are written to stdout.

    Args:
        data: Car records handed unchanged to every query function.
        n_jobs: Worker count forwarded positionally to every query function.
    """
    titles = (
        "Query 1: Most Expensive Car by Location",
        "Query 2: Total Cars per Year",
        "Query 3: Avg Price by Engine Size",
        "Query 4: Total Cars by Location",
        "Query 5: Avg Min Mileage by Condition",
    )
    runners = (
        query_most_expensive_car_by_location,
        query_total_cars_per_year,
        query_average_price_by_engine_size,
        query_total_cars_by_location,
        query_avg_min_mileage_by_condition,
    )

    for title, runner in zip(titles, runners):
        print(f"\n{title}")
        # Every query returns the same five-element result tuple.
        table, time_taken, cpu, mem, throughput = runner(data, n_jobs)
        print(table)
        print("\nQuery Performance: ")
        print(f"Time: {time_taken:.2f}s | CPU: {cpu}% | Memory: {mem}% | Throughput: {throughput:.2f} records/s\n")

**Execute Data Fetching and Run Queries**

In [None]:
# Entry point: download the full table once, then run every benchmark query on
# that same in-memory list (run_all_queries uses its default n_jobs here).
if __name__ == "__main__":
    data = fetch_car_data()
    run_all_queries(data)

Fetched 2000 rows so far...
Fetched 3000 rows so far...
Fetched 4000 rows so far...
Fetched 5000 rows so far...
Fetched 6000 rows so far...
Fetched 7000 rows so far...
Fetched 8000 rows so far...
Fetched 9000 rows so far...
Fetched 10000 rows so far...
Fetched 11000 rows so far...
Fetched 12000 rows so far...
Fetched 13000 rows so far...
Fetched 14000 rows so far...
Fetched 15000 rows so far...
Fetched 16000 rows so far...
Fetched 17000 rows so far...
Fetched 18000 rows so far...
Fetched 19000 rows so far...
Fetched 20000 rows so far...
Fetched 21000 rows so far...
Fetched 22000 rows so far...
Fetched 23000 rows so far...
Fetched 24000 rows so far...
Fetched 25000 rows so far...
Fetched 26000 rows so far...
Fetched 27000 rows so far...
Fetched 28000 rows so far...
Fetched 29000 rows so far...
Fetched 30000 rows so far...
Fetched 31000 rows so far...
Fetched 32000 rows so far...
Fetched 33000 rows so far...
Fetched 34000 rows so far...
Fetched 35000 rows so far...
Fetched 36000 rows so 