In [None]:
# System-level dependencies: refresh the apt package index, then install the
# PostgreSQL server and contrib modules plus the python3-dev and libpq-dev packages.
!apt-get update
!apt-get install -y postgresql postgresql-contrib python3-dev libpq-dev

# Python packages used by this notebook.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
!pip install pandas numpy sqlalchemy psutil mimesis dask "dask[dataframe]" tqdm psycopg2-binary

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:3 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
postgresql is already the newest version (14+238).
postgresql-contrib is already the newest version (14+238).
libpq-dev is already the newest version

In [2]:
import os
import csv
import time
import psutil
import threading
import pandas as pd
import numpy as np
from datetime import datetime, date
from typing import List, Dict
from sqlalchemy import create_engine, text
from multiprocessing import Pool
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from mimesis.locales import Locale
from mimesis.schema import Fieldset
import tempfile
import io

# Docker setup for PostgreSQL
def setup_postgres_colab():
    """Setup PostgreSQL in Google Colab"""
    print("Setting up PostgreSQL in Google Colab...")

    # Install PostgreSQL
    !apt-get update
    !apt-get install -y postgresql postgresql-contrib

    # Start PostgreSQL service
    !service postgresql start

    # Configure PostgreSQL to accept connections
    !sudo -u postgres psql -c "ALTER USER postgres PASSWORD 'password';"
    !sudo -u postgres psql -c "CREATE DATABASE employees;"

    # Update pg_hba.conf to allow local connections
    !echo "host all all 0.0.0.0/0 md5" | sudo tee -a /etc/postgresql/*/main/pg_hba.conf

    # Update postgresql.conf to listen on all addresses
    !echo "listen_addresses = '*'" | sudo tee -a /etc/postgresql/*/main/postgresql.conf

    # Restart PostgreSQL to apply changes
    !service postgresql restart

    # Wait for PostgreSQL to be ready
    connection_string = "postgresql://postgres:password@localhost:5432/employees"
    engine = create_engine(connection_string)

    max_attempts = 30
    attempt = 0
    while attempt < max_attempts:
        try:
            print(f"Attempting to connect to database... (Attempt {attempt + 1}/{max_attempts})")
            with engine.connect() as connection:
                connection.execute(text("SELECT 1"))
            print("Successfully connected to PostgreSQL!")
            return connection_string
        except Exception as e:
            print(f"Connection attempt failed: {str(e)}")
            attempt += 1
            time.sleep(2)

    raise Exception("Failed to connect to PostgreSQL after maximum attempts")

# Schema creation for the SCD Type 2 history table
def setup_database(engine):
    """Ensure the ``employees`` history table exists.

    Issues a single ``CREATE TABLE IF NOT EXISTS`` statement on a connection
    obtained from ``engine`` and commits it. The table is keyed on
    ``(employee_id, valid_from)`` and carries ``valid_from`` / ``valid_to`` /
    ``is_current`` columns next to the employee attributes.
    """
    create_table_statement = text("""
            CREATE TABLE IF NOT EXISTS employees (
                employee_id INTEGER,
                name VARCHAR(100),
                email VARCHAR(100),
                address TEXT,
                phone VARCHAR(50),
                date_of_birth DATE,
                gender VARCHAR(10),
                company VARCHAR(100),
                position VARCHAR(100),
                salary DECIMAL(10,2),
                retired VARCHAR(3),
                valid_from TIMESTAMP,
                valid_to TIMESTAMP,
                is_current BOOLEAN,
                PRIMARY KEY (employee_id, valid_from)
            )
        """)

    with engine.connect() as connection:
        connection.execute(create_table_statement)
        # Explicit commit: the connection does not autocommit DDL
        connection.commit()

def generate_data(row_count: int) -> pd.DataFrame:
    """Generate a synthetic employee dataset with Mimesis and NumPy.

    Args:
        row_count: Number of rows to generate. ``employee_id`` runs from 0 to
            ``row_count - 1``.

    Returns:
        DataFrame with the columns ``employee_id``, ``name``, ``email``,
        ``address``, ``phone``, ``date_of_birth`` (ISO date string), ``gender``,
        ``company``, ``position``, ``salary`` and ``retired``. An empty frame
        with these columns is returned when ``row_count`` is 0.

    Raises:
        ValueError: if ``row_count`` is negative.
    """
    columns = [
        "employee_id", "name", "email", "address", "phone", "date_of_birth",
        "gender", "company", "position", "salary", "retired",
    ]

    if row_count < 0:
        raise ValueError(f"row_count must be non-negative, got {row_count}")
    if row_count == 0:
        # Nothing to generate; the Fieldset is never asked for zero iterations
        return pd.DataFrame(columns=columns)

    fieldset = Fieldset(locale=Locale.EN)

    # Generate every column in one call each. The call order is kept stable so a
    # seeded NumPy global RNG yields the same genders/salaries/retired flags as before.
    employee_ids = list(range(row_count))
    names = fieldset("full_name", i=row_count)
    emails = fieldset("email", i=row_count)
    addresses = fieldset("address", i=row_count)
    phones = fieldset("telephone", i=row_count)
    # `birth_date` avoids shadowing the module-level `date` import
    birth_dates = [
        birth_date.isoformat()
        for birth_date in fieldset("date", start=1950, end=2005, i=row_count)
    ]
    genders = np.random.choice(["Male", "Female"], size=row_count).tolist()
    cities = fieldset("city", i=row_count)
    positions = fieldset("occupation", i=row_count)
    salaries = np.round(np.random.uniform(30000, 200000, row_count), 2).tolist()
    retired = np.random.choice(["Yes", "No"], size=row_count).tolist()

    # Build the frame column-wise; this avoids materialising one dict per row
    return pd.DataFrame(
        {
            "employee_id": employee_ids,
            "name": names,
            "email": emails,
            "address": addresses,
            "phone": phones,
            "date_of_birth": birth_dates,
            "gender": genders,
            "company": [f"{city} Corp" for city in cities],
            "position": positions,
            "salary": salaries,
            "retired": retired,
        },
        columns=columns,
    )

def identify_changes(new_df: pd.DataFrame, engine) -> pd.DataFrame:
    """Classify each incoming row as ``insert``, ``update`` or ``no_change``.

    The current rows (``is_current = true``) of the ``employees`` table are read
    through ``engine`` and left-joined to ``new_df`` on ``employee_id``. A row is
    an ``insert`` when no current row exists for its id, an ``update`` when any of
    the tracked columns (name, email, address, phone, position, salary) differs,
    and ``no_change`` otherwise.

    Args:
        new_df: Incoming records; must contain ``employee_id`` and the tracked columns.
        engine: Connectable accepted by ``pandas.read_sql``.

    Returns:
        A copy of ``new_df`` (same index and row order) with an added
        ``change_type`` column. ``new_df`` itself is not modified.
    """
    tracked_columns = ['name', 'email', 'address', 'phone', 'position', 'salary']
    compared_columns = ['employee_id'] + tracked_columns

    current_records = pd.read_sql(
        """
        SELECT * FROM employees
        WHERE is_current = true
        """,
        engine
    )

    # Work on a copy so repeated calls with the same frame start from clean input
    result = new_df.copy()

    if current_records.empty:
        result['change_type'] = 'insert'
        return result

    # Keep one current row per employee so the left join stays one row per incoming row
    current = current_records[compared_columns].drop_duplicates(subset='employee_id', keep='last')

    merged = new_df[compared_columns].merge(
        current,
        on='employee_id',
        how='left',
        suffixes=('_new', '_current'),
        indicator=True
    )

    # The merge indicator tells unmatched rows apart even if an existing row has a NULL name
    is_new = (merged['_merge'] == 'left_only').to_numpy()

    is_changed = np.zeros(len(merged), dtype=bool)
    for column in tracked_columns:
        incoming_values = merged[f'{column}_new']
        existing_values = merged[f'{column}_current']
        if column == 'salary':
            # The table stores salary as DECIMAL(10,2); compare numerically with a
            # half-cent tolerance instead of relying on exact float equality.
            incoming_num = pd.to_numeric(incoming_values, errors='coerce').to_numpy(dtype=float)
            existing_num = pd.to_numeric(existing_values, errors='coerce').to_numpy(dtype=float)
            differs = ~np.isclose(incoming_num, existing_num, rtol=0, atol=0.005, equal_nan=True)
        else:
            # Missing on both sides counts as equal (NaN != NaN would otherwise flag a change)
            both_missing = incoming_values.isna() & existing_values.isna()
            differs = ((incoming_values != existing_values) & ~both_missing).to_numpy(dtype=bool)
        is_changed |= differs

    # Assign positionally (plain arrays) so a non-default index on new_df cannot misalign the labels
    result['change_type'] = np.where(
        is_new, 'insert', np.where(is_changed, 'update', 'no_change')
    )

    return result

def apply_scd2_changes_fixed(df: pd.DataFrame, engine) -> pd.DataFrame:
    """Expire superseded rows and prepare the new SCD Type 2 versions for insertion.

    For every row marked ``update`` the matching current row in ``employees`` is
    closed (``valid_to`` = now, ``is_current`` = FALSE). The returned frame holds
    only the rows that need a new version (``insert`` and ``update``), stamped
    with ``valid_from`` = now, ``valid_to`` = ``2099-12-31 23:59:59`` (used as
    "infinity") and ``is_current`` = True.

    Args:
        df: Frame with a ``change_type`` column as produced by ``identify_changes``.
        engine: SQLAlchemy engine; only used when at least one row is an update.

    Returns:
        A new DataFrame without the ``change_type`` column. ``df`` is not modified.
        Rows marked ``no_change`` are left out so an unchanged employee does not
        get a second current version.
    """
    current_timestamp = datetime.now().isoformat()

    update_ids = df.loc[df['change_type'] == 'update', 'employee_id'].tolist()
    if update_ids:
        # NOTE(review): the expiry runs in its own transaction; if the later insert of
        # the new versions fails, the old rows stay expired without a replacement.
        with engine.begin() as conn:
            conn.execute(
                text("""
                    UPDATE employees
                    SET valid_to = :valid_to,
                        is_current = FALSE
                    WHERE employee_id IN :employee_ids
                    AND is_current = TRUE
                """),
                {
                    "valid_to": current_timestamp,
                    # NOTE(review): binding a tuple to `IN :employee_ids` presumably relies on
                    # the DB driver rendering it as a value list — confirm for other drivers.
                    "employee_ids": tuple(update_ids)
                }
            )

    new_versions = df.loc[df['change_type'] != 'no_change'].drop(columns=['change_type'])

    # assign() returns a new frame, so the caller's DataFrame keeps its original columns
    return new_versions.assign(
        valid_from=current_timestamp,
        # Use 2099-12-31 instead of 9999-12-31 for valid_to as "infinity"
        valid_to='2099-12-31 23:59:59',
        is_current=True,
    )

def reset_table(engine):
    """Bring the ``employees`` table back to an empty state.

    The table is dropped (if present) and committed, then re-created through
    ``setup_database``. Any failure is reported on stdout and re-raised.
    """
    try:
        drop_statement = text("DROP TABLE IF EXISTS employees")
        with engine.connect() as connection:
            connection.execute(drop_statement)
            connection.commit()

        # Rebuild the schema so callers always find an empty, well-formed table
        setup_database(engine)

        print("Table dropped and recreated successfully")
    except Exception as error:
        print(f"Error resetting table: {str(error)}")
        raise

# truncate_table is a thin alias: despite its name it drops and re-creates the table
def truncate_table(engine):
    """Empty the ``employees`` table by delegating to ``reset_table``.

    No TRUNCATE statement is issued; ``reset_table(engine)`` drops the table and
    creates it again.
    """
    reset_table(engine)

# Background sampler for CPU and memory usage
def monitor_resources(interval, stats):
    """Append CPU and memory usage samples to ``stats`` until told to stop.

    Args:
        interval: Seconds to sleep between two samples.
        stats: Shared dict with the lists ``'cpu'`` and ``'memory'`` (samples are
            appended to them) and the flag ``'stop'``; the loop ends once the
            flag is truthy when checked at the start of an iteration.
    """
    while True:
        if stats['stop']:
            return

        cpu_samples = stats['cpu']
        cpu_samples.append(psutil.cpu_percent(interval=None))

        memory_samples = stats['memory']
        memory_samples.append(psutil.virtual_memory().percent)

        time.sleep(interval)

def print_resource_stats(stats):
    """Print average, maximum and minimum CPU and memory usage.

    Args:
        stats: Dict with the sample lists ``'cpu'`` and ``'memory'`` (percent values).

    A metric whose sample list is empty is reported as having no samples instead
    of raising ``ZeroDivisionError`` / ``ValueError``.
    """
    print("\nResource Usage Statistics:")
    for label, key in (("CPU", 'cpu'), ("Memory", 'memory')):
        samples = stats[key]
        if not samples:
            # Can happen when the monitored call finishes before the first sample is taken
            print(f"{label} Usage: no samples collected")
            continue
        print(f"Average {label} Usage: {sum(samples) / len(samples):.2f}%")
        print(f"Max {label} Usage: {max(samples):.2f}%")
        print(f"Min {label} Usage: {min(samples):.2f}%")

def monitor_performance(func):
    """Decorator that times a loading function and samples CPU/memory while it runs.

    A background thread runs ``monitor_resources`` with a 1-second interval for
    the duration of the call.

    Returns (from the wrapped call):
        tuple: ``(duration, resource_stats)`` where ``duration`` is the elapsed
        time in seconds and ``resource_stats`` is a dict with ``duration``,
        ``avg_cpu``, ``max_cpu``, ``avg_memory`` and ``max_memory`` (0 when no
        sample was collected). The wrapped function's own return value is discarded.

    Any exception raised by the wrapped function propagates unchanged.
    """
    import functools  # local import: functools is not among the notebook's top-level imports

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        stats = {'cpu': [], 'memory': [], 'stop': False}

        # Start monitoring thread (daemon so it can never keep the interpreter alive)
        monitor_thread = threading.Thread(
            target=monitor_resources, args=(1, stats), daemon=True
        )
        monitor_thread.start()

        try:
            # Execute the loading function
            start_time = time.perf_counter()
            func(*args, **kwargs)
            duration = time.perf_counter() - start_time
        finally:
            # Always stop the sampler, including on KeyboardInterrupt, which an
            # `except Exception` handler would not catch.
            stats['stop'] = True
            monitor_thread.join()

        cpu_samples = stats['cpu']
        memory_samples = stats['memory']

        # Calculate resource statistics
        resource_stats = {
            'duration': duration,
            'avg_cpu': sum(cpu_samples) / len(cpu_samples) if cpu_samples else 0,
            'max_cpu': max(cpu_samples) if cpu_samples else 0,
            'avg_memory': sum(memory_samples) / len(memory_samples) if memory_samples else 0,
            'max_memory': max(memory_samples) if memory_samples else 0
        }

        if cpu_samples and memory_samples:
            print_resource_stats(stats)
        else:
            # Very short calls can end before the first sample is taken
            print("\nResource Usage Statistics: no samples collected")
        return duration, resource_stats

    return wrapper

# Row-at-a-time loader (slowest baseline), wrapped with performance monitoring
@monitor_performance
def load_row_by_row(df: pd.DataFrame, engine):
    """Load data with one INSERT statement per row, applying SCD Type 2 handling first.

    Args:
        df: Incoming records (see ``identify_changes`` for the required columns).
        engine: SQLAlchemy engine for the target database.

    Returns:
        float: elapsed seconds measured inside the function. Because of the
        ``monitor_performance`` decorator, callers receive the decorator's
        ``(duration, resource_stats)`` tuple instead.
    """
    start_time = time.time()

    df = identify_changes(df, engine)
    df = apply_scd2_changes_fixed(df, engine)

    # Build the statement once; it is identical for every row
    insert_statement = text("""
        INSERT INTO employees
        VALUES (:employee_id, :name, :email, :address, :phone,
               :date_of_birth, :gender, :company, :position,
               :salary, :retired, :valid_from, :valid_to, :is_current)
    """)

    # All rows are inserted inside a single transaction
    with engine.begin() as conn:
        for _, row in df.iterrows():
            conn.execute(insert_statement, row.to_dict())

    duration = time.time() - start_time
    return duration

# Single-call pandas loader, wrapped with performance monitoring
@monitor_performance
def load_bulk_pandas(df: pd.DataFrame, engine):
    """Append the SCD2-prepared frame with one ``DataFrame.to_sql`` call.

    Rows are sent as multi-row INSERTs in batches of 1000. Returns the elapsed
    seconds measured inside the function.
    """
    started_at = time.time()

    classified = identify_changes(df, engine)
    staged = apply_scd2_changes_fixed(classified, engine)

    staged.to_sql(
        'employees',
        engine,
        if_exists='append',
        index=False,
        method='multi',
        chunksize=1000,
    )

    return time.time() - started_at

# Chunk-by-chunk loader, wrapped with performance monitoring
@monitor_performance
def load_streaming_chunks(df: pd.DataFrame, engine, chunk_size=1000):
    """Append the SCD2-prepared frame in consecutive slices of ``chunk_size`` rows.

    Each slice is written with its own ``to_sql`` call using multi-row INSERTs.
    Returns the elapsed seconds measured inside the function.
    """
    started_at = time.time()

    classified = identify_changes(df, engine)
    staged = apply_scd2_changes_fixed(classified, engine)

    # Lazily cut the frame into positional slices, one per to_sql call
    pieces = (
        staged.iloc[begin:begin + chunk_size]
        for begin in range(0, len(staged), chunk_size)
    )
    for piece in pieces:
        piece.to_sql('employees', engine, if_exists='append', index=False, method='multi')

    elapsed = time.time() - started_at
    return elapsed

def parallel_worker(chunk_data):
    """Insert one DataFrame chunk into ``employees`` from a worker process.

    Each call creates its own engine, writes the chunk inside a single
    transaction and disposes the engine again.

    Args:
        chunk_data: DataFrame whose columns match the ``employees`` table.

    Returns:
        int: number of rows in ``chunk_data``.

    Raises:
        Exception: any error from connecting or inserting is printed and re-raised.
    """
    engine = None
    try:
        # NOTE(review): the connection string (including the password) is hard-coded
        # and independent of the engine given to load_parallel.
        engine = create_engine("postgresql://postgres:password@localhost:5432/employees")

        # engine.begin() commits on success and rolls back on error
        with engine.begin() as conn:
            # Use if_exists='append' to ensure we don't recreate the table
            chunk_data.to_sql('employees', conn, if_exists='append', index=False, method='multi')

        return len(chunk_data)  # Return the number of records processed
    except Exception as e:
        print(f"Worker error: {str(e)}")
        raise
    finally:
        # Release the pooled connections of this per-call engine
        if engine is not None:
            engine.dispose()

# Multi-process loader, wrapped with performance monitoring
@monitor_performance
def load_parallel(df: pd.DataFrame, engine, num_processes=4):
    """Load data by splitting it into chunks that worker processes insert concurrently.

    Args:
        df: Incoming records (see ``identify_changes`` for the required columns).
        engine: SQLAlchemy engine used for change detection and expiring old rows.
            The workers open their own connections in ``parallel_worker``.
        num_processes: Number of chunks and pool processes; must be at least 1.

    Returns:
        float: elapsed seconds measured inside the function (the decorator
        returns its own ``(duration, resource_stats)`` tuple to callers).

    Raises:
        ValueError: if ``num_processes`` is smaller than 1.
        Exception: any error from change detection or from a worker is printed
            and re-raised.
    """
    start_time = time.time()

    try:
        if num_processes < 1:
            raise ValueError("num_processes must be at least 1")

        df = identify_changes(df, engine)
        df = apply_scd2_changes_fixed(df, engine)  # Use fixed function

        # Split by row position. Sizes match np.array_split (the first
        # `remainder` chunks get one extra row), but slicing with iloc avoids
        # the pandas FutureWarning that np.array_split on a DataFrame emits.
        base_size, remainder = divmod(len(df), num_processes)
        chunks = []
        chunk_start = 0
        for chunk_index in range(num_processes):
            chunk_stop = chunk_start + base_size + (1 if chunk_index < remainder else 0)
            chunks.append(df.iloc[chunk_start:chunk_stop])
            chunk_start = chunk_stop
        print(f"Split data into {len(chunks)} chunks of approximately {len(df) // num_processes} records each")

        # Use Pool to process chunks in parallel with proper error handling
        with Pool(num_processes) as pool:
            try:
                # Use map_async with get() to catch worker exceptions
                results = pool.map_async(parallel_worker, chunks)
                processed_counts = results.get()  # This will raise any exceptions from workers
                total_processed = sum(processed_counts) if processed_counts else 0
                print(f"Successfully processed {total_processed} records in parallel")
            except Exception as e:
                print(f"Error in parallel processing: {str(e)}")
                raise

    except Exception as e:
        print(f"Error in load_parallel: {str(e)}")
        raise

    duration = time.time() - start_time
    return duration

# Dask-partitioned loader, wrapped with performance monitoring
@monitor_performance
def load_dask(df: pd.DataFrame, engine, npartitions=4):
    """Append the SCD2-prepared frame partition by partition via Dask.

    The frame is converted to a Dask DataFrame with ``npartitions`` partitions;
    each partition is computed back to pandas and written with ``to_sql`` in
    turn. Returns the elapsed seconds measured inside the function.
    """
    started_at = time.time()

    classified = identify_changes(df, engine)
    staged = apply_scd2_changes_fixed(classified, engine)

    partitioned = dd.from_pandas(staged, npartitions=npartitions)
    with ProgressBar():
        for part in partitioned.partitions:
            # Materialise one partition at a time, then append it
            materialized = part.compute()
            materialized.to_sql('employees', engine, if_exists='append', index=False)

    return time.time() - started_at

# COPY-based loader, wrapped with performance monitoring
@monitor_performance
def load_postgres_copy(df: pd.DataFrame, engine):
    """Load data with the PostgreSQL COPY command via a tab-separated temp file.

    After SCD Type 2 preparation the frame is normalised to text (timestamps,
    dates, booleans as ``t``/``f``, tabs/newlines/carriage returns in text
    columns replaced by spaces), written to a temporary file and streamed into
    ``employees`` with ``cursor.copy_from`` on a raw DBAPI connection.

    Args:
        df: Incoming records (see ``identify_changes`` for the required columns).
        engine: SQLAlchemy engine for the target database.

    Returns:
        float: elapsed seconds measured inside the function (the decorator
        returns its own ``(duration, resource_stats)`` tuple to callers).

    Raises:
        Exception: any failure is printed and re-raised; the COPY transaction is
            rolled back and the temporary file is removed in every case.
    """
    start_time = time.time()
    temp_file_path = None

    try:
        df = identify_changes(df, engine)
        # Use the fixed SCD2 function with reasonable timestamp
        df = apply_scd2_changes_fixed(df, engine)

        # Create a copy of the dataframe to avoid modifying the original
        copy_df = df.copy()

        # Convert timestamp columns to proper format. valid_from keeps its
        # microseconds so it matches what the other loaders store.
        copy_df['valid_from'] = pd.to_datetime(copy_df['valid_from']).dt.strftime('%Y-%m-%d %H:%M:%S.%f')
        copy_df['valid_to'] = pd.to_datetime(copy_df['valid_to']).dt.strftime('%Y-%m-%d %H:%M:%S')

        # Convert date_of_birth to date format
        copy_df['date_of_birth'] = pd.to_datetime(copy_df['date_of_birth']).dt.strftime('%Y-%m-%d')

        # Handle boolean values
        copy_df['is_current'] = copy_df['is_current'].map({True: 't', False: 'f'})

        # Replace characters that would break the one-row-per-line, tab-separated layout
        text_cols = ['name', 'email', 'address', 'phone', 'gender', 'company', 'position', 'retired']
        for col in text_cols:
            copy_df[col] = copy_df[col].astype(str).str.replace(r'[\t\r\n]', ' ', regex=True)

        # Ensure numeric types
        copy_df['employee_id'] = copy_df['employee_id'].astype(int)
        copy_df['salary'] = copy_df['salary'].astype(float)

        # Write to a temporary file with proper formatting
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', newline='', delete=False) as f:
            # Remember the path first so the file is cleaned up even if writing fails
            temp_file_path = f.name
            copy_df.to_csv(f, index=False, header=False, sep='\t',
                         na_rep='\\N', quoting=csv.QUOTE_NONE, escapechar='\\')

        # Debug: Print column counts and sample data
        print(f"DataFrame has {len(copy_df.columns)} columns: {', '.join(copy_df.columns)}")
        if len(copy_df) > 0:
            print(f"Sample row (first 3 values): {list(copy_df.iloc[0].values)[:3]}")

        # Get a raw connection directly - don't use it as a context manager
        raw_conn = engine.raw_connection()
        try:
            cursor = raw_conn.cursor()
            try:
                with open(temp_file_path, 'r', encoding='utf-8') as f:
                    print(f"Executing COPY with columns: {', '.join(copy_df.columns)}")
                    cursor.copy_from(
                        f,
                        'employees',
                        sep='\t',
                        columns=copy_df.columns.tolist(),
                        null='\\N'
                    )
                # Commit the transaction
                raw_conn.commit()
                print(f"COPY operation successful, loaded {len(copy_df)} records")
            except Exception as e:
                # Rollback on error
                raw_conn.rollback()
                print(f"COPY operation failed: {str(e)}")
                # Show up to three lines of the file for debugging; zip() stops
                # cleanly if the file holds fewer lines.
                with open(temp_file_path, 'r', encoding='utf-8') as debug_f:
                    first_lines = [line for _, line in zip(range(3), debug_f)]
                    print(f"First few lines of CSV:\n{''.join(first_lines)}")
                raise
            finally:
                cursor.close()
        finally:
            # Close the connection even if creating the cursor failed
            raw_conn.close()

    except Exception as e:
        print(f"Error in PostgreSQL COPY: {str(e)}")
        raise
    finally:
        if temp_file_path is not None and os.path.exists(temp_file_path):
            os.remove(temp_file_path)

    duration = time.time() - start_time
    return duration


def main():
    """Run the full loading benchmark and print the results.

    Sets up PostgreSQL and the ``employees`` table, then, for every
    ``(initial_size, update_size)`` pair, generates an initial and a subsequent
    dataset and runs each loading method against a freshly reset table: first the
    initial load, then the subsequent load. Row counts, durations and resource
    statistics are printed per method, followed by a table per data size and a
    comparative summary (best, worst and average total time) across all sizes.
    A method that raises is recorded with 'Failed' in every result field.
    """
    # 1. Setup Database
    print("\n=== 1. Setting up PostgreSQL Database ===")
    connection_string = setup_postgres_colab()
    engine = create_engine(connection_string)
    setup_database(engine)

    # Define test data sizes as (initial load rows, subsequent load rows)
    data_sizes = [
        (1000000, 100000),
        (100000, 10000),
        (10000, 1000),
        (1000, 100),
        (100, 10)
    ]

    # Define the loading methods as (callable, display name); they run in this order
    methods = [
        (load_postgres_copy, "PostgreSQL COPY"),
        (load_parallel, "Parallel Processing"),
        (load_row_by_row, "Row-by-row"),
        (load_bulk_pandas, "Bulk Pandas"),
        (load_streaming_chunks, "Streaming Chunks"),
        (load_dask, "Dask")
    ]

    # Maps "<initial_size>_<update_size>" to the list of per-method result dicts
    all_results = {}

    # Loop through different data sizes
    for initial_size, update_size in data_sizes:
        print(f"\n\n================================================")
        print(f"Testing with Initial Size: {initial_size}, Update Size: {update_size}")
        print(f"================================================")

        # Reset the table at the beginning of each data size test to ensure we start fresh
        print("\nResetting table for new data size test...")
        reset_table(engine)

        # 2. Create Initial Load File
        print(f"\n=== 2. Creating Initial Load File ({initial_size} records) ===")
        initial_df = generate_data(initial_size)
        print(f"Generated {len(initial_df)} records for initial load")

        # Print a sample record to validate data structure
        print("\nSample record:")
        if len(initial_df) > 0:
            sample = initial_df.iloc[0].to_dict()
            for k, v in sample.items():
                print(f"{k}: {v} ({type(v).__name__})")

        # 3. Create Subsequent Load File
        print(f"\n=== 3. Creating Subsequent Load File ({update_size} records) ===")
        update_df = generate_data(update_size)
        print(f"Generated {len(update_df)} records for subsequent load")

        results = []

        # Execute methods sequentially; the same initial_df / update_df objects are
        # passed to every method
        for idx, (method, name) in enumerate(methods, start=1):
            print(f"\n=== Method {idx}: {name} ===")

            # Reset table before each method (except the first one since we just reset it)
            if idx > 1:
                print(f"\nResetting table for method {name}...")
                reset_table(engine)

            try:
                # 4. Load Initial File
                # Each method is wrapped by monitor_performance, hence the (duration, stats) pair
                print(f"\nLoading initial file...")
                initial_duration, initial_stats = method(initial_df, engine)

                # Get count after initial load with error handling
                try:
                    with engine.connect() as conn:
                        initial_count = conn.execute(text("SELECT COUNT(*) FROM employees")).scalar()
                        print(f"Count after initial load: {initial_count} records")

                        # Print a sample from the database to verify data was inserted correctly
                        if initial_count > 0:
                            sample = conn.execute(text("SELECT * FROM employees LIMIT 1")).fetchone()
                            print(f"Sample DB record: {sample}")
                except Exception as e:
                    print(f"Error checking initial count: {str(e)}")
                    initial_count = 0

                # 5. Load Subsequent File
                print(f"\nLoading subsequent file...")
                update_duration, update_stats = method(update_df, engine)

                # Get final count with error handling
                try:
                    with engine.connect() as conn:
                        final_count = conn.execute(text("SELECT COUNT(*) FROM employees")).scalar()
                        print(f"Final count: {final_count} records")
                except Exception as e:
                    print(f"Error checking final count: {str(e)}")
                    final_count = initial_count  # Assume no change if we can't check

                # Durations and percentages are stored as formatted strings; counts stay numeric
                result = {
                    'Method': name,
                    'Initial Size': initial_size,
                    'Update Size': update_size,
                    'Initial Load Time': f"{initial_duration:.2f}s",
                    'Initial Records': initial_count,
                    'Initial Avg CPU': f"{initial_stats['avg_cpu']:.1f}%",
                    'Initial Max CPU': f"{initial_stats['max_cpu']:.1f}%",
                    'Initial Avg Memory': f"{initial_stats['avg_memory']:.1f}%",
                    'Initial Max Memory': f"{initial_stats['max_memory']:.1f}%",
                    'Update Load Time': f"{update_duration:.2f}s",
                    'Update Records': final_count - initial_count,
                    'Update Avg CPU': f"{update_stats['avg_cpu']:.1f}%",
                    'Update Max CPU': f"{update_stats['max_cpu']:.1f}%",
                    'Update Avg Memory': f"{update_stats['avg_memory']:.1f}%",
                    'Update Max Memory': f"{update_stats['max_memory']:.1f}%",
                    'Final Records': final_count,
                    'Total Time': f"{(initial_duration + update_duration):.2f}s"
                }

                results.append(result)

                print(f"\nMethod {idx} Results:")
                print(f"Initial Load: {initial_duration:.2f}s ({initial_count} records)")
                print(f"Initial Load Resource Usage:")
                print(f"  Avg CPU: {initial_stats['avg_cpu']:.1f}%, Max CPU: {initial_stats['max_cpu']:.1f}%")
                print(f"  Avg Memory: {initial_stats['avg_memory']:.1f}%, Max Memory: {initial_stats['max_memory']:.1f}%")
                print(f"\nUpdate Load: {update_duration:.2f}s ({final_count - initial_count} records)")
                print(f"Update Load Resource Usage:")
                print(f"  Avg CPU: {update_stats['avg_cpu']:.1f}%, Max CPU: {update_stats['max_cpu']:.1f}%")
                print(f"  Avg Memory: {update_stats['avg_memory']:.1f}%, Max Memory: {update_stats['max_memory']:.1f}%")

            except Exception as e:
                # A failing method does not abort the benchmark; it is recorded as 'Failed'
                print(f"Error in {name} method: {str(e)}")
                results.append({
                    'Method': name,
                    'Initial Size': initial_size,
                    'Update Size': update_size,
                    'Initial Load Time': 'Failed',
                    'Initial Records': 'Failed',
                    'Initial Avg CPU': 'Failed',
                    'Initial Max CPU': 'Failed',
                    'Initial Avg Memory': 'Failed',
                    'Initial Max Memory': 'Failed',
                    'Update Load Time': 'Failed',
                    'Update Records': 'Failed',
                    'Update Avg CPU': 'Failed',
                    'Update Max CPU': 'Failed',
                    'Update Avg Memory': 'Failed',
                    'Update Max Memory': 'Failed',
                    'Final Records': 'Failed',
                    'Total Time': 'Failed'
                })

        all_results[f"{initial_size}_{update_size}"] = results

        # Print results for current data size
        print(f"\nResults for Initial Size: {initial_size}, Update Size: {update_size}")
        print("=" * 140)
        headers = [
            'Method', 'Initial Load Time', 'Initial Records', 'Initial Avg CPU', 'Initial Max Memory',
            'Update Load Time', 'Update Records', 'Update Avg CPU', 'Update Max Memory',
            'Total Time'
        ]
        row_format = "{:<20} {:<20} {:<15} {:<15} {:<20} {:<20} {:<15} {:<15} {:<20} {:<15}"
        print(row_format.format(*headers))
        print("-" * 140)
        for result in results:
            print(row_format.format(
                result['Method'],
                result['Initial Load Time'],
                str(result['Initial Records']),
                result['Initial Avg CPU'],
                result['Initial Max Memory'],
                result['Update Load Time'],
                str(result['Update Records']),
                result['Update Avg CPU'],
                result['Update Max Memory'],
                result['Total Time']
            ))

    # Print comparative summary across all data sizes
    print("\n\nComparative Summary Across All Data Sizes")
    print("=" * 100)
    print("Data Size (Initial, Update) | Best Method | Worst Method | Average Load Time")
    print("-" * 100)

    for initial_size, update_size in data_sizes:
        results = all_results[f"{initial_size}_{update_size}"]
        valid_results = [r for r in results if r['Total Time'] != 'Failed']

        if valid_results:
            # Convert time strings to float (removing 's' suffix).
            # This rewrites 'Total Time' in the stored result dicts in place.
            for r in valid_results:
                if isinstance(r['Total Time'], str) and r['Total Time'].endswith('s'):
                    r['Total Time'] = float(r['Total Time'][:-1])

            best_method = min(valid_results, key=lambda x: x['Total Time'])
            worst_method = max(valid_results, key=lambda x: x['Total Time'])
            avg_time = sum(r['Total Time'] for r in valid_results) / len(valid_results)

            print(f"{initial_size:,}, {update_size:,} | {best_method['Method']} ({best_method['Total Time']:.2f}s) | "
                  f"{worst_method['Method']} ({worst_method['Total Time']:.2f}s) | {avg_time:.2f}s")
        else:
            print(f"{initial_size:,}, {update_size:,} | All methods failed")

# Entry point: run the whole benchmark when this code is executed as the main module
if __name__ == "__main__":
    main()



=== 1. Setting up PostgreSQL Database ===
Setting up PostgreSQL in Google Colab...
Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
postgresql is already the newest version (14+238).
postgresql-con

  return bound(*args, **kwds)


Split data into 4 chunks of approximately 250000 records each
Successfully processed 1000000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 4.72%
Max CPU Usage: 6.20%
Min CPU Usage: 1.50%
Average Memory Usage: 4.42%
Max Memory Usage: 8.50%
Min Memory Usage: 1.80%
Count after initial load: 1000000 records
Sample DB record: (0, 'Danilo Acevedo', 'cursor2052@protonmail.com', '591 Hillway Glen', '+19561599585', datetime.date(1973, 6, 2), 'Male', 'Poplar Bluff Corp', 'Meat Inspector', Decimal('36535.48'), 'No', datetime.datetime(2025, 2, 26, 23, 26, 45, 973772), datetime.datetime(2099, 12, 31, 23, 59, 59), True)

Loading subsequent file...
Split data into 4 chunks of approximately 25000 records each


  return bound(*args, **kwds)


Successfully processed 100000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 3.22%
Max CPU Usage: 6.40%
Min CPU Usage: 1.10%
Average Memory Usage: 2.07%
Max Memory Usage: 2.60%
Min Memory Usage: 1.80%
Final count: 1100000 records

Method 2 Results:
Initial Load: 80.57s (1000000 records)
Initial Load Resource Usage:
  Avg CPU: 4.7%, Max CPU: 6.2%
  Avg Memory: 4.4%, Max Memory: 8.5%

Update Load: 20.00s (100000 records)
Update Load Resource Usage:
  Avg CPU: 3.2%, Max CPU: 6.4%
  Avg Memory: 2.1%, Max Memory: 2.6%

=== Method 3: Row-by-row ===

Resetting table for method Row-by-row...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 1.54%
Max CPU Usage: 3.00%
Min CPU Usage: 1.10%
Average Memory Usage: 1.90%
Max Memory Usage: 1.90%
Min Memory Usage: 1.80%
Count after initial load: 1000000 records
Sample DB record: (0, 'Danilo Acevedo', 'cursor2052@protonmail.com', '591 Hillway Glen', '+19561599585', datet

  return bound(*args, **kwds)


Successfully processed 100000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 3.84%
Max CPU Usage: 4.40%
Min CPU Usage: 0.80%
Average Memory Usage: 2.14%
Max Memory Usage: 2.60%
Min Memory Usage: 1.80%
Count after initial load: 100000 records
Sample DB record: (25000, 'Grayce Mejia', 'waste1830@live.com', '1246 Ross Bayou', '+1-364-435-3140', datetime.date(1984, 6, 20), 'Female', 'Pinellas Park Corp', 'Sportswoman', Decimal('152849.36'), 'No', datetime.datetime(2025, 2, 26, 23, 47, 34, 167581), datetime.datetime(2099, 12, 31, 23, 59, 59), True)

Loading subsequent file...
Split data into 4 chunks of approximately 2500 records each


  return bound(*args, **kwds)


Successfully processed 10000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 3.20%
Max CPU Usage: 4.10%
Min CPU Usage: 2.70%
Average Memory Usage: 1.87%
Max Memory Usage: 2.00%
Min Memory Usage: 1.80%
Final count: 110000 records

Method 2 Results:
Initial Load: 8.40s (100000 records)
Initial Load Resource Usage:
  Avg CPU: 3.8%, Max CPU: 4.4%
  Avg Memory: 2.1%, Max Memory: 2.6%

Update Load: 2.05s (10000 records)
Update Load Resource Usage:
  Avg CPU: 3.2%, Max CPU: 4.1%
  Avg Memory: 1.9%, Max Memory: 2.0%

=== Method 3: Row-by-row ===

Resetting table for method Row-by-row...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 1.43%
Max CPU Usage: 3.10%
Min CPU Usage: 0.50%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 100000 records
Sample DB record: (0, 'Moses Joyce', 'eight1819@duck.com', '578 Edna Bayou', '+1-620-328-8585', datetime.date(1976, 1

  return bound(*args, **kwds)


Successfully processed 10000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.30%
Max CPU Usage: 0.30%
Min CPU Usage: 0.30%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 10000 records
Sample DB record: (0, 'Britta Hardin', 'glory1888@duck.com', '888 Mcnair Walk', '+16088813132', datetime.date(1999, 5, 17), 'Female', 'Barnstable Town Corp', 'Instrument Engineer', Decimal('49397.57'), 'No', datetime.datetime(2025, 2, 26, 23, 49, 48, 290797), datetime.datetime(2099, 12, 31, 23, 59, 59), True)

Loading subsequent file...
Split data into 4 chunks of approximately 250 records each


  return bound(*args, **kwds)


Successfully processed 1000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 3.70%
Max CPU Usage: 3.70%
Min CPU Usage: 3.70%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Final count: 11000 records

Method 2 Results:
Initial Load: 0.96s (10000 records)
Initial Load Resource Usage:
  Avg CPU: 0.3%, Max CPU: 0.3%
  Avg Memory: 1.8%, Max Memory: 1.8%

Update Load: 0.33s (1000 records)
Update Load Resource Usage:
  Avg CPU: 3.7%, Max CPU: 3.7%
  Avg Memory: 1.8%, Max Memory: 1.8%

=== Method 3: Row-by-row ===

Resetting table for method Row-by-row...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 1.84%
Max CPU Usage: 3.10%
Min CPU Usage: 0.90%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 10000 records
Sample DB record: (0, 'Britta Hardin', 'glory1888@duck.com', '888 Mcnair Walk', '+16088813132', datetime.date(1999, 5, 17)

  return bound(*args, **kwds)


Successfully processed 1000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.30%
Max CPU Usage: 0.30%
Min CPU Usage: 0.30%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 1000 records
Sample DB record: (250, 'Tamekia Beck', 'examines2014@yahoo.com', '1340 Coventry Walk', '+13211564750', datetime.date(1994, 9, 29), 'Male', 'Easley Corp', 'Cab Driver', Decimal('33088.90'), 'No', datetime.datetime(2025, 2, 26, 23, 50, 9, 688816), datetime.datetime(2099, 12, 31, 23, 59, 59), True)

Loading subsequent file...
Split data into 4 chunks of approximately 25 records each


  return bound(*args, **kwds)


Successfully processed 100 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.80%
Max CPU Usage: 0.80%
Min CPU Usage: 0.80%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Final count: 1100 records

Method 2 Results:
Initial Load: 0.26s (1000 records)
Initial Load Resource Usage:
  Avg CPU: 0.3%, Max CPU: 0.3%
  Avg Memory: 1.8%, Max Memory: 1.8%

Update Load: 0.21s (100 records)
Update Load Resource Usage:
  Avg CPU: 0.8%, Max CPU: 0.8%
  Avg Memory: 1.8%, Max Memory: 1.8%

=== Method 3: Row-by-row ===

Resetting table for method Row-by-row...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 0.70%
Max CPU Usage: 0.70%
Min CPU Usage: 0.70%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 1000 records
Sample DB record: (0, 'Amal Conrad', 'functionality1892@yahoo.com', '268 Cowles Trace', '+1-205-316-9428', datetime.date(1986, 

  return bound(*args, **kwds)


Successfully processed 100 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.30%
Max CPU Usage: 0.30%
Min CPU Usage: 0.30%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 100 records
Sample DB record: (75, 'Jerica Tran', 'driven1816@outlook.com', '346 Alpine Hills', '+1-419-602-8139', datetime.date(2003, 3, 20), 'Female', 'Huntsville Corp', 'Jockey', Decimal('169710.32'), 'Yes', datetime.datetime(2025, 2, 26, 23, 50, 21, 875361), datetime.datetime(2099, 12, 31, 23, 59, 59), True)

Loading subsequent file...
Split data into 4 chunks of approximately 2 records each


  return bound(*args, **kwds)


Successfully processed 10 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.70%
Max CPU Usage: 0.70%
Min CPU Usage: 0.70%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Final count: 110 records

Method 2 Results:
Initial Load: 0.20s (100 records)
Initial Load Resource Usage:
  Avg CPU: 0.3%, Max CPU: 0.3%
  Avg Memory: 1.8%, Max Memory: 1.8%

Update Load: 0.20s (10 records)
Update Load Resource Usage:
  Avg CPU: 0.7%, Max CPU: 0.7%
  Avg Memory: 1.8%, Max Memory: 1.8%

=== Method 3: Row-by-row ===

Resetting table for method Row-by-row...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 0.70%
Max CPU Usage: 0.70%
Min CPU Usage: 0.70%
Average Memory Usage: 1.80%
Max Memory Usage: 1.80%
Min Memory Usage: 1.80%
Count after initial load: 100 records
Sample DB record: (0, 'Allen Pate', 'state1852@example.com', '1031 Montecito Path', '+1-410-088-9889', datetime.date(1981, 5, 25), '