In [1]:
!apt-get update
!apt-get install -y mysql-server default-libmysqlclient-dev python3-dev

!pip install pandas numpy sqlalchemy psutil mimesis dask "dask[dataframe]" tqdm mysql-connector-python pymysql

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
default-libmysqlclient-dev is already the newest version (1.0.8).
mysql-server is already the newest version (8.0.41-0ubuntu0.22.04.1).
python3-dev i

In [2]:
import os
import csv
import time
import psutil
import threading
import pandas as pd
import numpy as np
from datetime import datetime, date
from typing import List, Dict
from sqlalchemy import create_engine, text
from multiprocessing import Pool
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from mimesis.locales import Locale
from mimesis.schema import Fieldset
import tempfile
import io
import subprocess
import mysql.connector
from mysql.connector import Error
import shutil


def setup_mysql_colab():
    """Setup MySQL in Google Colab - set root password, enable local_infile,
    grant FILE privilege, and set secure_file_priv to /content/mysql."""
    print("Setting up MySQL in Google Colab...")

    # Install MySQL if needed
    if not os.path.exists("/usr/bin/mysql"):
        print("Installing MySQL...")
        get_ipython().system("apt-get update")
        get_ipython().system("apt-get install -y mysql-server python3-dev default-libmysqlclient-dev")
        get_ipython().system("pip install PyMySQL")

    # Start MySQL
    get_ipython().system("service mysql start")
    time.sleep(2)

    # Set root password using sudo
    try:
        cmd = "sudo mysql -u root -e \"ALTER USER 'root'@'localhost' IDENTIFIED WITH mysql_native_password BY 'password';\""
        get_ipython().system(cmd)
        get_ipython().system("sudo mysql -u root -ppassword -e 'FLUSH PRIVILEGES;'")
        print("Root password set using sudo.")
    except Exception as e:
        print(f"Error setting root password with sudo: {e}")
        raise

    # Create Database and User with proper privileges including FILE privilege
    try:
        conn = mysql.connector.connect(user='root', password='password', host='localhost')
        cursor = conn.cursor()
        cursor.execute("CREATE DATABASE IF NOT EXISTS employees;")
        cursor.execute("CREATE USER IF NOT EXISTS 'mysql_user'@'localhost' IDENTIFIED BY 'password';")
        cursor.execute("GRANT ALL PRIVILEGES ON employees.* TO 'mysql_user'@'localhost';")
        # Grant FILE privilege to allow file-based operations
        cursor.execute("GRANT FILE ON *.* TO 'mysql_user'@'localhost';")
        cursor.execute("FLUSH PRIVILEGES;")
        conn.commit()
        print("Database and user 'mysql_user' created/configured successfully.")
    except mysql.connector.Error as err:
        print(f"Error creating database/user: {err}")
        raise
    finally:
        if 'cursor' in locals():
            cursor.close()
        if 'conn' in locals():
            conn.close()

    # Create /content/mysql directory for secure file operations
    get_ipython().system("mkdir -p /content/mysql")

    # Update MySQL configuration to set secure_file_priv to /content/mysql
    # Replace existing secure_file_priv line if present; otherwise, append it.
    get_ipython().system("sudo sed -i '/^secure_file_priv/s|=.*|= /content/mysql|' /etc/mysql/mysql.conf.d/mysqld.cnf")
    get_ipython().system("sudo bash -c 'grep -q \"^secure_file_priv\" /etc/mysql/mysql.conf.d/mysqld.cnf || echo \"secure_file_priv = /content/mysql\" >> /etc/mysql/mysql.conf.d/mysqld.cnf'")

    # Update bind-address and restart MySQL
    get_ipython().system("sudo sed -i 's/bind-address.*/bind-address = 0.0.0.0/' /etc/mysql/mysql.conf.d/mysqld.cnf")
    get_ipython().system("service mysql restart")
    time.sleep(5)

    # Enable LOCAL INFILE on the server
    get_ipython().system("sudo mysql -u root -ppassword -e \"SET GLOBAL local_infile = 1;\"")
    print("LOCAL INFILE enabled.")

    # Update connection string to enable local_infile on the client side
    connection_string = "mysql+pymysql://mysql_user:password@localhost:3306/employees?local_infile=1"
    engine = create_engine(connection_string)

    # Test connection to ensure everything is set up correctly
    max_attempts = 30
    attempt = 0
    while attempt < max_attempts:
        try:
            print(f"Attempting to connect to database... (Attempt {attempt + 1}/{max_attempts})")
            with engine.connect() as connection:
                connection.execute(text("SELECT 1"))
            print("Successfully connected to MySQL!")
            return connection_string
        except Exception as e:
            print(f"Connection attempt failed: {str(e)}")
            attempt += 1
            time.sleep(2)

    raise Exception("Failed to connect to MySQL after maximum attempts")




def setup_database(engine):
    """Create SCD Type 2 table schema"""
    with engine.connect() as conn:
        conn.execute(text("""
            CREATE TABLE IF NOT EXISTS employees (
                employee_id INT,
                name VARCHAR(100),
                email VARCHAR(100),
                address TEXT,
                phone VARCHAR(50),
                date_of_birth DATE,
                gender VARCHAR(10),
                company VARCHAR(100),
                position VARCHAR(100),
                salary DECIMAL(10,2),
                retired VARCHAR(3),
                valid_from TIMESTAMP,
                valid_to TIMESTAMP,
                is_current BOOLEAN,
                PRIMARY KEY (employee_id, valid_from)
            )
        """))
        conn.commit()


def generate_data(row_count: int) -> pd.DataFrame:
    """Generate synthetic data using Mimesis Fieldset"""
    fieldset = Fieldset(locale=Locale.EN)

    # Generate all fields at once using Fieldset
    employee_ids = list(range(row_count))
    names = fieldset("full_name", i=row_count)
    emails = fieldset("email", i=row_count)
    addresses = fieldset("address", i=row_count)
    phones = fieldset("telephone", i=row_count)
    dates = [str(date.isoformat()) for date in fieldset("date", start=1950, end=2005, i=row_count)]
    genders = np.random.choice(["Male", "Female"], size=row_count).tolist()
    cities = fieldset("city", i=row_count)
    positions = fieldset("occupation", i=row_count)
    salaries = np.round(np.random.uniform(30000, 200000, row_count), 2).tolist()
    retired = np.random.choice(["Yes", "No"], size=row_count).tolist()

    # Create records using list comprehension with zip
    records = [
        {
            "employee_id": emp_id,
            "name": name,
            "email": email,
            "address": address,
            "phone": phone,
            "date_of_birth": dob,
            "gender": gender,
            "company": f"{city} Corp",
            "position": position,
            "salary": salary,
            "retired": retired_status
        }
        for emp_id, name, email, address, phone, dob, gender, city, position, salary, retired_status
        in zip(employee_ids, names, emails, addresses, phones, dates, genders,
               cities, positions, salaries, retired)
    ]

    return pd.DataFrame(records)


def identify_changes(new_df: pd.DataFrame, engine) -> pd.DataFrame:
    """Compare new data with existing records and identify changes"""
    current_records = pd.read_sql(
        """
        SELECT * FROM employees
        WHERE is_current = true
        """,
        engine
    )

    if len(current_records) > 0:
        merged = new_df.merge(
            current_records,
            on='employee_id',
            how='left',
            suffixes=('_new', '_current')
        )

        changed_mask = (
            (merged['name_new'] != merged['name_current']) |
            (merged['email_new'] != merged['email_current']) |
            (merged['address_new'] != merged['address_current']) |
            (merged['phone_new'] != merged['phone_current']) |
            (merged['position_new'] != merged['position_current']) |
            (merged['salary_new'] != merged['salary_current'])
        )

        new_mask = merged['name_current'].isna()

        new_df['change_type'] = 'no_change'
        new_df.loc[new_mask, 'change_type'] = 'insert'
        new_df.loc[changed_mask & ~new_mask, 'change_type'] = 'update'
    else:
        new_df['change_type'] = 'insert'

    return new_df


def apply_scd2_changes(df: pd.DataFrame, engine) -> pd.DataFrame:
    """Apply SCD Type 2 changes to the data"""
    current_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Ensure datetime format for 'valid_from' and 'valid_to'
    df['valid_from'] = current_timestamp
    # Use a more compatible "far future" date
    df['valid_to'] = pd.to_datetime('2038-01-19 03:14:07').strftime('%Y-%m-%d %H:%M:%S')  # Changed date
    df['is_current'] = True

    # Ensure date fields are correctly formatted
    if 'date_of_birth' in df.columns:
        df['date_of_birth'] = pd.to_datetime(df['date_of_birth'], errors='coerce').dt.strftime('%Y-%m-%d')

    updates = df[df['change_type'] == 'update']
    if not updates.empty:
        with engine.begin() as conn:
            employee_ids = tuple(updates['employee_id'].tolist())
            if len(employee_ids) == 1:
                query = text("""
                    UPDATE employees
                    SET valid_to = :valid_to,
                        is_current = FALSE
                    WHERE employee_id = :employee_id
                    AND is_current = TRUE
                """)
                conn.execute(query, {"valid_to": current_timestamp, "employee_id": employee_ids[0]})
            else:
                query = text("""
                    UPDATE employees
                    SET valid_to = :valid_to,
                        is_current = FALSE
                    WHERE employee_id IN :employee_ids
                    AND is_current = TRUE
                """)
                conn.execute(query, {"valid_to": current_timestamp, "employee_ids": employee_ids})

    return df.drop(columns=['change_type'])



def reset_table(engine):
    """Drop and recreate the employees table"""
    try:
        with engine.connect() as conn:
            conn.execute(text("DROP TABLE IF EXISTS employees"))
            conn.commit()

        # Recreate the table schema
        setup_database(engine)

        print("Table dropped and recreated successfully")
    except Exception as e:
        print(f"Error resetting table: {str(e)}")
        raise


# Modify truncate_table to use reset_table instead
def truncate_table(engine):
    """Use reset_table instead of truncate for cleaner state"""
    reset_table(engine)

def monitor_resources(interval, stats):
    """Monitor CPU and memory usage"""
    while not stats['stop']:
        stats['cpu'].append(psutil.cpu_percent(interval=None))
        stats['memory'].append(psutil.virtual_memory().percent)
        time.sleep(interval)


def print_resource_stats(stats):
    """Print resource usage statistics"""
    print("\nResource Usage Statistics:")
    print(f"Average CPU Usage: {sum(stats['cpu']) / len(stats['cpu']):.2f}%")
    print(f"Max CPU Usage: {max(stats['cpu']):.2f}%")
    print(f"Min CPU Usage: {min(stats['cpu']):.2f}%")
    print(f"Average Memory Usage: {sum(stats['memory']) / len(stats['memory']):.2f}%")
    print(f"Max Memory Usage: {max(stats['memory']):.2f}%")
    print(f"Min Memory Usage: {min(stats['memory']):.2f}%")


def monitor_performance(func):
    """Decorator to monitor performance of loading methods"""
    def wrapper(*args, **kwargs):
        stats = {'cpu': [], 'memory': [], 'stop': False}

        # Start monitoring thread
        monitor_thread = threading.Thread(target=monitor_resources, args=(1, stats))
        monitor_thread.start()

        try:
            # Execute the loading function
            start_time = time.time()
            func(*args, **kwargs)
            duration = time.time() - start_time

            # Stop monitoring
            stats['stop'] = True
            monitor_thread.join()

            # Calculate resource statistics
            resource_stats = {
                'duration': duration,
                'avg_cpu': sum(stats['cpu']) / len(stats['cpu']) if stats['cpu'] else 0,
                'max_cpu': max(stats['cpu']) if stats['cpu'] else 0,
                'avg_memory': sum(stats['memory']) / len(stats['memory']) if stats['memory'] else 0,
                'max_memory': max(stats['memory']) if stats['memory'] else 0
            }

            print_resource_stats(stats)
            return duration, resource_stats

        except Exception as e:
            stats['stop'] = True
            monitor_thread.join()
            raise e

    return wrapper

@monitor_performance
def load_row_by_row(df: pd.DataFrame, engine):
    """Load data row by row with SCD Type 2"""
    start_time = time.time()

    df = identify_changes(df, engine)
    df = apply_scd2_changes(df, engine)

    with engine.begin() as conn:
        for _, row in df.iterrows():
            conn.execute(
                text("""
                    INSERT INTO employees
                    VALUES (:employee_id, :name, :email, :address, :phone,
                           :date_of_birth, :gender, :company, :position,
                           :salary, :retired, :valid_from, :valid_to, :is_current)
                """),
                row.to_dict()
            )

    duration = time.time() - start_time
    return duration


@monitor_performance
def load_bulk_pandas(df: pd.DataFrame, engine):
    """Load data using pandas bulk insert"""
    start_time = time.time()

    df = identify_changes(df, engine)
    df = apply_scd2_changes(df, engine)
    df.to_sql('employees', engine, if_exists='append', index=False, method='multi', chunksize=1000)

    duration = time.time() - start_time
    return duration


@monitor_performance
def load_streaming_chunks(df: pd.DataFrame, engine, chunk_size=1000):
    """Load data in chunks"""
    start_time = time.time()

    df = identify_changes(df, engine)
    df = apply_scd2_changes(df, engine)

    for chunk_start in range(0, len(df), chunk_size):
        chunk = df.iloc[chunk_start:chunk_start + chunk_size]
        chunk.to_sql('employees', engine, if_exists='append', index=False, method='multi')

    duration = time.time() - start_time
    return duration


def parallel_worker(chunk_data):
    """Worker function for parallel processing with proper connection and error handling"""
    try:
        # Create a new engine for each worker to avoid connection sharing issues
        engine = create_engine("mysql+pymysql://mysql_user:password@localhost:3306/employees")

        # Use with context to ensure proper resource management
        with engine.begin() as conn:
            # Use if_exists='append' to ensure we don't recreate the table
            chunk_data.to_sql('employees', conn, if_exists='append', index=False, method='multi')

        return len(chunk_data)  # Return the number of records processed
    except Exception as e:
        print(f"Worker error: {str(e)}")
        raise


@monitor_performance
def load_parallel(df: pd.DataFrame, engine, num_processes=4):
    """Load data using parallel processing with proper error handling"""
    start_time = time.time()

    try:
        df = identify_changes(df, engine)
        df = apply_scd2_changes(df, engine)  # Apply SCD2 changes

        # Split the dataframe into chunks
        chunks = np.array_split(df, num_processes)
        print(f"Split data into {len(chunks)} chunks of approximately {len(df) // num_processes} records each")

        # Use Pool to process chunks in parallel with proper error handling
        with Pool(num_processes) as pool:
            try:
                # Use map_async with get() to catch worker exceptions
                results = pool.map_async(parallel_worker, chunks)
                processed_counts = results.get()  # This will raise any exceptions from workers
                total_processed = sum(processed_counts) if processed_counts else 0
                print(f"Successfully processed {total_processed} records in parallel")
            except Exception as e:
                print(f"Error in parallel processing: {str(e)}")
                raise

    except Exception as e:
        print(f"Error in load_parallel: {str(e)}")
        raise

    duration = time.time() - start_time
    return duration


@monitor_performance
def load_dask(df: pd.DataFrame, engine, npartitions=4):
    """Load data using Dask"""
    start_time = time.time()

    df = identify_changes(df, engine)
    df = apply_scd2_changes(df, engine)  # Apply SCD2 changes

    ddf = dd.from_pandas(df, npartitions=npartitions)
    with ProgressBar():
        for partition in ddf.partitions:
            partition.compute().to_sql('employees', engine, if_exists='append', index=False)

    duration = time.time() - start_time
    return duration


@monitor_performance
def load_mysql_load_data_modified(df: pd.DataFrame, engine):
    """Load data using MySQL LOAD DATA INFILE command - modified for Colab environments."""
    start_time = time.time()

    try:
        df = identify_changes(df, engine)
        df = apply_scd2_changes(df, engine)

        # Create a copy of the dataframe to avoid modifying the original
        copy_df = df.copy()

        # Convert timestamp columns to proper format
        copy_df['valid_from'] = pd.to_datetime(copy_df['valid_from']).dt.strftime('%Y-%m-%d %H:%M:%S')
        copy_df['valid_to'] = pd.to_datetime(copy_df['valid_to']).dt.strftime('%Y-%m-%d %H:%M:%S')

        # Convert date_of_birth to date format
        copy_df['date_of_birth'] = pd.to_datetime(copy_df['date_of_birth']).dt.strftime('%Y-%m-%d')

        # Handle boolean values - MySQL uses 1/0 for booleans
        copy_df['is_current'] = copy_df['is_current'].map({True: 1, False: 0})

        # Escape special characters in text fields
        text_cols = ['name', 'email', 'address', 'phone', 'gender', 'company', 'position', 'retired']
        for col in text_cols:
            copy_df[col] = copy_df[col].astype(str).str.replace('\\', '\\\\')
            copy_df[col] = copy_df[col].str.replace(',', '\\,')
            copy_df[col] = copy_df[col].str.replace('\t', ' ')
            copy_df[col] = copy_df[col].str.replace('\n', ' ')
            copy_df[col] = copy_df[col].str.replace('"', '\\"')

        # Ensure numeric types
        copy_df['employee_id'] = copy_df['employee_id'].astype(int)
        copy_df['salary'] = copy_df['salary'].astype(float)

        # Create a temporary CSV file
        with tempfile.NamedTemporaryFile(mode='w', encoding='utf-8', delete=False, suffix='.csv') as f:
            csv_path = f.name
            copy_df.to_csv(
                f,
                index=False,
                sep=',',
                quoting=csv.QUOTE_ALL,  # Quote all fields
                quotechar='"',
                doublequote=True,
                lineterminator='\n'
            )

        # First, check if we can determine MySQL's secure-file-priv setting
        try:
            with engine.connect() as conn:
                result = conn.execute(text("SHOW VARIABLES LIKE 'secure_file_priv'"))
                row = result.fetchone()
                if row:
                    secure_file_priv = row[1]
                else:
                    secure_file_priv = None
                print(f"MySQL secure_file_priv: {secure_file_priv}")
        except Exception:
            secure_file_priv = None
            print("Could not determine secure_file_priv setting, will try local file load")

        secure_file_path = csv_path
        if secure_file_priv and secure_file_priv.lower() != 'null':
            if secure_file_priv:
                # Copy file to secure directory
                dest_path = os.path.join(secure_file_priv, os.path.basename(csv_path))
                shutil.copy2(csv_path, dest_path)
                secure_file_path = dest_path

                # Try to set permissions if needed
                try:
                    os.chmod(secure_file_path, 0o644)
                except Exception:
                    print("Warning: Could not change file permissions")

        try:
            # Create a database connection
            conn = mysql.connector.connect(
                host="localhost",
                user="mysql_user",
                password="password",
                database="employees"
            )

            cursor = conn.cursor()

            # Get the column names string for the LOAD DATA INFILE command
            columns = ", ".join([f"`{col}`" for col in copy_df.columns])

            # Different approach based on secure_file_priv
            if not secure_file_priv or secure_file_priv.lower() == 'null':
                # If secure_file_priv is NULL, fall back to direct INSERT
                print("LOAD DATA INFILE disabled, falling back to batch INSERT")
                raise mysql.connector.Error("LOAD DATA INFILE disabled") # Force the fallback

            else:
                # Try LOCAL first as it's more compatible across environments
                load_data_query = f"""
                LOAD DATA LOCAL INFILE '{secure_file_path}'
                INTO TABLE employees
                FIELDS TERMINATED BY ','
                ENCLOSED BY '"'
                LINES TERMINATED BY '\\n'
                IGNORE 1 ROWS
                ({columns});
                """

                try:
                    print("Trying LOAD DATA LOCAL INFILE...")
                    cursor.execute(load_data_query)
                    conn.commit()
                except mysql.connector.Error as e: # Catch MySQL errors specifically
                    print(f"LOCAL INFILE failed: {str(e)}")
                    print("Trying standard LOAD DATA INFILE...")

                    # Try without LOCAL
                    load_data_query = load_data_query.replace("LOCAL INFILE", "INFILE")
                    cursor.execute(load_data_query)
                    conn.commit()

                print(f"LOAD DATA INFILE operation successful, loaded {len(copy_df)} records")

        except mysql.connector.Error as e:  # Catch MySQL errors *here*
            print(f"MySQL Error (LOAD DATA or fallback): {str(e)}")
            # Fallback to batch INSERT
            print("Falling back to batch INSERT...")

            batch_size = 1000
            for i in range(0, len(copy_df), batch_size):
                batch = copy_df.iloc[i:i+batch_size]

                # Generate placeholders for parameterized query
                placeholders = ", ".join(["%s"] * len(copy_df.columns))
                cols = ", ".join([f"`{col}`" for col in copy_df.columns])

                # Convert dataframe to list of tuples for executemany
                rows = [tuple(x) for x in batch.to_numpy()]

                # Execute batch insert
                insert_query = f"INSERT INTO employees ({cols}) VALUES ({placeholders})"
                cursor.executemany(insert_query, rows)
                conn.commit()

            print(f"Batch INSERT operation successful, loaded {len(copy_df)} records")


        finally: # Use finally to ensure cleanup *always* happens
            if 'cursor' in locals():
                cursor.close()
            if 'conn' in locals():
                conn.close()

            # Remove temporary files
            try:
                os.remove(csv_path)
                if secure_file_path != csv_path and os.path.exists(secure_file_path):
                    os.remove(secure_file_path)
            except Exception:
                print("Warning: Could not remove temporary files")



    except Exception as e:
        print(f"Error in MySQL LOAD DATA: {str(e)}")
        raise

    duration = time.time() - start_time
    return duration

def main():
    # 1. Setup Database
    print("\n=== 1. Setting up MySQL Database ===")
    connection_string = setup_mysql_colab()
    engine = create_engine(connection_string)
    setup_database(engine)

    # Define test data sizes
    data_sizes = [
        (1000000, 100000),
        (100000, 10000),
        (10000, 1000),
        (1000, 100),
        (100, 10)
    ]

    # Define the loading methods
    methods = [
        (load_mysql_load_data_modified, "MySQL LOAD DATA"),  # Keep the modified version
        (load_parallel, "Parallel Processing"),
        (load_row_by_row, "Row-by-row"),
        (load_bulk_pandas, "Bulk Pandas"),
        (load_streaming_chunks, "Streaming Chunks"),
        (load_dask, "Dask")
    ]

    all_results = {}
    num_runs = 1 # Number of runs for each method

    # Loop through different data sizes
    for initial_size, update_size in data_sizes:
        print(f"\n\n================================================")
        print(f"Testing with Initial Size: {initial_size}, Update Size: {update_size}")
        print(f"================================================")

        # Reset the table at the beginning of each data size test
        print("\nResetting table for new data size test...")
        reset_table(engine)

        # 2. Create Initial Load File
        print(f"\n=== 2. Creating Initial Load File ({initial_size} records) ===")
        initial_df = generate_data(initial_size)
        print(f"Generated {len(initial_df)} records for initial load")

        # 3. Create Subsequent Load File
        print(f"\n=== 3. Creating Subsequent Load File ({update_size} records) ===")
        update_df = generate_data(update_size)
        print(f"Generated {len(update_df)} records for subsequent load")


        # Execute methods sequentially
        for idx, (method, name) in enumerate(methods, start=1):
            print(f"\n=== Method {idx}: {name} ===")
            method_results = []

            for run in range(num_runs):
                print(f"\nRun {run + 1} of {num_runs}")

                # Reset table before each method run (except first run of first method)
                if idx > 1 or run > 0 :
                    print(f"\nResetting table for method {name}, run {run+1}...")
                    reset_table(engine)

                try:
                    # 4. Load Initial File
                    print(f"\nLoading initial file...")
                    initial_duration, initial_stats = method(initial_df.copy(), engine) # Pass a copy

                    # Get count after initial load
                    with engine.connect() as conn:
                        initial_count = conn.execute(text("SELECT COUNT(*) FROM employees")).scalar()
                        print(f"Count after initial load: {initial_count} records")

                    # 5. Load Subsequent File
                    print(f"\nLoading subsequent file...")
                    update_duration, update_stats = method(update_df.copy(), engine) # Pass a copy

                    # Get final count
                    with engine.connect() as conn:
                        final_count = conn.execute(text("SELECT COUNT(*) FROM employees")).scalar()
                        print(f"Final count: {final_count} records")

                    result = {
                        'Method': name,
                        'Initial Size': initial_size,
                        'Update Size': update_size,
                        'Initial Load Time': initial_duration,
                        'Initial Records': initial_count,
                        'Initial Avg CPU': initial_stats['avg_cpu'],
                        'Initial Max CPU': initial_stats['max_cpu'],
                        'Initial Avg Memory': initial_stats['avg_memory'],
                        'Initial Max Memory': initial_stats['max_memory'],
                        'Update Load Time': update_duration,
                        'Update Records': final_count - initial_count,
                        'Update Avg CPU': update_stats['avg_cpu'],
                        'Update Max CPU': update_stats['max_cpu'],
                        'Update Avg Memory': update_stats['avg_memory'],
                        'Update Max Memory': update_stats['max_memory'],
                        'Final Records': final_count,
                        'Total Time': initial_duration + update_duration
                    }
                    method_results.append(result)

                except Exception as e:
                    print(f"Error in {name} method: {str(e)}")
                    method_results.append({
                        'Method': name,
                        'Initial Size': initial_size,
                        'Update Size': update_size,
                        'Initial Load Time': 'Failed',
                        'Initial Records': 'Failed',
                        'Initial Avg CPU': 'Failed',
                        'Initial Max CPU': 'Failed',
                        'Initial Avg Memory': 'Failed',
                        'Initial Max Memory': 'Failed',
                        'Update Load Time': 'Failed',
                        'Update Records': 'Failed',
                        'Update Avg CPU': 'Failed',
                        'Update Max CPU': 'Failed',
                        'Update Avg Memory': 'Failed',
                        'Update Max Memory': 'Failed',
                        'Final Records': 'Failed',
                        'Total Time': 'Failed'
                    })
            # Calculate average results for the method
            avg_result = {}
            successful_runs = [r for r in method_results if r['Total Time'] != 'Failed']
            if successful_runs:
                for key in successful_runs[0].keys():
                    if isinstance(successful_runs[0][key], (int, float)):
                        avg_result[key] = sum(r[key] for r in successful_runs) / len(successful_runs)
                    else:
                        avg_result[key] = successful_runs[0][key]  # Use first run value for non-numeric

                if f"{initial_size}_{update_size}" not in all_results:
                    all_results[f"{initial_size}_{update_size}"] = []
                all_results[f"{initial_size}_{update_size}"].append(avg_result)
                print(f"\nAverage Results for Method {idx} ({name}) after {num_runs} runs:")
                print(avg_result)

            else:
                print(f"All runs for method {name} failed.")
                if f"{initial_size}_{update_size}" not in all_results:
                     all_results[f"{initial_size}_{update_size}"] = []
                all_results[f"{initial_size}_{update_size}"].append({
                    'Method': name,
                    'Initial Size': initial_size,
                    'Update Size': update_size,
                    'Initial Load Time': 'Failed',
                    'Initial Records': 'Failed',
                    'Initial Avg CPU': 'Failed',
                    'Initial Max CPU': 'Failed',
                    'Initial Avg Memory': 'Failed',
                    'Initial Max Memory': 'Failed',
                    'Update Load Time': 'Failed',
                    'Update Records': 'Failed',
                    'Update Avg CPU': 'Failed',
                    'Update Max CPU': 'Failed',
                    'Update Avg Memory': 'Failed',
                    'Update Max Memory': 'Failed',
                    'Final Records': 'Failed',
                    'Total Time': 'Failed'
                })

        # Print results for current data size (after all methods and runs)
        print(f"\nResults for Initial Size: {initial_size}, Update Size: {update_size}")
        print("=" * 140)
        headers = [
            'Method', 'Initial Load Time', 'Initial Records', 'Initial Avg CPU', 'Initial Max Memory',
            'Update Load Time', 'Update Records', 'Update Avg CPU', 'Update Max Memory',
            'Total Time'
        ]
        row_format = "{:<20} {:<20} {:<15} {:<15} {:<20} {:<20} {:<15} {:<15} {:<20} {:<15}"
        print(row_format.format(*headers))
        print("-" * 140)
        for result in all_results[f"{initial_size}_{update_size}"]:
            print(row_format.format(
                result['Method'],
                f"{result['Initial Load Time']:.2f}s" if isinstance(result['Initial Load Time'], (int, float)) else result['Initial Load Time'],
                str(int(result['Initial Records'])) if isinstance(result['Initial Records'], (int, float)) else result['Initial Records'],
                f"{result['Initial Avg CPU']:.1f}%" if isinstance(result['Initial Avg CPU'], (int, float)) else result['Initial Avg CPU'],
                f"{result['Initial Max Memory']:.1f}%" if isinstance(result['Initial Max Memory'], (int, float)) else result['Initial Max Memory'],
                f"{result['Update Load Time']:.2f}s" if isinstance(result['Update Load Time'],(int, float)) else result['Update Load Time'],
                str(int(result['Update Records'])) if isinstance(result['Update Records'], (int, float)) else result['Update Records'],
                f"{result['Update Avg CPU']:.1f}%" if isinstance(result['Update Avg CPU'], (int, float)) else result['Update Avg CPU'],
                f"{result['Update Max Memory']:.1f}%" if isinstance(result['Update Max Memory'], (int, float)) else result['Update Max Memory'],
                f"{result['Total Time']:.2f}s" if isinstance(result['Total Time'], (int, float)) else result['Total Time']
            ))


    # Print comparative summary across all data sizes
    print("\n\nComparative Summary Across All Data Sizes")
    print("=" * 100)
    print("Data Size (Initial, Update) | Best Method | Worst Method | Average Load Time")
    print("-" * 100)

if __name__ == "__main__":
    main()



=== 1. Setting up MySQL Database ===
Setting up MySQL in Google Colab...
 * Starting MySQL database server mysqld
   ...done.
Root password set using sudo.
Database and user 'mysql_user' created/configured successfully.
 * Stopping MySQL database server mysqld
   ...done.
 * Starting MySQL database server mysqld
   ...done.
LOCAL INFILE enabled.
Attempting to connect to database... (Attempt 1/30)
Successfully connected to MySQL!


Testing with Initial Size: 1000000, Update Size: 100000

Resetting table for new data size test...
Table dropped and recreated successfully

=== 2. Creating Initial Load File (1000000 records) ===
Generated 1000000 records for initial load

=== 3. Creating Subsequent Load File (100000 records) ===
Generated 100000 records for subsequent load

=== Method 1: MySQL LOAD DATA ===

Run 1 of 1

Loading initial file...
MySQL secure_file_priv: /content/mysql/
Trying LOAD DATA LOCAL INFILE...
LOCAL INFILE failed: LOAD DATA LOCAL INFILE file request rejected due to re

  return bound(*args, **kwds)


Split data into 4 chunks of approximately 250000 records each
Successfully processed 1000000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 4.13%
Max CPU Usage: 5.90%
Min CPU Usage: 0.80%
Average Memory Usage: 4.54%
Max Memory Usage: 7.00%
Min Memory Usage: 2.00%
Count after initial load: 1000000 records

Loading subsequent file...
Split data into 4 chunks of approximately 25000 records each


  return bound(*args, **kwds)


Successfully processed 100000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 2.59%
Max CPU Usage: 5.40%
Min CPU Usage: 1.00%
Average Memory Usage: 2.12%
Max Memory Usage: 2.50%
Min Memory Usage: 2.00%
Final count: 1100000 records

Average Results for Method 2 (Parallel Processing) after 1 runs:
{'Method': 'Parallel Processing', 'Initial Size': 1000000.0, 'Update Size': 100000.0, 'Initial Load Time': 87.78988122940063, 'Initial Records': 1000000.0, 'Initial Avg CPU': 4.132183908045975, 'Initial Max CPU': 5.9, 'Initial Avg Memory': 4.539080459770116, 'Initial Max Memory': 7.0, 'Update Load Time': 33.65342926979065, 'Update Records': 100000.0, 'Update Avg CPU': 2.5939393939393938, 'Update Max CPU': 5.4, 'Update Avg Memory': 2.1151515151515157, 'Update Max Memory': 2.5, 'Final Records': 1100000.0, 'Total Time': 121.44331049919128}

=== Method 3: Row-by-row ===

Run 1 of 1

Resetting table for method Row-by-row, run 1...
Table dropped and recreated successfully

Loading 

  return bound(*args, **kwds)


Successfully processed 100000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 3.87%
Max CPU Usage: 5.20%
Min CPU Usage: 1.00%
Average Memory Usage: 2.37%
Max Memory Usage: 2.60%
Min Memory Usage: 2.00%
Count after initial load: 100000 records

Loading subsequent file...
Split data into 4 chunks of approximately 2500 records each


  return bound(*args, **kwds)


Successfully processed 10000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 1.97%
Max CPU Usage: 2.50%
Min CPU Usage: 1.30%
Average Memory Usage: 2.05%
Max Memory Usage: 2.20%
Min Memory Usage: 2.00%
Final count: 110000 records

Average Results for Method 2 (Parallel Processing) after 1 runs:
{'Method': 'Parallel Processing', 'Initial Size': 100000.0, 'Update Size': 10000.0, 'Initial Load Time': 8.444286584854126, 'Initial Records': 100000.0, 'Initial Avg CPU': 3.866666666666667, 'Initial Max CPU': 5.2, 'Initial Avg Memory': 2.366666666666667, 'Initial Max Memory': 2.6, 'Update Load Time': 3.5332751274108887, 'Update Records': 10000.0, 'Update Avg CPU': 1.9749999999999999, 'Update Max CPU': 2.5, 'Update Avg Memory': 2.05, 'Update Max Memory': 2.2, 'Final Records': 110000.0, 'Total Time': 11.977561712265015}

=== Method 3: Row-by-row ===

Run 1 of 1

Resetting table for method Row-by-row, run 1...
Table dropped and recreated successfully

Loading initial file...

Res

  return bound(*args, **kwds)


Successfully processed 10000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 1.90%
Max CPU Usage: 3.30%
Min CPU Usage: 0.50%
Average Memory Usage: 2.05%
Max Memory Usage: 2.10%
Min Memory Usage: 2.00%
Count after initial load: 10000 records

Loading subsequent file...
Split data into 4 chunks of approximately 250 records each


  return bound(*args, **kwds)


Successfully processed 1000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.60%
Max CPU Usage: 0.60%
Min CPU Usage: 0.60%
Average Memory Usage: 2.00%
Max Memory Usage: 2.00%
Min Memory Usage: 2.00%
Final count: 11000 records

Average Results for Method 2 (Parallel Processing) after 1 runs:
{'Method': 'Parallel Processing', 'Initial Size': 10000.0, 'Update Size': 1000.0, 'Initial Load Time': 1.3618381023406982, 'Initial Records': 10000.0, 'Initial Avg CPU': 1.9, 'Initial Max CPU': 3.3, 'Initial Avg Memory': 2.05, 'Initial Max Memory': 2.1, 'Update Load Time': 0.4846186637878418, 'Update Records': 1000.0, 'Update Avg CPU': 0.6, 'Update Max CPU': 0.6, 'Update Avg Memory': 2.0, 'Update Max Memory': 2.0, 'Final Records': 11000.0, 'Total Time': 1.84645676612854}

=== Method 3: Row-by-row ===

Run 1 of 1

Resetting table for method Row-by-row, run 1...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 1.70%
Ma

  return bound(*args, **kwds)


Successfully processed 1000 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.30%
Max CPU Usage: 0.30%
Min CPU Usage: 0.30%
Average Memory Usage: 2.00%
Max Memory Usage: 2.00%
Min Memory Usage: 2.00%
Count after initial load: 1000 records

Loading subsequent file...
Split data into 4 chunks of approximately 25 records each


  return bound(*args, **kwds)


Successfully processed 100 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.90%
Max CPU Usage: 0.90%
Min CPU Usage: 0.90%
Average Memory Usage: 2.00%
Max Memory Usage: 2.00%
Min Memory Usage: 2.00%
Final count: 1100 records

Average Results for Method 2 (Parallel Processing) after 1 runs:
{'Method': 'Parallel Processing', 'Initial Size': 1000.0, 'Update Size': 100.0, 'Initial Load Time': 0.3223602771759033, 'Initial Records': 1000.0, 'Initial Avg CPU': 0.3, 'Initial Max CPU': 0.3, 'Initial Avg Memory': 2.0, 'Initial Max Memory': 2.0, 'Update Load Time': 0.2216651439666748, 'Update Records': 100.0, 'Update Avg CPU': 0.9, 'Update Max CPU': 0.9, 'Update Avg Memory': 2.0, 'Update Max Memory': 2.0, 'Final Records': 1100.0, 'Total Time': 0.5440254211425781}

=== Method 3: Row-by-row ===

Run 1 of 1

Resetting table for method Row-by-row, run 1...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 0.50%
Max CPU 

  return bound(*args, **kwds)


Successfully processed 100 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.30%
Max CPU Usage: 0.30%
Min CPU Usage: 0.30%
Average Memory Usage: 2.00%
Max Memory Usage: 2.00%
Min Memory Usage: 2.00%
Count after initial load: 100 records

Loading subsequent file...
Split data into 4 chunks of approximately 2 records each


  return bound(*args, **kwds)


Successfully processed 10 records in parallel

Resource Usage Statistics:
Average CPU Usage: 0.50%
Max CPU Usage: 0.50%
Min CPU Usage: 0.50%
Average Memory Usage: 2.00%
Max Memory Usage: 2.00%
Min Memory Usage: 2.00%
Final count: 110 records

Average Results for Method 2 (Parallel Processing) after 1 runs:
{'Method': 'Parallel Processing', 'Initial Size': 100.0, 'Update Size': 10.0, 'Initial Load Time': 0.17844247817993164, 'Initial Records': 100.0, 'Initial Avg CPU': 0.3, 'Initial Max CPU': 0.3, 'Initial Avg Memory': 2.0, 'Initial Max Memory': 2.0, 'Update Load Time': 0.204209566116333, 'Update Records': 10.0, 'Update Avg CPU': 0.5, 'Update Max CPU': 0.5, 'Update Avg Memory': 2.0, 'Update Max Memory': 2.0, 'Final Records': 110.0, 'Total Time': 0.38265204429626465}

=== Method 3: Row-by-row ===

Run 1 of 1

Resetting table for method Row-by-row, run 1...
Table dropped and recreated successfully

Loading initial file...

Resource Usage Statistics:
Average CPU Usage: 0.50%
Max CPU Usage: