In [1]:
!pip install psutil mimesis pandas dask dask[dataframe]



In [2]:
import csv
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import threading
import psutil
import time
from multiprocessing import Pool
from mimesis import Person, Address, Datetime
from mimesis.enums import Gender
import random
import os
import glob

def monitor_resources(interval, stats):
    """Sample system CPU and memory usage until asked to stop.

    Meant to run in a background thread. Each sample appends
    ``psutil.cpu_percent(interval=None)`` to ``stats['cpu']`` and
    ``psutil.virtual_memory().percent`` to ``stats['memory']``.

    Args:
        interval: Seconds to wait between two consecutive samples.
        stats: Shared dict holding the ``'cpu'`` and ``'memory'`` sample lists
            and a boolean ``'stop'`` flag. Setting the flag to a truthy value
            ends the loop.
    """
    # Upper bound (seconds) on how long a stop request can go unnoticed.
    max_nap = 0.1

    while not stats['stop']:
        # NOTE(review): psutil documents that the first cpu_percent() call
        # with interval=None returns a meaningless 0.0 - confirm whether the
        # first sample should be discarded.
        stats['cpu'].append(psutil.cpu_percent(interval=None))
        stats['memory'].append(psutil.virtual_memory().percent)

        # Wait out the interval in short naps rather than one long sleep, so
        # the caller's join() does not block for a whole interval after the
        # stop flag has been set.
        deadline = time.monotonic() + interval
        while not stats['stop']:
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            time.sleep(min(remaining, max_nap))

def print_resource_stats(stats):
    """Print average, maximum and minimum CPU and memory usage.

    Args:
        stats: Dict with ``'cpu'`` and ``'memory'`` entries, each a list of
            percentage samples.

    A metric with no samples is reported as such instead of raising
    ``ZeroDivisionError`` (average) or ``ValueError`` (max/min).
    """
    for key, label in (('cpu', 'CPU'), ('memory', 'Memory')):
        samples = stats[key]
        if not samples:
            print(f"No {label} usage samples were collected.")
            continue
        print(f"Average {label} Usage: {sum(samples) / len(samples):.2f}%")
        print(f"Max {label} Usage: {max(samples):.2f}%")
        print(f"Min {label} Usage: {min(samples):.2f}%")

def generate_data(row_count):
    """Build a list of synthetic employee records using mimesis providers.

    Args:
        row_count: Number of records to generate.

    Returns:
        A list of ``row_count`` dicts with the keys "Name", "Email",
        "Address", "Phone", "Date of Birth", "Gender", "Company", "Position",
        "Salary" and "Retired".
    """
    person = Person('en')
    address = Address('en')
    # Not called ``datetime`` so the local does not shadow the well-known
    # standard-library module name.
    date_provider = Datetime('en')

    rows = []
    for _ in range(row_count):
        # Draw the gender once per row so the generated name and the "Gender"
        # column describe the same person; they used to be chosen
        # independently and could contradict each other.
        gender = random.choice([Gender.MALE, Gender.FEMALE])
        rows.append({
            "Name": person.full_name(gender=gender),
            "Email": person.email(),
            "Address": address.address(),
            "Phone": person.telephone(),
            "Date of Birth": date_provider.date().isoformat(),
            "Gender": "Male" if gender is Gender.MALE else "Female",
            "Company": address.city() + " Corp",
            "Position": person.occupation(),
            "Salary": round(random.uniform(30000, 200000), 2),
            "Retired": random.choice(["Yes", "No"]),
        })
    return rows

def remove_test_files(directory, patterns):
    """Delete every file in ``directory`` that matches one of ``patterns``.

    Args:
        directory: Directory to clean up.
        patterns: Iterable of glob patterns, interpreted relative to
            ``directory``.

    Files are located by globbing inside ``directory`` rather than by calling
    ``os.chdir``, so the working directory of the process is left untouched.
    A file that cannot be removed is reported and skipped. A ``directory``
    that does not exist simply yields no matches.
    """
    # Escape the directory part so glob metacharacters in it are literal.
    search_root = glob.escape(directory)
    for pattern in patterns:
        for path in glob.glob(os.path.join(search_root, pattern)):
            # Report the name relative to ``directory``, as it was printed
            # when the function used to chdir into it.
            shown = os.path.relpath(path, directory)
            try:
                os.remove(path)
                print(f"Removed file: {shown}")
            except OSError as e:
                print(f"Error: {shown} : {e.strerror}")

def row_by_row_approach(num_rows):
    """Write ``num_rows`` generated records to 'row_by_row.csv' one at a time.

    All records are produced by a single ``generate_data(num_rows)`` call and
    then handed to the CSV writer individually.

    Args:
        num_rows: Number of data rows to write after the header.
    """
    print("Starting Row-by-row Approach")
    with open('row_by_row.csv', 'w', newline='') as csv_file:
        # One throw-away record supplies the column names.
        columns = generate_data(1)[0].keys()
        row_writer = csv.DictWriter(csv_file, fieldnames=columns)
        row_writer.writeheader()

        records = generate_data(num_rows)
        for record in records:
            row_writer.writerow(record)

def dataframe_approach(num_rows):
    """Write ``num_rows`` generated records to 'dataframe.csv' via pandas.

    Args:
        num_rows: Number of records to generate and write.
    """
    print("Starting DataFrame Approach")
    records = generate_data(num_rows)
    frame = pd.DataFrame(records)
    # index=False keeps the pandas row index out of the file.
    frame.to_csv('dataframe.csv', index=False)

def streaming_chunks_approach(num_rows, chunk_size):
    """Write ``num_rows`` records to 'streaming_chunks.csv' in chunks.

    Only one chunk of generated records is built per loop iteration.

    Args:
        num_rows: Total number of data rows to write.
        chunk_size: Maximum number of rows generated and written per chunk.
    """
    print("Starting Streaming Chunks Approach")
    with open('streaming_chunks.csv', 'w', newline='') as out_file:
        # One throw-away record supplies the column names.
        columns = generate_data(1)[0].keys()
        chunk_writer = csv.DictWriter(out_file, fieldnames=columns)
        chunk_writer.writeheader()

        # Ceiling division: a trailing partial chunk still needs a pass.
        total_chunks = (num_rows + chunk_size - 1) // chunk_size
        rows_left = num_rows
        for _ in range(total_chunks):
            batch_size = min(chunk_size, rows_left)
            chunk_writer.writerows(generate_data(batch_size))
            rows_left -= chunk_size

def worker(data_chunk, index):
    """Write one chunk of records to 'parallel_output_<index>.csv'.

    Args:
        data_chunk: List of dicts sharing the same keys. The keys of the
            first record become the CSV header.
        index: Number used to build the output file name.

    An empty chunk produces no file at all. Previously it raised
    ``IndexError`` on ``data_chunk[0]``.
    """
    if not data_chunk:
        # Nothing to write, and no record to derive a header from.
        return

    file_name = f'parallel_output_{index}.csv'
    with open(file_name, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=data_chunk[0].keys())
        writer.writeheader()
        writer.writerows(data_chunk)

def parallel_processing_approach(num_rows, num_processes):
    """Write ``num_rows`` records using a pool of worker processes.

    The rows are split as evenly as possible across ``num_processes`` chunks.
    Every chunk is generated in the calling process and then written to its
    own 'parallel_output_<i>.csv' by ``worker``. The part files are finally
    merged into 'final_parallel_output.csv'.

    Args:
        num_rows: Total number of records to generate.
        num_processes: Size of the process pool and upper bound on the number
            of chunks.
    """
    print("Starting Parallel Processing Approach with", num_processes, "processes")
    pool = Pool(num_processes)
    try:
        base_size, remainder = divmod(num_rows, num_processes)
        # The first ``remainder`` chunks take one extra row each.
        chunk_sizes = [base_size + (1 if i < remainder else 0)
                       for i in range(num_processes)]
        # With fewer rows than processes some chunks would be empty; skip
        # them, since there is nothing to write for them.
        chunks = [generate_data(size) for size in chunk_sizes if size > 0]
        pool.starmap(worker, [(chunk, i) for i, chunk in enumerate(chunks)])
    finally:
        # Shut the pool down even when generation or a worker fails, so no
        # child processes are left behind.
        pool.close()
        pool.join()
    consolidate_files('parallel_output_*.csv', 'final_parallel_output.csv')

def dask_approach(num_rows, npartitions, output_filename):
    """Write ``num_rows`` records through a partitioned Dask DataFrame.

    The records are generated in memory, wrapped in a Dask DataFrame with
    ``npartitions`` partitions, written to temporary files matching
    'temp_dask_output_*.csv', and then merged into ``output_filename``.

    Args:
        num_rows: Number of records to generate.
        npartitions: Number of Dask partitions to split the data into.
        output_filename: Path of the consolidated CSV file.
    """
    print("Starting Dask Approach with", npartitions, "partitions")
    frame = pd.DataFrame(generate_data(num_rows))
    ddf = dd.from_pandas(frame, npartitions=npartitions)
    temp_output = 'temp_dask_output_*.csv'
    # The progress bar has to wrap the Dask write itself. It used to wrap
    # consolidate_files(), which is plain file copying and runs no Dask work.
    with ProgressBar():
        ddf.to_csv(temp_output, index=False)
    consolidate_files(temp_output, output_filename)

def consolidate_files(file_pattern, output_file):
    """Merge all CSV files matching ``file_pattern`` into ``output_file``.

    The header row is taken from the first non-empty source file and written
    once; the header of every other file is dropped. Each source file is
    deleted after it has been copied. The elapsed time is printed at the end.

    Args:
        file_pattern: Glob pattern selecting the part files to merge.
        output_file: Path of the combined CSV file (overwritten if present).
    """
    start_time = time.time()
    output_abs = os.path.abspath(output_file)
    # Sort for a reproducible row order (glob order is arbitrary), and never
    # treat the output file as one of its own inputs.
    files = sorted(
        path for path in glob.glob(file_pattern)
        if os.path.abspath(path) != output_abs
    )
    with open(output_file, 'w', newline='') as final_file:
        final_writer = csv.writer(final_file)
        header_written = False
        for file_path in files:
            # newline='' lets the csv module handle line endings itself, as
            # its documentation recommends for reader input.
            with open(file_path, 'r', newline='') as source_file:
                reader = csv.reader(source_file)
                # A default avoids StopIteration on an empty part file.
                header = next(reader, None)
                if header is not None:
                    if not header_written:
                        final_writer.writerow(header)
                        header_written = True
                    final_writer.writerows(reader)
            os.remove(file_path)
    end_time = time.time()
    print(f"Consolidation completed in {end_time - start_time:.2f} seconds.")

def process_approach(approach_func, num_rows, *args):
    """Run one benchmark approach while sampling CPU and memory usage.

    A background thread running ``monitor_resources`` (10-second sampling
    interval) collects usage figures while ``approach_func`` executes. The
    elapsed time and the collected statistics are printed afterwards.

    Args:
        approach_func: Callable invoked as ``approach_func(num_rows, *args)``.
        num_rows: Row count forwarded to ``approach_func``.
        *args: Extra positional arguments forwarded to ``approach_func``.

    Raises:
        Whatever ``approach_func`` raises. The monitor thread is stopped
        first, so a failing approach no longer leaves it running forever.
    """
    stats = {'cpu': [], 'memory': [], 'stop': False}
    monitor_thread = threading.Thread(target=monitor_resources, args=(10, stats))
    monitor_thread.start()

    # perf_counter is a monotonic clock, so the measured duration cannot be
    # skewed by system clock adjustments.
    start_time = time.perf_counter()
    try:
        approach_func(num_rows, *args)
        duration = time.perf_counter() - start_time
    finally:
        # Always signal and reap the monitor. It is a non-daemon thread and
        # would otherwise keep the interpreter alive after an exception.
        stats['stop'] = True
        monitor_thread.join()

    print(f"{approach_func.__name__} completed in {duration:.2f} seconds.")
    print_resource_stats(stats)

def main():
    """Benchmark every CSV-writing approach over a range of workload sizes.

    For each (row count, chunk size) pair, leftover output files are removed
    and then the five approaches are run in a fixed order through
    ``process_approach``.
    """
    working_dir = './'
    stale_patterns = [
        'row_by_row.csv',
        'dataframe.csv',
        'streaming_chunks.csv',
        'parallel_output_*.csv',
        'final_parallel_output.csv',
        'temp_dask_output_*.csv',
        'final_dask_output.csv',
    ]

    # (num_rows, chunk_size) pairs, largest workload first.
    workloads = [
        (10000000, 100000),
        (1000000, 10000),
        (100000, 5000),
        (10000, 1000),
        (1000, 100),
        (100, 10),
    ]

    # Logical core count; used as both pool size and Dask partition count.
    logical_cores = psutil.cpu_count(logical=True)

    for total_rows, rows_per_chunk in workloads:
        # Start every workload from a clean directory.
        remove_test_files(working_dir, stale_patterns)

        print(f"Processing {total_rows} rows with chunk size {rows_per_chunk}:")

        # Each entry: the approach plus the arguments it takes after the
        # row count.
        plan = [
            (row_by_row_approach, ()),
            (dataframe_approach, ()),
            (streaming_chunks_approach, (rows_per_chunk,)),
            (parallel_processing_approach, (logical_cores,)),
            (dask_approach, (logical_cores, 'final_dask_output.csv')),
        ]
        for approach, extra_args in plan:
            process_approach(approach, total_rows, *extra_args)

        # Blank separation between the output of consecutive workloads.
        print("\n")

# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Processing 10000000 rows with chunk size 100000:
Starting Row-by-row Approach
row_by_row_approach completed in 284.78 seconds.
Average CPU Usage: 1.56%
Max CPU Usage: 1.80%
Min CPU Usage: 1.20%
Average Memory Usage: 2.57%
Max Memory Usage: 3.40%
Min Memory Usage: 1.40%
Starting DataFrame Approach
dataframe_approach completed in 298.20 seconds.
Average CPU Usage: 1.51%
Max CPU Usage: 1.70%
Min CPU Usage: 1.00%
Average Memory Usage: 2.70%
Max Memory Usage: 3.90%
Min Memory Usage: 1.60%
Starting Streaming Chunks Approach
streaming_chunks_approach completed in 280.21 seconds.
Average CPU Usage: 1.57%
Max CPU Usage: 1.80%
Min CPU Usage: 1.20%
Average Memory Usage: 1.70%
Max Memory Usage: 1.70%
Min Memory Usage: 1.70%
Starting Parallel Processing Approach with 96 processes
Consolidation completed in 50.32 seconds.
parallel_processing_approach completed in 325.37 seconds.
Average CPU Usage: 2.24%
Max CPU Usage: 5.10%
Min CPU Usage: 1.30%
Average Memory Usage: 3.29%
Max Memory Usage: 6.00%
Min