In [1]:
!pip install psutil mimesis pandas dask dask[dataframe]



In [2]:
import psutil
import platform
from datetime import datetime

def get_system_info():
    """Print a human-readable summary of the host machine.

    Sections printed, in order: CPU (core counts, frequency, usage),
    memory, every disk partition reported by psutil, and operating-system
    details from the ``platform`` module. Returns None; all output goes to
    stdout.
    """
    print("Gathering system information...")

    # CPU information
    print("CPU:")
    print(f"Physical cores: {psutil.cpu_count(logical=False)}")
    print(f"Total cores: {psutil.cpu_count(logical=True)}")
    # psutil.cpu_freq() returns None when the frequency cannot be determined
    # (common on some VMs/containers), so guard before reading its fields.
    cpu_freq = psutil.cpu_freq()
    if cpu_freq is None:
        print("Frequency: unavailable")
    else:
        print(f"Max Frequency: {cpu_freq.max:.2f}Mhz")
        print(f"Min Frequency: {cpu_freq.min:.2f}Mhz")
        print(f"Current Frequency: {cpu_freq.current:.2f}Mhz")
    print(f"Total CPU Usage: {psutil.cpu_percent()}%")

    # Memory Information
    print("\nMemory:")
    svmem = psutil.virtual_memory()
    print(f"Total: {get_size(svmem.total)}")
    print(f"Available: {get_size(svmem.available)}")
    print(f"Used: {get_size(svmem.used)}")
    print(f"Percentage: {svmem.percent}%")

    # Disk Information
    print("\nDisk Information:")
    partitions = psutil.disk_partitions()
    for partition in partitions:
        print(f"=== Device: {partition.device} ===")
        print(f"  Mountpoint: {partition.mountpoint}")
        print(f"  File system type: {partition.fstype}")
        try:
            partition_usage = psutil.disk_usage(partition.mountpoint)
        except OSError as exc:
            # Covers PermissionError as well as unreadable/stale mount points;
            # report it instead of silently skipping the usage figures.
            print(f"  Usage: unavailable ({exc.strerror or exc})")
            continue
        print(f"  Total Size: {get_size(partition_usage.total)}")
        print(f"  Used: {get_size(partition_usage.used)}")
        print(f"  Free: {get_size(partition_usage.free)}")
        print(f"  Percentage: {partition_usage.percent}%")

    # OS information
    print("\nOperating System:")
    print(f"System: {platform.system()}")
    print(f"Node Name: {platform.node()}")
    print(f"Release: {platform.release()}")
    print(f"Version: {platform.version()}")
    print(f"Machine: {platform.machine()}")
    print(f"Processor: {platform.processor()}")

def get_size(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g.:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit ("P") is reached. Values of 1024 PB and above are therefore
    still reported in PB (e.g. '2048.00PB') rather than returning None.

    Args:
        bytes: Size to format (int or float).
        suffix: Text appended after the unit prefix; defaults to "B".

    Returns:
        The formatted size string with two decimal places.
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P"]
    for unit in units[:-1]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    # Largest unit: always return here so the function never yields None.
    return f"{bytes:.2f}{units[-1]}{suffix}"

# Print the system report when this cell/script is executed directly.
if __name__ == "__main__":
    get_system_info()

Gathering system information...
CPU:
Physical cores: 48
Total cores: 96
Max Frequency: 0.00Mhz
Min Frequency: 0.00Mhz
Current Frequency: 2000.17Mhz
Total CPU Usage: 1.2%

Memory:
Total: 334.56GB
Available: 330.02GB
Used: 2.35GB
Percentage: 1.4%

Disk Information:
=== Device: /dev/root ===
  Mountpoint: /usr/sbin/docker-init
  File system type: ext2
  Total Size: 1.93GB
  Used: 1.13GB
  Free: 819.42MB
  Percentage: 58.5%
=== Device: /dev/sda1 ===
  Mountpoint: /etc/resolv.conf
  File system type: ext4
  Total Size: 232.07GB
  Used: 28.62GB
  Free: 203.43GB
  Percentage: 12.3%
=== Device: /dev/sda1 ===
  Mountpoint: /etc/hostname
  File system type: ext4
  Total Size: 232.07GB
  Used: 28.62GB
  Free: 203.43GB
  Percentage: 12.3%
=== Device: /dev/sda1 ===
  Mountpoint: /etc/hosts
  File system type: ext4
  Total Size: 232.07GB
  Used: 28.62GB
  Free: 203.43GB
  Percentage: 12.3%

Operating System:
System: Linux
Node Name: f6015db84b51
Release: 6.1.85+
Version: #1 SMP PREEMPT_DYNAMIC Thu J

In [None]:
import csv
import pandas as pd
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import threading
import psutil
import time
from multiprocessing import Pool
from mimesis import Person, Address, Datetime
from mimesis.enums import Gender
import random
import os
import glob

def monitor_resources(interval, stats):
    """Sample CPU and memory usage until ``stats['stop']`` becomes truthy.

    Each pass appends the current system-wide CPU percentage to
    ``stats['cpu']`` and the virtual-memory percentage to
    ``stats['memory']``, then sleeps for ``interval`` seconds. The stop flag
    is only examined at the start of a pass, i.e. after each sleep.
    """
    while True:
        if stats['stop']:
            return
        # Take both readings before recording either of them.
        cpu_now = psutil.cpu_percent(interval=None)
        mem_now = psutil.virtual_memory().percent
        stats['cpu'].append(cpu_now)
        stats['memory'].append(mem_now)
        time.sleep(interval)

def print_resource_stats(stats):
    """Print average, max and min of the collected CPU and memory samples.

    Args:
        stats: Mapping with ``'cpu'`` and ``'memory'`` lists of percentages.

    If a list is empty (no sample was taken before monitoring stopped) a
    short notice is printed for that metric instead of raising
    ZeroDivisionError / ValueError.
    """
    for label, key in (("CPU", "cpu"), ("Memory", "memory")):
        samples = stats[key]
        if not samples:
            print(f"No {label} usage samples were collected.")
            continue
        print(f"Average {label} Usage: {sum(samples) / len(samples):.2f}%")
        print(f"Max {label} Usage: {max(samples):.2f}%")
        print(f"Min {label} Usage: {min(samples):.2f}%")

def generate_data(row_count):
    """Build ``row_count`` fake person records as a list of dicts.

    Each record has the keys Name, Email, Address, Phone, Date of Birth,
    Gender, Company, Position, Salary and Retired, in that order. Values come
    from mimesis providers created with the 'en' locale and from ``random``.

    The gender is drawn once per row and used both to generate the name and
    to fill the "Gender" column, so the two always agree.

    Args:
        row_count: Number of records to generate.

    Returns:
        A list of ``row_count`` dictionaries (empty when ``row_count`` is 0).
    """
    person = Person('en')
    address = Address('en')
    date_provider = Datetime('en')
    # (mimesis enum, label written to the CSV) pairs, picked together per row.
    gender_choices = [(Gender.MALE, "Male"), (Gender.FEMALE, "Female")]

    rows = []
    for _ in range(row_count):
        gender, gender_label = random.choice(gender_choices)
        rows.append({
            "Name": person.full_name(gender=gender),
            "Email": person.email(),
            "Address": address.address(),
            "Phone": person.telephone(),
            "Date of Birth": date_provider.date().isoformat(),
            "Gender": gender_label,
            "Company": address.city() + " Corp",
            "Position": person.occupation(),
            "Salary": round(random.uniform(30000, 200000), 2),
            "Retired": random.choice(["Yes", "No"]),
        })
    return rows

def remove_test_files(directory, patterns):
    """Delete every file in ``directory`` that matches one of ``patterns``.

    Args:
        directory: Directory in which the glob patterns are evaluated.
        patterns: Iterable of glob patterns relative to ``directory``.

    A line is printed for each removed file; a failure to remove a file is
    reported and does not stop the cleanup.

    The working directory is switched to ``directory`` only for the duration
    of the call and is restored afterwards, so the caller's current directory
    is not changed as a side effect.

    Raises:
        OSError: If ``directory`` cannot be entered (raised by ``os.chdir``).
    """
    original_cwd = os.getcwd()
    os.chdir(directory)
    try:
        for pattern in patterns:
            for file in glob.glob(pattern):
                try:
                    os.remove(file)
                    print(f"Removed file: {file}")
                except OSError as e:
                    print(f"Error: {file} : {e.strerror}")
    finally:
        # Always undo the chdir, even if globbing or printing fails.
        os.chdir(original_cwd)

def row_by_row_approach(num_rows):
    """Write ``num_rows`` generated records to 'row_by_row.csv', one row per write call.

    The header is taken from the keys of a single sample record. All
    ``num_rows`` records are produced by one ``generate_data`` call and then
    written individually with ``writerow``.
    """
    print("Starting Row-by-row Approach")
    with open('row_by_row.csv', 'w', newline='') as out:
        columns = generate_data(1)[0].keys()
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        for record in generate_data(num_rows):
            writer.writerow(record)

def dataframe_approach(num_rows):
    """Generate ``num_rows`` records, load them into a pandas DataFrame and dump it to 'dataframe.csv'.

    The CSV is written without the DataFrame index.
    """
    print("Starting DataFrame Approach")
    # Build the frame straight from the generator call so the intermediate
    # list is not kept alive by a local name while the CSV is written.
    frame = pd.DataFrame(data=generate_data(num_rows))
    frame.to_csv(path_or_buf='dataframe.csv', index=False)

def streaming_chunks_approach(num_rows, chunk_size):
    """Write ``num_rows`` generated records to 'streaming_chunks.csv' in batches.

    Records are generated and written ``chunk_size`` at a time; the final
    batch holds whatever is left over. The header comes from the keys of a
    single sample record.
    """
    print("Starting Streaming Chunks Approach")
    with open('streaming_chunks.csv', 'w', newline='') as out:
        columns = generate_data(1)[0].keys()
        writer = csv.DictWriter(out, fieldnames=columns)
        writer.writeheader()
        # Ceiling division: number of batches needed to cover num_rows.
        total_chunks = (num_rows + chunk_size - 1) // chunk_size
        remaining = num_rows
        chunk_index = 0
        while chunk_index < total_chunks:
            batch = generate_data(min(chunk_size, remaining))
            writer.writerows(batch)
            remaining -= chunk_size
            chunk_index += 1

def parallel_processing_approach(num_rows, num_processes):
    """Split ``num_rows`` records into ``num_processes`` chunks and hand them to a process pool.

    The rows are divided as evenly as possible (the first
    ``num_rows % num_processes`` chunks get one extra row). Every chunk is
    generated in this process and passed, together with its index, to
    ``worker`` via ``Pool.starmap``. Afterwards the files matching
    'parallel_output_*.csv' are merged into 'final_parallel_output.csv'.

    The pool is used as a context manager so its worker processes are cleaned
    up even if chunk generation or a worker raises.

    NOTE(review): ``worker`` is not defined in the visible part of this
    notebook. It must be a picklable module-level callable accepting
    ``(rows, index)`` and presumably writes 'parallel_output_<index>.csv' --
    TODO confirm it exists before running this approach.
    """
    print("Starting Parallel Processing Approach with", num_processes, "processes")
    with Pool(num_processes) as pool:
        chunk_size, remainder = divmod(num_rows, num_processes)
        chunks = [generate_data(chunk_size + (1 if i < remainder else 0)) for i in range(num_processes)]
        pool.starmap(worker, [(chunk, i) for i, chunk in enumerate(chunks)])
        # Normal shutdown: stop accepting work and wait for the workers to exit.
        pool.close()
        pool.join()
    consolidate_files('parallel_output_*.csv', 'final_parallel_output.csv')

def dask_approach(num_rows, npartitions, output_filename):
    """Write ``num_rows`` generated records through a Dask DataFrame and merge the parts.

    Args:
        num_rows: Number of records to generate.
        npartitions: Number of partitions for the Dask DataFrame; one
            temporary CSV is written per partition.
        output_filename: Path of the single merged CSV.

    The records are first materialised as a pandas DataFrame, converted to a
    Dask DataFrame, written to 'temp_dask_output_*.csv' and finally merged
    into ``output_filename`` by ``consolidate_files``.
    """
    print("Starting Dask Approach with", npartitions, "partitions")
    ddf = dd.from_pandas(pd.DataFrame(generate_data(num_rows)), npartitions=npartitions)
    temp_output = 'temp_dask_output_*.csv'
    # The progress bar only reports Dask computations, so it has to wrap the
    # to_csv call (which triggers the computation), not the plain-Python merge.
    with ProgressBar():
        ddf.to_csv(temp_output, index=False)
    consolidate_files(temp_output, output_filename)

def consolidate_files(file_pattern, output_file):
    """Merge all CSV files matching ``file_pattern`` into ``output_file``.

    Args:
        file_pattern: Glob pattern selecting the part files.
        output_file: Path of the merged CSV to create (overwritten if present).

    Part files are processed in sorted path order so the merged row order is
    deterministic. The header row of the first non-empty part is written
    once; the header row of every later part is dropped. Empty part files are
    tolerated. Each part file is deleted after it has been copied. The output
    file itself is never treated as an input, even if it matches the pattern.

    Prints the elapsed time when finished.
    """
    start_time = time.time()
    output_path = os.path.abspath(output_file)
    # Resolve the inputs before the output is opened/truncated.
    files = sorted(
        path for path in glob.glob(file_pattern)
        if os.path.abspath(path) != output_path
    )
    header_written = False
    with open(output_file, 'w', newline='') as final_file:
        final_writer = csv.writer(final_file)
        for file_path in files:
            # newline='' lets the csv module handle line endings inside quoted fields.
            with open(file_path, 'r', newline='') as source_file:
                reader = csv.reader(source_file)
                header = next(reader, None)  # None for an empty part file
                if header is not None:
                    if not header_written:
                        final_writer.writerow(header)
                        header_written = True
                    final_writer.writerows(reader)
            os.remove(file_path)
    end_time = time.time()
    print(f"Consolidation completed in {end_time - start_time:.2f} seconds.")

def process_approach(approach_func, num_rows, *args):
    """Run one benchmark approach while sampling CPU/memory in a background thread.

    Args:
        approach_func: Callable invoked as ``approach_func(num_rows, *args)``.
        num_rows: Row count forwarded to the approach.
        *args: Extra positional arguments forwarded to the approach.

    A monitor thread running ``monitor_resources`` with a 10-second interval
    is started before the approach and stopped afterwards. The measured
    duration covers only the ``approach_func`` call. On success the duration
    and the collected resource statistics are printed.

    If ``approach_func`` raises, the monitor thread is still stopped and
    joined before the exception propagates, so no background thread is left
    running.
    """
    stats = {'cpu': [], 'memory': [], 'stop': False}
    monitor_thread = threading.Thread(target=monitor_resources, args=(10, stats))
    monitor_thread.start()

    start_time = time.time()
    try:
        approach_func(num_rows, *args)
        duration = time.time() - start_time
    finally:
        # Always signal and wait for the monitor, even when the approach fails.
        stats['stop'] = True
        monitor_thread.join()

    print(f"{approach_func.__name__} completed in {duration:.2f} seconds.")
    print_resource_stats(stats)

def main(num_rows=100000000, chunk_size=100000):
    """Clean up old output files, then benchmark every CSV-writing approach in turn.

    Args:
        num_rows: Number of records each approach writes. The default
            (100 million) is a very heavy run; pass a small value for a
            quick check.
        chunk_size: Batch size used by the streaming-chunks approach.

    The parallel and Dask approaches use the logical CPU count as their
    process / partition count.
    """
    directory = './'
    patterns = ['row_by_row.csv', 'dataframe.csv', 'streaming_chunks.csv',
                'parallel_output_*.csv', 'final_parallel_output.csv',
                'temp_dask_output_*.csv', 'final_dask_output.csv']
    remove_test_files(directory, patterns)
    # cpu_count() may return None when it cannot be determined; fall back to 1.
    num_cores = psutil.cpu_count(logical=True) or 1
    process_approach(row_by_row_approach, num_rows)
    process_approach(dataframe_approach, num_rows)
    process_approach(streaming_chunks_approach, num_rows, chunk_size)
    process_approach(parallel_processing_approach, num_rows, num_cores)
    process_approach(dask_approach, num_rows, num_cores, 'final_dask_output.csv')

# Run the full benchmark when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Starting DataFrame Approach
