In [32]:
# Test version 4

# Experimental version 3...

import os
import gzip
import shutil
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed

def compress_file(file_path, dry_run):
    """
    Compress a single file with gzip and remove the original if not in dry-run mode.

    The archive is written next to the source as ``<file_path>.gz`` (an existing
    file at that path is overwritten). The original is deleted only after the
    archive has been fully written and closed. If writing the archive fails,
    the partially written ``.gz`` is removed and the exception is re-raised, so
    a truncated archive is never left behind and the original file is kept.

    :file_path: Path of the file to compress.
    :dry_run: If True, only log what would be done; nothing is read or written.
    :raises OSError: If the source cannot be read or the archive cannot be written.
    """
    compressed_file_path = f"{file_path}.gz"

    if dry_run:
        logging.info(f"Dry-run: Would compress {file_path} -> {compressed_file_path}")
        return

    with open(file_path, 'rb') as original_file:
        # Open the target outside the try-block: if this open fails, nothing
        # has been written by us, so there is nothing to clean up.
        compressed_file = gzip.open(compressed_file_path, 'wb')
        try:
            with compressed_file:
                shutil.copyfileobj(original_file, compressed_file)
        except BaseException:
            # The archive handle is closed here (the inner `with` has exited),
            # so the partial file can be removed on every platform.
            try:
                os.remove(compressed_file_path)
            except OSError:
                pass  # Best-effort cleanup; the original error is what matters.
            raise

    # Only reached when the archive was written and closed without error.
    os.remove(file_path)
    logging.info(f"Compressed and deleted: {file_path} -> {compressed_file_path}")
        

def compress_files_in_directory(directory, dry_run=False, log_file="compression.log", max_workers=None):
    """
    Compress every file under the given directory (recursively) using gzip and
    delete the original files.

    Files are collected with ``os.walk`` and handed to ``compress_file`` on a
    thread pool. Every failing file is logged; after all tasks have finished,
    the first failure observed is re-raised.

    Note: calling this function resets the root logger's handlers and
    reconfigures logging to write to the console at INFO level.

    :directory: The path of the directory where the files are located.
    :dry_run: If True, simulate the operation without making changes.
    :log_file: Accepted for backward compatibility but currently unused;
        logging goes to the console only.
    :max_workers: The maximum number of threads to use for compression.
        If None, twice the CPU count is used (2 if the CPU count cannot be
        determined).
    :raises Exception: The first error raised by a compression task, if any.
    """

    # Clear any existing logging handlers to ensure we configure from scratch
    logging.getLogger().handlers.clear()

    # Set up logging to console only
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.StreamHandler()  # Log to the console
        ]
    )

    if dry_run:
        logging.info("[Dry Run] No changes will be made.")

    # Collect all file paths up front so the total can be reported
    files_to_compress = [
        os.path.join(root, file_name)
        for root, _, files in os.walk(directory)
        for file_name in files
    ]

    # Log the total number of files
    logging.info(f"Found {len(files_to_compress)} files to compress.")

    # Adjust max_workers based on CPU count if not provided
    if max_workers is None:
        # os.cpu_count() may return None when the count is undeterminable;
        # fall back to 1 so the multiplication cannot raise TypeError.
        max_workers = (os.cpu_count() or 1) * 2
        logging.info(f"Using {max_workers} workers for parallel processing.")

    first_error = None

    # Use ThreadPoolExecutor to compress files in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its path so failures can be attributed
        futures = {
            executor.submit(compress_file, file_path, dry_run): file_path
            for file_path in files_to_compress
        }

        # Drain every future so that no failure goes unreported
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                logging.error(f"Failed to compress {futures[future]}: {exc}")
                if first_error is None:
                    first_error = exc

    # Surface the first failure to the caller once all tasks have finished
    if first_error is not None:
        raise first_error

    logging.info("Compression process completed.")


In [None]:
# Decompressing
# Version 1 

import os
import gzip
import shutil

def decompress_files_in_folder(folder_path, recursive=False):
    """
    Decompress every ``.gz`` file in a folder and delete the archives.

    Each ``name.gz`` is written to ``name`` in the same directory (an existing
    file at that path is overwritten), and the archive is removed only after
    the output has been written and closed. If decompression fails, the
    partial output is removed, the archive is kept, and the error is re-raised.

    Entries that end in ``.gz`` but are not regular files (e.g. directories),
    and a file named exactly ``.gz`` (which would have an empty output name),
    are skipped.

    :folder_path: Directory containing the ``.gz`` files. If it does not exist,
        a message is printed and the function returns.
    :recursive: If True, also process sub-directories (mirrors the recursive
        walk done when compressing). Defaults to False: top level only.
    :raises OSError: If an archive cannot be read (including invalid gzip data)
        or the output cannot be written.
    """
    # Ensure the folder exists
    if not os.path.isdir(folder_path):
        print(f"Folder {folder_path} does not exist.")
        return

    # Snapshot the entries first: files are created and deleted while we work.
    if recursive:
        entries = [
            (root, file_name)
            for root, _, file_names in os.walk(folder_path)
            for file_name in file_names
        ]
    else:
        entries = [(folder_path, file_name) for file_name in os.listdir(folder_path)]

    for parent, file_name in entries:
        if not file_name.endswith('.gz'):
            continue

        output_name = file_name[:-3]  # Remove .gz extension
        compressed_file_path = os.path.join(parent, file_name)

        # Skip a bare ".gz" (empty output name) and non-file entries such as
        # directories whose name happens to end in ".gz".
        if not output_name or not os.path.isfile(compressed_file_path):
            continue

        decompressed_file_path = os.path.join(parent, output_name)

        # Decompress the file
        with gzip.open(compressed_file_path, 'rb') as f_in:
            # Open the target outside the try-block: if this open fails we
            # have written nothing, so there is nothing to clean up.
            f_out = open(decompressed_file_path, 'wb')
            try:
                with f_out:
                    shutil.copyfileobj(f_in, f_out)
            except BaseException:
                # Output handle is closed here; drop the partial file.
                try:
                    os.remove(decompressed_file_path)
                except OSError:
                    pass  # Best-effort cleanup; the original error matters.
                raise

        # Delete the compressed file
        os.remove(compressed_file_path)
        print(f"Decompressed and deleted {compressed_file_path}")


In [33]:
# Example usage:
directory_path = r'C:\Users\ge49nes\Test'
compress_files_in_directory(directory_path, dry_run=True, max_workers=None)
# Pass the directory defined above: `folder_path` is not defined in any visible
# cell, so the previous call raised NameError on a fresh kernel.
# NOTE(review): this decompresses and deletes any .gz files already in the
# directory, even after a dry-run compress — confirm that is intended.
decompress_files_in_folder(directory_path)

2024-10-11 21:25:45,418 - INFO - [Dry Run] No changes will be made.
2024-10-11 21:25:45,418 - INFO - Found 1 files to compress.
2024-10-11 21:25:45,423 - INFO - Using 16 workers for parallel processing.
2024-10-11 21:25:45,424 - INFO - Dry-run: Would compress C:\Users\ge49nes\Test\190520_DK_ex vivo_PMC_2_63x_1024x1024_Avg4_zomm5_3color_Lng_adaptive.ims -> C:\Users\ge49nes\Test\190520_DK_ex vivo_PMC_2_63x_1024x1024_Avg4_zomm5_3color_Lng_adaptive.ims.gz
2024-10-11 21:25:45,427 - INFO - Compression process completed.
