In [21]:
import functools
import hashlib
import logging
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

from tqdm import tqdm

# Retry decorator for handling retries
def retry(attempts=3, delay=1):
    """
    Build a decorator that calls a function again when it raises.

    Args:
        attempts (int): Maximum number of calls to make before giving up.
        delay (int | float): Seconds to sleep between two consecutive attempts.

    Returns:
        callable: A decorator. The decorated function returns the result of the first
        successful call.

    Raises:
        Exception: ``"All <attempts> attempts failed."`` once every attempt has raised.
            The last underlying error is chained as ``__cause__``.
    """
    def decorator(func):
        @functools.wraps(func)  # keep the wrapped function's __name__ and docstring
        def wrapper(*args, **kwargs):
            last_error = None
            for attempt in range(attempts):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    last_error = e
                    logging.error(f"Attempt {attempt + 1} failed with error: {e}")
                    # Sleeping after the final attempt would only postpone the failure.
                    if attempt + 1 < attempts:
                        time.sleep(delay)
            raise Exception(f"All {attempts} attempts failed.") from last_error
        return wrapper
    return decorator

def calculate_checksum(file_path, algorithm='sha256'):
    """
    Hash the contents of a file and return the digest as a hex string.

    The file is read in 4096-byte blocks, so it is never loaded into memory in one piece.

    Args:
        file_path (str): Location of the file to hash.
        algorithm (str): Name of the hashlib algorithm to use ('sha256' by default).

    Returns:
        str: Hexadecimal digest of the file contents.
    """
    digest = hashlib.new(algorithm)
    with open(file_path, 'rb') as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def _files_match(first_path, second_path):
    """Return True when both files have the same size and the same checksum."""
    if os.path.getsize(first_path) != os.path.getsize(second_path):
        return False
    return calculate_checksum(first_path) == calculate_checksum(second_path)


@retry(attempts=3, delay=2)
def copy_and_cleanup_file(file, target_folder, destination_folder, extensions, dry_run=None):
    """
    Copy and clean up a single file. This function is intended to be run in parallel.

    The file is copied to ``destination_folder/<first 6 characters of the name>/`` and the
    original is removed only after size and checksum of the copy have been verified.
    If an identical copy is already present, only the original is removed. If a copy of
    the same size but a different checksum is present, nothing is touched and an error
    is reported. If a copy of a different size is present, it is replaced.

    Args:
        file (str): The name of the file to be copied and cleaned up.
        target_folder (str): The path to the folder containing the files to be copied and cleaned up.
        destination_folder (str): The path to the folder where the files should be copied.
        extensions (str | iterable of str): File extensions to filter (e.g., ('.raw', '.tif', '.png', '.pdf')).
            Matching is case-insensitive.
        dry_run (bool | None): When truthy, nothing is created, copied or deleted; the
            function only logs and reports what a real run would do.

    Returns:
        dict: ``{"file", "status", "message"}`` where status is "copied", "skipped" or
        "error". In a dry run the status is the one a real run is expected to produce
        and the message is prefixed with "[Dry Run]".

    NOTE(review): every Exception raised in the body is caught below and converted into
    an "error" result, so the ``retry`` decorator never observes a failure and therefore
    never re-runs this function. Confirm whether retries are actually wanted.
    """
    result = {"file": file, "status": "", "message": ""}

    try:
        # Accept a single extension or any iterable of them, and compare in lower case
        # because the file name is lower-cased as well.
        if isinstance(extensions, str):
            wanted_extensions = extensions.lower()
        else:
            wanted_extensions = tuple(ext.lower() for ext in extensions)

        if not file.lower().endswith(wanted_extensions):
            result["status"] = "skipped"
            result["message"] = f"File extension does not match. Skipped {file}."
            return result

        # The destination subfolder is named after the first 6 characters of the file name
        destination_subfolder = os.path.join(destination_folder, file[:6])
        destination_file_path = os.path.join(destination_subfolder, file)
        source_file_path = os.path.join(target_folder, file)

        if not os.path.exists(destination_subfolder):
            if dry_run:
                logging.info(f"[Dry Run] Would create directory: {destination_subfolder}")
            else:
                # exist_ok: another worker thread may create the same subfolder between
                # the existence check above and this call.
                os.makedirs(destination_subfolder, exist_ok=True)
                logging.info(f"Created directory: {destination_subfolder}")

        if os.path.exists(destination_file_path):
            source_size = os.path.getsize(source_file_path)
            destination_size = os.path.getsize(destination_file_path)

            if source_size == destination_size:
                source_checksum = calculate_checksum(source_file_path)
                destination_checksum = calculate_checksum(destination_file_path)

                if source_checksum != destination_checksum:
                    result["status"] = "error"
                    result["message"] = f"Checksum mismatch for {file}. Not deleting the source."
                    logging.error(f"Checksum mismatch for {file}.")
                elif dry_run:
                    logging.info(f"[Dry Run] Would delete file from target folder: {file}")
                    result["status"] = "skipped"
                    result["message"] = (
                        f"[Dry Run] File already exists and is identical. Would delete from target: {file}."
                    )
                else:
                    # Size and checksum both match, so the source is redundant
                    os.remove(source_file_path)
                    result["status"] = "skipped"
                    result["message"] = f"File already exists and is identical. Deleted from target: {file}."
                    logging.info(f"Deleted file from target folder: {file}")
            elif dry_run:
                # Nothing is copied in a dry run, so there is nothing to verify
                logging.info(f"[Dry Run] Would replace file in destination folder: {file}")
                logging.info(f"[Dry Run] Would delete original after successful re-copy: {file}")
                result["status"] = "copied"
                result["message"] = f"[Dry Run] Would re-copy and delete original: {file}."
            else:
                # Sizes differ: replace the destination file with a fresh copy
                logging.info(f"File sizes differ for {file}. Replacing the destination file.")
                os.remove(destination_file_path)
                shutil.copy(source_file_path, destination_file_path)
                logging.info(f"Copied: {file} to {destination_subfolder}")

                if _files_match(source_file_path, destination_file_path):
                    os.remove(source_file_path)
                    result["status"] = "copied"
                    result["message"] = f"Re-copied and deleted original: {file}."
                    logging.info(f"Deleted original after successful re-copy: {file}")
                else:
                    result["status"] = "error"
                    result["message"] = f"Re-copy failed for {file}. Not deleting original."
                    logging.error(f"Re-copy failed for {file}.")
        elif dry_run:
            # Nothing is copied in a dry run, so there is nothing to verify or remove
            logging.info(f"[Dry Run] Would copy: {file} to {destination_subfolder}")
            logging.info(f"[Dry Run] Would delete original after successful copy: {file}")
            result["status"] = "copied"
            result["message"] = f"[Dry Run] Would copy and delete original: {file}."
        else:
            shutil.copy(source_file_path, destination_file_path)
            logging.info(f"Copied: {file} to {destination_subfolder}")

            if _files_match(source_file_path, destination_file_path):
                os.remove(source_file_path)
                result["status"] = "copied"
                result["message"] = f"Copied and deleted original: {file}."
                logging.info(f"Deleted original after successful copy: {file}")
            else:
                # The copy is not trustworthy: discard it and keep the original
                os.remove(destination_file_path)
                result["status"] = "error"
                result["message"] = f"File integrity check failed for {file}. Not deleting original."
                logging.error(f"File integrity check failed for {file}. Original not deleted.")

    except OSError as e:
        # strerror is None for OSErrors that were not built from an errno
        reason = e.strerror or str(e)
        logging.error(f"OSError for file {file}: {reason}. Code: {e.errno}")
        result["status"] = "error"
        result["message"] = f"OSError encountered: {reason}. Skipped {file}."

    except Exception as e:
        logging.error(f"Error processing file {file}: {e}")
        result["status"] = "error"
        result["message"] = f"Error processing {file}: {str(e)}"

    return result


def generate_report(results, report_file_path):
    """
    Generate a summary report of the file operations.

    The report lists one line per result followed by totals for the "copied",
    "skipped" and "error" statuses. It is written as UTF-8 so that file names with
    non-ASCII characters cannot fail on platforms whose default encoding is narrower.

    Args:
        results (list): Dictionaries with the keys 'file', 'status' and 'message',
            one per processed file.
        report_file_path (str): The path where the summary report should be saved.
            An existing file is overwritten.
    """
    # Tally the three reported statuses in a single pass; other statuses are only
    # counted in the total.
    tallies = {"copied": 0, "skipped": 0, "error": 0}
    for entry in results:
        status = entry['status']
        if status in tallies:
            tallies[status] += 1

    with open(report_file_path, 'w', encoding='utf-8') as report_file:
        report_file.write("Copy and Cleanup Summary Report\n")
        report_file.write(f"Date: {datetime.now()}\n\n")
        report_file.write(f"{'File':<40} {'Status':<10} {'Message'}\n")
        report_file.write(f"{'-'*40} {'-'*10} {'-'*40}\n")

        for entry in results:
            report_file.write(f"{entry['file']:<40} {entry['status']:<10} {entry['message']}\n")

        report_file.write(f"\nTotal files processed: {len(results)}\n")
        report_file.write(f"Files copied: {tallies['copied']}\n")
        report_file.write(f"Files skipped: {tallies['skipped']}\n")
        report_file.write(f"Errors: {tallies['error']}\n")

    logging.info(f"Summary report generated at {report_file_path}")

def copy_and_cleanup_files(target_folder, destination_folder, extensions=('.raw',), max_workers=None,
                           report=False, report_file_path="summary_report.txt",
                           start_date=None, end_date=None, dry_run=True):
    """
    Copies files from the target folder to the appropriate subfolder in the destination folder
    based on the first six characters of each file name (representing a date in the format YYYYMM).
    Files are filtered by the extensions given in the respective argument. If an identical file
    with the same name already exists in the destination folder, it is only deleted from the
    target folder. The operation is performed using multithreading to handle large folders
    efficiently, and file integrity is checked using SHA256 before deleting the original.
    Optionally, a summary report can be generated after the operation is complete.

    Args:
        target_folder (str): The path to the folder containing the files to be copied and cleaned up.
        destination_folder (str): The path to the folder where the files should be copied to.
        extensions (str, tuple): File extensions to filter (e.g., ('.raw', '.pdf', '.png')).
        max_workers (int): The maximum number of threads to use for parallel processing. If None,
            twice the number of CPU cores is used.
        report (bool): Whether to generate a summary report after the operation.
        report_file_path (str): The path where the summary report should be saved (if report is True).
        start_date (datetime): Keep only files whose modification time is at or after this date.
        end_date (datetime): Keep only files whose modification time is at or before this date.
        dry_run (bool): Passed on to the per-file worker. Defaults to True, in which case a
            "No changes will be made" notice is logged. Any falsy value (including None)
            performs the real copy and delete.

    Example:
        If the target folder contains a file named '20230516_SA_DK.raw', it will be copied to
        'destination_folder/202305/', and then the original file will be deleted from the target folder. If the folder '.../202305'
        does not exist it will be created.
    """
    # Configure logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
    logging.info("Starting the copy and cleanup process.")

    if dry_run:
        logging.info("[Dry Run] No changes will be made.")

    # Adjust max_workers based on CPU count if not provided
    if max_workers is None:
        # os.cpu_count() returns None when the count cannot be determined
        max_workers = (os.cpu_count() or 1) * 2
        logging.info(f"Using {max_workers} workers for parallel processing.")

    # List all entries in the target folder
    files = os.listdir(target_folder)

    # Filter files based on modification date if start_date and/or end_date are provided
    if start_date or end_date:
        def in_date_range(name):
            modified = datetime.fromtimestamp(os.path.getmtime(os.path.join(target_folder, name)))
            return ((start_date is None or modified >= start_date)
                    and (end_date is None or modified <= end_date))

        files = [name for name in files if in_date_range(name)]
        logging.info(f"Filtered files based on date range. {len(files)} files remain after filtering.")

    results = []
    progress_bar = tqdm(total=len(files), desc="Processing files")
    try:
        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {
                executor.submit(copy_and_cleanup_file, file, target_folder,
                                destination_folder, extensions, dry_run): file
                for file in files
            }
            for future in as_completed(future_to_file):
                file = future_to_file[future]
                try:
                    result = future.result()
                    results.append(result)
                    logging.info(result["message"])
                except Exception as exc:
                    logging.error(f'File {file} generated an exception: {exc}')
                    # Record the failure so the summary report accounts for every file
                    results.append({
                        "file": file,
                        "status": "error",
                        "message": f"Unhandled exception for {file}: {exc}",
                    })
                finally:
                    progress_bar.update(1)
    finally:
        # Release the progress bar even if submitting or collecting work fails
        progress_bar.close()

    # Optionally generate a summary report
    if report:
        generate_report(results, report_file_path)

    logging.info("Copy and cleanup process completed.")

In [23]:
# Move the PDF and PNG files from Folder1 into YYYYMM subfolders of Folder2.
# NOTE: dry_run=None is falsy, so this is a REAL run: files are copied and the
# originals are deleted after verification. Pass dry_run=True to preview only.
copy_and_cleanup_files(
    target_folder=r"C:\Users\User\Desktop\Folder1",
    destination_folder=r"C:\Users\User\Desktop\Folder2",
    extensions=('.pdf', '.png'),
    dry_run=None, report=False
)

2024-09-28 19:29:02,363 - INFO - Starting the copy and cleanup process.
2024-09-28 19:29:02,363 - INFO - Using 32 workers for parallel processing.
Processing files:   0%|                                                                          | 0/7 [00:00<?, ?it/s]2024-09-28 19:29:02,363 - INFO - Created directory: C:\Users\User\Desktop\Folder2\202203
2024-09-28 19:29:02,363 - INFO - Created directory: C:\Users\User\Desktop\Folder2\202205
2024-09-28 19:29:02,363 - INFO - Created directory: C:\Users\User\Desktop\Folder2\202406
2024-09-28 19:29:02,363 - INFO - Created directory: C:\Users\User\Desktop\Folder2\202408
2024-09-28 19:29:02,377 - INFO - Copied: 20240822_SA_DK.pdf to C:\Users\User\Desktop\Folder2\202408
2024-09-28 19:29:02,377 - INFO - Copied: 20240622_SA_DK_2.pdf to C:\Users\User\Desktop\Folder2\202406
2024-09-28 19:29:02,377 - INFO - Copied: 20240822_Hela_DK_1.pdf to C:\Users\User\Desktop\Folder2\202408
2024-09-28 19:29:02,377 - INFO - Deleted file from target folder: 202305