In [3]:
!pip install openpyxl
import os
import pandas as pd
import datetime
from difflib import SequenceMatcher
import re
from openpyxl import Workbook
from openpyxl.styles import PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows
from datetime import datetime
from pathlib import Path



In [4]:
#optimized
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def get_folder_size(path):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total_size += os.path.getsize(fp)
    return total_size

def format_size(size_bytes):
    # Convert bytes to a human-readable format
    for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
        if size_bytes < 1024:
            return f"{size_bytes:.2f} {unit}"
        size_bytes /= 1024

def process_folder_info(root_path, dirpath, filenames, extensions):
    current_path = Path(dirpath)
    
    # Count matching files in current directory
    matching_files = sum(1 for f in filenames if any(f.lower().endswith(ext) for ext in extensions))
    pattern = re.compile(r'\[\d+,\d+\]')
    pattern_files = sum(1 for f in filenames if pattern.search(f) and any(f.lower().endswith(ext) for ext in extensions))
    
    # Calculate depth (number of parent directories from root)
    depth = len(current_path.relative_to(root_path).parts)
    
    # Get parent folder name (or root if at top level)
    parent_folder = current_path.parent.name or 'root'
    if depth == 0:
        parent_folder = 'root'
        
    # Get folder creation time
    try:
        creation_timestamp = os.path.getctime(current_path)
        creation_date = datetime.fromtimestamp(creation_timestamp)
        creation_date_str = creation_date.strftime('%Y-%m-%d %H:%M:%S')
    except Exception:
        creation_date = None
        creation_date_str = "Unknown"
        
    # Calculate folder size
    try:
        size_bytes = get_folder_size(current_path)
        size_readable = format_size(size_bytes)
    except Exception:
        size_bytes = 0
        size_readable = "Unknown"
        
    return {
        'folder_path': str(current_path),
        'depth': depth,
        'parent_folder': parent_folder,
        'folder_name': current_path.name or root_path.name,
        'num_matching_files': matching_files,
        'num_pattern_files': pattern_files,
        'creation_date': creation_date,
        'creation_date_str': creation_date_str,
        'size_bytes': size_bytes,
        'size_readable': size_readable
    }

def folders_to_dataframe(root_dir, extensions=['.ndpi', '.qptiff', '.tif', '.tiff']):
    """
    Creates a pandas DataFrame showing the directory structure, files, dates, and sizes.
    
    Args:
        root_dir (str): The root directory to search within.
        extensions (list, optional): List of file extensions to consider.
    
    Returns:
        pandas.DataFrame: A DataFrame containing:
            - folder_path: Full path to the folder
            - depth: Nesting level (0 = root)
            - parent_folder: Name of parent directory
            - folder_name: Name of current directory
            - num_matching_files: Number of files with specified extensions
            - creation_date: Timestamp when the folder was created
            - creation_date_str: Human-readable creation date
            - size_bytes: Total size of folder and contents in bytes
            - size_readable: Human-readable size format
    """
    root_path = Path(root_dir)
    folders_data = []

    with ThreadPoolExecutor() as executor:
        futures = []
        for dirpath, dirnames, filenames in os.walk(root_dir):
            futures.append(executor.submit(process_folder_info, root_path, dirpath, filenames, extensions))
        
        for future in futures:
            folders_data.append(future.result())
    
    # Create DataFrame and sort by folder path for hierarchical viewing
    df = pd.DataFrame(folders_data)
    # df = df.sort_values(['depth', 'folder_path'])
    # df = df.reset_index(drop=True)
    
    return df

# Example usage
# df = folders_to_dataframe('/path/to/root_dir')
# print(df.head())

In [8]:
root_directory = 'Z:/'
directory_df = folders_to_dataframe(root_directory)

# View the directory structure
# print(directory_df)

# Get folders with matching files
# folders_with_files = directory_df[directory_df['num_matching_files'] > 0]
directory_df.to_csv('R_with_patterns.csv', index = False)

In [9]:
directory_df.head()

Unnamed: 0,folder_path,depth,parent_folder,folder_name,num_matching_files,num_pattern_files,creation_date,creation_date_str,size_bytes,size_readable
0,Z:\,0,root,,1,0,2022-05-30 15:17:38.945809,2022-05-30 15:17:38,0,Unknown
1,Z:\.afm,1,root,.afm,0,0,2022-05-30 15:17:39.144871,2022-05-30 15:17:39,38,38.00 B
2,Z:\.ptrash,1,root,.ptrash,0,0,2022-05-30 15:17:39.145358,2022-05-30 15:17:39,0,0.00 B
3,Z:\.snapshots,1,root,.snapshots,0,0,1970-01-01 00:00:00.000000,1970-01-01 00:00:00,0,0.00 B
4,Z:\BCPP,1,root,BCPP,0,0,2024-01-24 10:04:01.934358,2024-01-24 10:04:01,147140540512,137.04 GB


In [None]:
os.listdir(root_directory)