In [None]:
import os
import shutil
import subprocess
from pathlib import Path
from collections import defaultdict
import uuid


In [22]:
def classify_extension(ext):
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'}
    audio_exts = {'.mp3', '.wav', '.aac', '.ogg', '.flac', '.m4a'}
    video_exts = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv'}
    document_exts = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt', '.csv'}
    archive_exts = {'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'}
    code_exts = {'.py', '.js', '.html', '.css', '.java', '.c', '.cpp', '.php'}

    ext = ext.lower()
    if ext in image_exts:
        return 'Images'
    elif ext in audio_exts:
        return 'Audio'
    elif ext in video_exts:
        return 'Videos'
    elif ext in document_exts:
        return 'Documents'
    elif ext in archive_exts:
        return 'Archives'
    elif ext in code_exts:
        return 'Code'
    else:
        return 'Other'


In [None]:
def copy_file_to_category(file_path, output_base):
    ext = Path(file_path).suffix
    category = classify_extension(ext)
    ext_folder_name = ext[1:] if ext.startswith('.') else ext  # Remove the dot


    ext_folder = os.path.join(output_base, category, ext)
    os.makedirs(ext_folder, exist_ok=True)

    dest_path = os.path.join(ext_folder, os.path.basename(file_path))
    try:
        shutil.copy2(file_path, dest_path)
        return (category, ext)
    except Exception as e:
        print(f"❌ Failed to copy {file_path}: {e}")
        return None


In [24]:
def extract_and_categorize_files_by_type(source_dir, output_dir):
    summary = defaultdict(lambda: defaultdict(int))

    for root, _, files in os.walk(source_dir):
        for file in files:
            if file.startswith('.'):
                continue  # Skip hidden/system files
            file_path = os.path.join(root, file)

            # Confirm it's a real file
            try:
                result = subprocess.run(['file', file_path], stdout=subprocess.PIPE, text=True)
                if not result.stdout:
                    continue
            except Exception as e:
                print(f"Error detecting file type for {file_path}: {e}")
                continue

            result = copy_file_to_category(file_path, output_dir)
            if result:
                category, ext_folder_name = result
                summary[category][ext_folder_name] += 1

    return summary

In [25]:
def print_summary(summary):
    for category, ext_map in summary.items():
        print(f"\n📁 {category}:")
        for ext, count in sorted(ext_map.items()):
            print(f"  • {ext}: {count}")

In [26]:
source_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/"
output_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/organized_files"

summary = extract_and_categorize_files_by_type(source_dir, output_dir)
print_summary(summary)


❌ Failed to copy /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volumes/JOSH/NoTar-13-4-1/System/Library/LocationBundles/CoreTelephony.framework: [Errno 2] No such file or directory: '/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volumes/JOSH/NoTar-13-4-1/System/Library/LocationBundles/CoreTelephony.framework'
❌ Failed to copy /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volumes/JOSH/NoTar-13-4-1/System/Library/LocationBundles/WirelessDiagnostics.bundle: [Errno 2] No such file or directory: '/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volumes/JOSH/NoTar-13-4-1/System/Library/LocationBundles/WirelessDiagnostics.bundle'
❌ Failed to copy /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volumes/JOSH/NoTar-13-4-1/System/Library/LocationBundles/Weather.framework: [Errno 2] No such file or directory: '/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1/Volum