In [1]:
import filecmp
import os
import shutil
import subprocess
import uuid
from collections import defaultdict
from pathlib import Path


In [2]:
def classify_extension(ext):
    """Return the category folder name for a file extension.

    Args:
        ext: Extension string including the leading dot (e.g. ``'.jpg'``).
            Matching is case-insensitive.

    Returns:
        One of ``'Images'``, ``'Audio'``, ``'Videos'``, ``'Documents'``,
        ``'Archives'``, ``'Code'``, or ``'Other'`` when the extension is not
        in any known group (this includes the empty string and extensions
        given without the leading dot).
    """
    # Ordered (label, members) pairs; the first group containing the
    # extension wins. The groups are disjoint, so order is not significant.
    groups = (
        ('Images', {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'}),
        ('Audio', {'.mp3', '.wav', '.aac', '.ogg', '.flac', '.m4a'}),
        ('Videos', {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv'}),
        ('Documents', {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt',
                       '.pptx', '.txt', '.csv'}),
        ('Archives', {'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'}),
        ('Code', {'.py', '.js', '.html', '.css', '.java', '.c', '.cpp', '.php'}),
    )

    normalized = ext.lower()
    for label, members in groups:
        if normalized in members:
            return label
    return 'Other'


In [3]:
def copy_file_to_category(file_path, output_base):
    """Copy one file into ``<output_base>/<category>/<extension>/``.

    The category comes from ``classify_extension`` applied to the file's
    lower-cased suffix; files without a suffix go into a ``no_ext`` folder.

    An existing destination file is never overwritten. If a file with the
    same name is already present and is byte-identical to the source, it is
    treated as the copy (so re-running over a populated output tree does not
    create duplicates). If its content differs, the new file is stored under
    the next free ``<stem>_<n><suffix>`` name instead.

    Args:
        file_path: Path of the file to copy.
        output_base: Root directory of the organised output tree.

    Returns:
        ``(category, ext_folder_name)`` when the file is present in the
        destination afterwards, or ``None`` when it was skipped or the copy
        failed (a message is printed in those cases).
    """
    ext = Path(file_path).suffix.lower()
    category = classify_extension(ext)
    # `or` also covers a suffix that strips to '' and would otherwise place
    # the file directly in the category folder.
    ext_folder_name = ext.lstrip('.') or 'no_ext'

    dest_folder = os.path.join(output_base, category, ext_folder_name)

    if os.path.isfile(dest_folder):
        print(f"⚠️ Skipping: '{dest_folder}' exists as a file, not a folder.")
        return None

    try:
        # Inside the try so that an unusable destination path is reported
        # like any other per-file failure instead of aborting the whole run.
        os.makedirs(dest_folder, exist_ok=True)

        base_name = os.path.basename(file_path)
        stem, suffix = os.path.splitext(base_name)
        dest_path = os.path.join(dest_folder, base_name)
        attempt = 0
        # Different source directories frequently contain files with the same
        # name; find a free slot rather than clobbering an earlier copy.
        while os.path.lexists(dest_path):
            if os.path.isfile(dest_path) and filecmp.cmp(file_path, dest_path, shallow=False):
                return (category, ext_folder_name)  # identical copy already there
            attempt += 1
            dest_path = os.path.join(dest_folder, f"{stem}_{attempt}{suffix}")

        shutil.copy2(file_path, dest_path)
        return (category, ext_folder_name)
    except PermissionError:
        print(f"⛔ Skipped (Permission denied): {file_path}")
    except Exception as e:
        print(f"❌ Failed to copy {file_path}: {e}")
    return None


In [4]:
def extract_and_categorize_files_by_type(source_dir, output_dir):
    """Walk ``source_dir`` and copy every non-hidden file into ``output_dir``.

    Each file is handed to ``copy_file_to_category``, which decides the
    destination from the file extension. Files whose name starts with a dot
    are skipped. If ``output_dir`` lies inside ``source_dir`` it is excluded
    from the walk so previously organised copies are not processed again.

    Earlier revisions spawned the external ``file`` command for every file
    but never used its output; that call only slowed the scan down and made
    the function skip everything on systems without ``file``, so it has been
    removed.

    Args:
        source_dir: Root of the directory tree to scan.
        output_dir: Root of the organised output tree.

    Returns:
        A nested ``defaultdict`` mapping ``category -> extension folder ->
        number of files`` for which ``copy_file_to_category`` reported
        success.
    """
    summary = defaultdict(lambda: defaultdict(int))
    output_abs = os.path.abspath(output_dir)

    for root, dirs, files in os.walk(source_dir):
        # In-place edit of `dirs` tells os.walk (top-down) not to descend
        # into the output tree.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_abs
        ]

        for file in files:
            if file.startswith('.'):
                continue  # Skip hidden/system files
            file_path = os.path.join(root, file)

            copied = copy_file_to_category(file_path, output_dir)
            if copied:
                category, ext_folder_name = copied
                summary[category][ext_folder_name] += 1

    return summary


In [5]:
def print_summary(summary):
    """Print the per-category file counts produced by the categoriser.

    Args:
        summary: Mapping of ``category -> {extension folder: count}``.
            Categories are printed in the mapping's iteration order;
            extensions within a category are printed sorted by name.
    """
    for category in summary:
        counts_by_ext = summary[category]
        print(f"\n📁 {category}:")
        for ext in sorted(counts_by_ext):
            print(f"  • {ext}: {counts_by_ext[ext]}")

In [8]:
# Driver cell: organise the extracted dataset and print the per-category counts.
# NOTE(review): both paths are absolute and machine-specific; consider deriving
# them from a single configurable base directory so the notebook runs elsewhere.
source_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/"
output_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/organized_files/android"

# Walks source_dir, copies files into output_dir, and returns the counts
# as category -> extension folder -> number of files.
summary = extract_and_categorize_files_by_type(source_dir, output_dir)
print_summary(summary)


⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/misc/vold/user_keys/de/0/version
⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/misc/vold/user_keys/ce/0/current/version
⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/misc/textclassifier/metadata/classification/version
⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/misc/textclassifier/metadata/actions_suggestions/version
⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/misc/textclassifier/metadata/lang_id/version
⛔ Skipped (Permission denied): /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/system_ce/0/usagestats/version
⛔ Skipped (Permission denie