In [None]:
# %wget https://digitalcorpora.s3.amazonaws.com/corpora/mobile/ios_13_4_1/ios_13_4_1.zip -O ios_13_4_1.zip
# %wget https://digitalcorpora.s3.amazonaws.com/corpora/mobile/android_13/android_13_data.tar.gz

In [1]:
%wget https://digitalcorpora.s3.amazonaws.com/corpora/mobile/android_13/android_13_data.tar.gz -O android_13_data.tar.gz

UsageError: Line magic function `%wget` not found.


Extract Files from Image

In [3]:
%pip install -q py7zr
%pip install -q rarfile

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import py7zr
import rarfile
import shutil
import tarfile
import uuid
import zipfile
from collections import defaultdict

In [3]:
# 📂 Extraction Functions
def extract_zip(input_dir, output_dir):
    with zipfile.ZipFile(input_dir, 'r') as zip_ref:
        print(f"✅ Extracting ZIP from: {input_dir}")
        zip_ref.extractall(output_dir)
    print(f"✅ Extracted ZIP to: {output_dir}")

def extract_tar(input_dir, output_dir):
    with tarfile.open(input_dir, 'r:*') as tar_ref:
        print(f"✅ Extracting TAR from: {input_dir}")
        tar_ref.extractall(output_dir)
    print(f"✅ Extracted TAR to: {output_dir}")

def extract_7z(input_dir, output_dir):
    with py7zr.SevenZipFile(input_dir, mode='r') as archive:
        print(f"✅ Extracting 7z from: {input_dir}")
        archive.extractall(path=output_dir)
    print(f"✅ Extracted 7Z to: {output_dir}")

def extract_rar(input_dir, output_dir):
    try:
        with rarfile.RarFile(input_dir) as rar_ref:
            print(f"✅ Extracting RAR from: {input_dir}")
            rar_ref.extractall(path=output_dir)
        print(f"✅ Extracted RAR to: {output_dir}")
    except rarfile.NeedFirstVolume:
        print(f"❌ Need first volume for multi-part RAR: {input_dir}")
    except Exception as e:
        print(f"❌ Failed to extract RAR: {e}")

In [4]:
# 🧠 Main Function: Pass file path + base output directory
def extract_all_archives_in_directory(input_dir, output_dir):
    supported_exts = [".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar"]

    # Walk through the directory to find archive files
    for root, _, files in os.walk(input_dir):
        for file in files:
            ext = os.path.splitext(file)[1].lower()
            if ext in supported_exts:
                file_path = os.path.join(root, file)
                base_name = os.path.splitext(file)[0]
                output_path = os.path.join(output_dir, base_name)
                os.makedirs(output_path, exist_ok=True)

                try:
                    if ext == ".zip":
                        extract_zip(file_path, output_path)
                    elif ext in [".tar", ".gz", ".bz2", ".xz"]:
                        extract_tar(file_path, output_path)
                    elif ext == ".7z":
                        extract_7z(file_path, output_path)
                    elif ext == ".rar":
                        extract_rar(file_path, output_path)
                except Exception as e:
                    print(f"❌ Error extracting {file_path}: {e}")

In [5]:
input_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/images/"
output_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/"


In [6]:
extract_all_archives_in_directory(input_dir, output_dir)

✅ Extracting TAR from: /home/etinoxa/Documents/Python/Datasets/ai_forensics/images/android_13_data.tar.gz
❌ Error extracting /home/etinoxa/Documents/Python/Datasets/ai_forensics/images/android_13_data.tar.gz: [Errno 13] Permission denied: '/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/android_13_data.tar/data/user/0/com.instagram.android/lib-compressed/dso_lock'


In [60]:
def list_file_extensions(output_dir):
    extensions = set()

    for root, _, files in os.walk(output_dir):
        for file in files:
            _, ext = os.path.splitext(file)
            if ext and len(ext) < 5:  # Only include if extension is < 5 characters (e.g. '.txt')
                extensions.add(ext.lower())

    return sorted(extensions)


In [61]:
list_file_extensions(output_dir)

['.0',
 '.1',
 '.10',
 '.106',
 '.11',
 '.128',
 '.2',
 '.3',
 '.62',
 '.8',
 '.a',
 '.a10',
 '.a11',
 '.a12',
 '.a13',
 '.a8',
 '.a8x',
 '.a9',
 '.acv',
 '.adb',
 '.aif',
 '.app',
 '.art',
 '.ast',
 '.atx',
 '.avl',
 '.avt',
 '.baf',
 '.bdg',
 '.bin',
 '.bnk',
 '.br',
 '.btb',
 '.bz2',
 '.c',
 '.c3b',
 '.c3h',
 '.caf',
 '.car',
 '.cb0',
 '.cbm',
 '.cdm',
 '.cdt',
 '.cer',
 '.cfg',
 '.cid',
 '.cig',
 '.ck',
 '.com',
 '.cpp',
 '.crc',
 '.crt',
 '.csl',
 '.css',
 '.csv',
 '.ctb',
 '.cti',
 '.cts',
 '.cv',
 '.dat',
 '.db',
 '.der',
 '.dfu',
 '.dic',
 '.dis',
 '.dmc',
 '.dmg',
 '.dmu',
 '.dnn',
 '.doc',
 '.dom',
 '.ds',
 '.dtd',
 '.enh',
 '.ent',
 '.env',
 '.eot',
 '.et',
 '.exp',
 '.f16',
 '.fdt',
 '.fdx',
 '.flr',
 '.fnm',
 '.fs',
 '.fsh',
 '.fst',
 '.gen',
 '.gif',
 '.glb',
 '.gpg',
 '.gpu',
 '.gri',
 '.gz',
 '.h',
 '.h2',
 '.hdr',
 '.hfd',
 '.hmm',
 '.htm',
 '.icc',
 '.idx',
 '.img',
 '.ims',
 '.in',
 '.inl',
 '.int',
 '.inv',
 '.ios',
 '.ips',
 '.ir',
 '.jpg',
 '.js',
 '.kb',
 '.kf',


In [62]:

# 📂 File Type Classification
def classify_extension(ext):
    image_exts = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.svg'}
    audio_exts = {'.mp3', '.wav', '.aac', '.ogg', '.flac', '.m4a'}
    video_exts = {'.mp4', '.mkv', '.avi', '.mov', '.wmv', '.flv'}
    document_exts = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt', '.csv'}
    archive_exts = {'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2'}
    code_exts = {'.py', '.js', '.html', '.css', '.java', '.c', '.cpp', '.php'}

    if ext in image_exts:
        return 'Images'
    elif ext in audio_exts:
        return 'Audio'
    elif ext in video_exts:
        return 'Videos'
    elif ext in document_exts:
        return 'Documents'
    elif ext in archive_exts:
        return 'Archives'
    elif ext in code_exts:
        return 'Code'
    else:
        return 'Other'


In [79]:
# 📂 Count Extensions by Type
def count_extensions_by_type(output_dir):
    ext_count = defaultdict(int)
    type_grouped = defaultdict(lambda: defaultdict(int))

    for root, _, files in os.walk(output_dir):
        for file in files:
            _, ext = os.path.splitext(file)
            ext = ext.lower()
            if ext and len(ext) < 5:
                ext_count[ext] += 1
                file_type = classify_extension(ext)
                type_grouped[file_type][ext] += 1

    return type_grouped

In [80]:
result = count_extensions_by_type(output_dir)

for category, exts in result.items():
    print(f"\n📁 {category}:")
    for ext, count in exts.items():
        print(f"  • {ext}: {count}")

In [75]:
def analyze_and_collect_files(input_dir):
    """
    Returns a mapping of file_type -> ext -> [file_paths]
    """
    file_map = defaultdict(lambda: defaultdict(list))

    for root, _, files in os.walk(input_dir):
        for file in files:
            _, ext = os.path.splitext(file)
            ext = ext.lower()
            if ext and len(ext) < 6:  # Only include if extension is < 6 characters
                file_type = classify_extension(ext)
                file_path = os.path.join(root, file)
                file_map[file_type][ext].append(file_path)
    
    return file_map


In [76]:
def move_files_by_type(file_map, destination_base_dir):
    for file_type, ext_group in file_map.items():
        for ext, files in ext_group.items():
            target_folder = os.path.join(destination_base_dir, file_type, ext.strip('.'))
            os.makedirs(target_folder, exist_ok=True)

            for file_path in files:
                filename = os.path.basename(file_path)
                target_path = os.path.join(target_folder, filename)

                # Prevent overwrite by renaming if file exists using uuid
                if os.path.exists(target_path):
                    name, extension = os.path.splitext(filename)
                    unique_suffix = uuid.uuid4().hex[:6]
                    filename = f"{name}_{unique_suffix}{extension}"
                    target_path = os.path.join(target_folder, filename)

                try:
                    shutil.move(file_path, target_path)
                    print(f"✅ Moved {file_path} → {target_path}")
                except Exception as e:
                    print(f"❌ Failed to move {file_path}: {e}")


In [78]:
# Define your source (where the files are) and target folders
source_extracted_dir = output_dir      # Where your extracted files are
organized_output_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/organized_files"      # Where to move files into

# Step 1: Analyze and group files by type and extension
file_map = analyze_and_collect_files(source_extracted_dir)

# Step 2: Move files to new structure
move_files_by_type(file_map, organized_output_dir)


In [52]:
result = count_extensions_by_type(organized_output_dir)

for category, exts in result.items():
    print(f"\n📁 {category}:")
    for ext, count in exts.items():
        print(f"  • {ext}: {count}")


📁 Code:
  • .cpp: 4
  • .js: 3260
  • .c: 1
  • .css: 74
  • .py: 4

📁 Other:
  • .ser: 2
  • .10: 1
  • .a9: 2
  • .ds: 117
  • .log: 140
  • .lua: 3
  • .dmc: 22
  • .com: 1
  • .gpg: 4
  • .inv: 1
  • .xsl: 8
  • .a8: 1
  • .nib: 1518
  • .icc: 20
  • .out: 2
  • .11: 1
  • .tip: 1
  • .ent: 3
  • .sc: 2
  • .xsd: 14
  • .dmg: 2
  • .pb: 28
  • .kb: 2
  • .tgs: 31
  • .opx: 81
  • .p12: 13
  • .sct: 20
  • .tvd: 1
  • .tif: 44
  • .gpu: 1
  • .sig: 2
  • .csl: 1
  • .rt: 1
  • .sks: 19
  • .bin: 288
  • .ims: 1
  • .hfd: 1
  • .art: 68
  • .old: 1
  • .mdb: 2
  • .8: 2
  • .tvx: 1
  • .net: 56
  • .2: 1
  • .kf: 3
  • .re: 2
  • .sbd: 1
  • .scn: 45
  • .mdl: 23
  • .atx: 353
  • .der: 20
  • .cts: 1
  • .h2: 1
  • .ctb: 116
  • .psh: 1
  • .ios: 1
  • .spd: 1
  • .sst: 1
  • .rl: 2
  • .vfs: 1
  • .thm: 1
  • .pvr: 6
  • .md5: 1
  • .ver: 1
  • .pnr: 3
  • .dtd: 6
  • .pri: 3
  • .m4r: 96
  • .cb0: 1
  • .mom: 868
  • .cid: 2
  • .3: 3
  • .otf: 227
  • .a8x: 1
  • .tga: 1
  • .pb