In [None]:
# !wget https://digitalcorpora.s3.amazonaws.com/corpora/mobile/ios_13_4_1/ios_13_4_1.zip -O ios_13_4_1.zip

Extract Files from Image

In [3]:
%pip install -q py7zr
%pip install -q rarfile

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import zipfile
import tarfile
import py7zr
import rarfile

In [15]:
# 📂 Extraction Functions
def extract_zip(input_dir, output_dir):
    """Unpack a ZIP archive into ``output_dir``.

    Args:
        input_dir: Path of the ``.zip`` file to read (a file path, despite
            the parameter name).
        output_dir: Directory that receives the archive members.

    Errors raised by ``zipfile`` are not caught here; they propagate to the
    caller.
    """
    archive = zipfile.ZipFile(input_dir, mode='r')
    with archive:
        # Progress line is printed once the archive has been opened successfully.
        print(f"✅ Extracting ZIP from: {input_dir}")
        archive.extractall(path=output_dir)
    print(f"✅ Extracted ZIP to: {output_dir}")

def extract_tar(input_dir, output_dir):
    with tarfile.open(input_dir, 'r:*') as tar_ref:
        print(f"✅ Extracting TAR from: {input_dir}")
        tar_ref.extractall(output_dir)
    print(f"✅ Extracted TAR to: {output_dir}")

def extract_7z(input_dir, output_dir):
    """Unpack a 7z archive into ``output_dir`` using ``py7zr``.

    Args:
        input_dir: Path of the ``.7z`` file to read (a file path, despite the
            parameter name).
        output_dir: Directory that receives the archive members.

    Errors raised by ``py7zr`` are not caught here; they propagate to the
    caller.
    """
    with py7zr.SevenZipFile(input_dir, mode='r') as seven_zip:
        # Progress line is printed once the archive has been opened successfully.
        print(f"✅ Extracting 7z from: {input_dir}")
        seven_zip.extractall(path=output_dir)
    print(f"✅ Extracted 7Z to: {output_dir}")

def extract_rar(input_dir, output_dir):
    """Unpack a RAR archive into ``output_dir`` using ``rarfile``.

    Unlike the other extractors in this notebook, failures are reported with
    a printed message and are not re-raised.

    Args:
        input_dir: Path of the ``.rar`` file to read (a file path, despite the
            parameter name).
        output_dir: Directory that receives the archive members.
    """
    try:
        with rarfile.RarFile(input_dir) as archive:
            print(f"✅ Extracting RAR from: {input_dir}")
            archive.extractall(path=output_dir)
        print(f"✅ Extracted RAR to: {output_dir}")
    except rarfile.NeedFirstVolume:
        # Dedicated message for the multi-part case, per the exception name.
        print(f"❌ Need first volume for multi-part RAR: {input_dir}")
    except Exception as err:
        # Best-effort: report any other failure and return normally.
        print(f"❌ Failed to extract RAR: {err}")

In [16]:
# 🧠 Main Function: Pass the directory to scan for archives + base output directory
def extract_all_archives_in_directory(input_dir, output_dir):
    """Extract every supported archive found under ``input_dir``.

    The tree below ``input_dir`` is walked recursively. Each file whose
    (case-insensitive) extension is supported is extracted into
    ``output_dir/<file name without its last extension>``, which is created
    if needed. ``.tar``, ``.gz``, ``.bz2`` and ``.xz`` files are all routed
    to ``extract_tar``.

    If ``output_dir`` lies inside ``input_dir`` it is skipped during the
    walk, so archives contained in earlier extraction results are not
    extracted again when the cell is re-run.

    Args:
        input_dir: Directory to scan for archives.
        output_dir: Base directory under which one sub-folder per archive is
            created.

    Extraction errors are printed per file and do not stop the scan; errors
    while creating an output folder propagate.
    """
    # One dispatch table instead of parallel extension lists and an if/elif chain.
    extractors = {".zip": extract_zip, ".7z": extract_7z, ".rar": extract_rar}
    extractors.update((tar_ext, extract_tar) for tar_ext in (".tar", ".gz", ".bz2", ".xz"))

    output_root = os.path.normcase(os.path.abspath(output_dir))

    for root, dirs, files in os.walk(input_dir):
        # Prune in place (os.walk honours this when walking top-down) so the
        # extraction target is never scanned as if it were input.
        dirs[:] = [
            d for d in dirs
            if os.path.normcase(os.path.abspath(os.path.join(root, d))) != output_root
        ]

        for file in files:
            base_name, ext = os.path.splitext(file)
            extractor = extractors.get(ext.lower())
            if extractor is None:
                continue

            file_path = os.path.join(root, file)
            output_path = os.path.join(output_dir, base_name)
            os.makedirs(output_path, exist_ok=True)

            try:
                extractor(file_path, output_path)
            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"❌ Error extracting {file_path}: {e}")

In [18]:
# NOTE(review): absolute, machine-specific paths — adjust before running on another machine.
input_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/images/"
output_dir = "/home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/"

# Walk input_dir and unpack each supported archive into its own sub-folder of output_dir.
extract_all_archives_in_directory(input_dir, output_dir)

✅ Extracting TAR from: /home/etinoxa/Documents/Python/Datasets/ai_forensics/images/13-4-1.tar
✅ Extracted TAR to: /home/etinoxa/Documents/Python/Datasets/ai_forensics/extracted/13-4-1


In [22]:
def list_file_extensions(output_dir):
    """Return the sorted, lower-cased short file extensions found under ``output_dir``.

    The directory tree is walked recursively. An extension is kept only when
    it is non-empty and shorter than five characters including the leading
    dot (e.g. ``'.txt'``); longer suffixes are ignored.

    Args:
        output_dir: Root directory to scan.

    Returns:
        A sorted list of unique extensions, each including its leading dot.
    """
    suffixes = (
        os.path.splitext(name)[1]
        for _, _, names in os.walk(output_dir)
        for name in names
    )
    # Length is checked on the suffix as found on disk, before lower-casing.
    return sorted({suffix.lower() for suffix in suffixes if 0 < len(suffix) < 5})


In [24]:
# Show the distinct short extensions (under five characters, dot included) present in the extracted tree.
list_file_extensions(output_dir)

['.0',
 '.1',
 '.10',
 '.106',
 '.11',
 '.128',
 '.2',
 '.3',
 '.62',
 '.8',
 '.a',
 '.a10',
 '.a11',
 '.a12',
 '.a13',
 '.a8',
 '.a8x',
 '.a9',
 '.acv',
 '.adb',
 '.aif',
 '.app',
 '.art',
 '.ast',
 '.atx',
 '.avl',
 '.avt',
 '.baf',
 '.bdg',
 '.bin',
 '.bnk',
 '.br',
 '.btb',
 '.bz2',
 '.c',
 '.c3b',
 '.c3h',
 '.caf',
 '.car',
 '.cb0',
 '.cbm',
 '.cdm',
 '.cdt',
 '.cer',
 '.cfg',
 '.cid',
 '.cig',
 '.ck',
 '.com',
 '.cpp',
 '.crc',
 '.crt',
 '.csl',
 '.css',
 '.csv',
 '.ctb',
 '.cti',
 '.cts',
 '.cv',
 '.dat',
 '.db',
 '.der',
 '.dfu',
 '.dic',
 '.dis',
 '.dmc',
 '.dmg',
 '.dmu',
 '.dnn',
 '.doc',
 '.dom',
 '.ds',
 '.dtd',
 '.enh',
 '.ent',
 '.env',
 '.eot',
 '.et',
 '.exp',
 '.f16',
 '.fdt',
 '.fdx',
 '.flr',
 '.fnm',
 '.fs',
 '.fsh',
 '.fst',
 '.gen',
 '.gif',
 '.glb',
 '.gpg',
 '.gpu',
 '.gri',
 '.gz',
 '.h',
 '.h2',
 '.hdr',
 '.hfd',
 '.hmm',
 '.htm',
 '.icc',
 '.idx',
 '.img',
 '.ims',
 '.in',
 '.inl',
 '.int',
 '.inv',
 '.ios',
 '.ips',
 '.ir',
 '.jpg',
 '.js',
 '.kb',
 '.kf',
