<a href="https://colab.research.google.com/github/fatcrapinmybutt/fredprime-legal-system/blob/main/Copy_of_Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive under /content/drive so the scan root below is reachable.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime.
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, sys, json, sqlite3, hashlib, shutil, subprocess, re
from pathlib import Path
from datetime import datetime, timezone

# ---- Scan root configuration ----
# Auto-detection of the scan root (prefers D:/ for Windows; falls back for Colab)
# is currently disabled; the original logic is kept below, commented out.
# CANDIDATES = ["D:/", "/content/drive/MyDrive", "/content", str(Path.cwd())]
# ROOTS = [Path(p) for p in CANDIDATES if Path(p).exists()]
# if not ROOTS:
#     print("No valid scan root found."); sys.exit(1)

# For Colab, we will use a default root directory
# NOTE(review): this path is not checked for existence here — confirm the folder
# name matches the Drive folder exactly.
ROOTS = [Path("/content/drive/MyDrive/litgation_OS$")]  # Default root directory

# How files are placed into the organized tree.
ACTION = "copy"  # copy|move|link
# Run pytesseract OCR on images.
OCR_ENABLED = True
# Create Bates-numbered copies of PDFs.
BATES_ENABLED = True
# Prefix and first sequence number for Bates identifiers.
BATES_PREFIX = "AJP"
BATES_START = 1

# UTC timestamp of this run; makes each run's output directory name unique and sortable.
RUN_STAMP = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
# Output directory for this run, created under the current working directory.
OUTDIR = Path.cwd() / f"OUTPUT_{RUN_STAMP}"

# pip distribution names this notebook depends on.
REQ = [
    "pandas",
    "Pillow",
    "python-docx",
    "PyPDF2",
    "pytesseract",
    "pdfminer.six",
    "chardet",
]
# Importable module name -> pip distribution name.  The two differ for several
# packages (PIL/Pillow, docx/python-docx, pdfminer/pdfminer.six), so the
# presence check must use the module name, not the pip name.
_REQ_MODULES = {
    "pandas": "pandas",
    "PIL": "Pillow",
    "docx": "python-docx",
    "PyPDF2": "PyPDF2",
    "pytesseract": "pytesseract",
    "pdfminer": "pdfminer.six",
    "chardet": "chardet",
}
try:
    import importlib
    import importlib.util

    miss = [
        pkg for mod, pkg in _REQ_MODULES.items() if importlib.util.find_spec(mod) is None
    ]
    if miss:
        subprocess.check_call([sys.executable, "-m", "pip", "install", *miss])
        # Make packages installed a moment ago visible to the import system.
        importlib.invalidate_caches()
except Exception as e:
    # Best effort: a failed install is reported here; the imports below will
    # raise if something is genuinely missing.
    print("dep warn:", e)

import pandas as pd
from PIL import Image, ExifTags
from docx import Document as DocxDocument
from PyPDF2 import PdfReader, PdfWriter
import pytesseract, chardet
from pdfminer.high_level import extract_text as pdfminer_extract_text


def ensure_dir(p: Path):
    """Make sure directory *p* exists (creating missing parents) and return *p*."""
    if not p.is_dir():
        p.mkdir(parents=True, exist_ok=True)
    return p


def sha256_file(p: Path, buf=1 << 20):
    """Return the hex SHA-256 digest of file *p*, read in chunks of *buf* bytes."""
    digest = hashlib.sha256()
    with p.open("rb") as fh:
        while True:
            chunk = fh.read(buf)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def detect_encoding(b: bytes):
    """Guess the text encoding of *b* with chardet, defaulting to "utf-8".

    Returns "utf-8" when chardet reports no encoding or when detection fails.
    """
    try:
        return (chardet.detect(b) or {}).get("encoding") or "utf-8"
    except Exception:
        # Detection is best effort; do not swallow KeyboardInterrupt/SystemExit.
        return "utf-8"


def text_preview(s, n=1400):
    """Collapse every whitespace run in *s* to one space and keep the first *n* chars.

    A falsy *s* (None or "") yields an empty string.
    """
    collapsed = re.sub(r"\s+", " ", s if s else "")
    return collapsed[:n]


def exif_dict(img):
    """Return the EXIF data of *img* as ``{tag_name: value}``.

    Tag ids without a known name are keyed by their number as a string.
    Byte values are decoded as UTF-8 with undecodable bytes dropped, so one
    malformed value no longer discards the whole dictionary.  Any failure to
    read EXIF data yields an empty dict.
    """
    try:
        raw = img.getexif() or {}
        return {
            ExifTags.TAGS.get(tag, str(tag)): (
                val.decode("utf-8", "ignore") if isinstance(val, bytes) else val
            )
            for tag, val in raw.items()
        }
    except Exception:
        return {}


class Org:
    """Hash, organize and index every file found under the configured scan roots.

    Configuration comes from the module-level settings ROOTS, OUTDIR, ACTION,
    OCR_ENABLED, BATES_ENABLED, BATES_PREFIX and BATES_START.  Results are
    written below OUTDIR: ORGANIZED/ (content-addressed files), OCR/ (image OCR
    text), BATES/ (Bates-named PDF copies), evidence.db, evidence.csv and
    MANIFEST.json.
    """

    def __init__(self):
        """Create the output tree and open (creating if needed) the SQLite index."""
        self.roots = ROOTS
        self.out = ensure_dir(OUTDIR)
        self.org = ensure_dir(self.out / "ORGANIZED")
        self.ocr = ensure_dir(self.out / "OCR")
        self.bates_dir = ensure_dir(self.out / "BATES")
        self.db = sqlite3.connect(self.out / "evidence.db")
        self.db.execute(
            "CREATE TABLE IF NOT EXISTS files(sha TEXT PRIMARY KEY, path TEXT, organized TEXT, bates TEXT, meta TEXT)"
        )
        self.bates = BATES_START

    def meta(self, p: Path):
        """Return lightweight metadata for *p*, selected by file extension.

        PDFs yield a page count and text preview, .docx files a text preview,
        images their format/mode/size/EXIF (plus OCR text when OCR_ENABLED),
        and plain-text formats a decoded preview.  Unknown extensions yield
        ``{}``; a failure yields ``{"err": message}``.
        """
        e = p.suffix.lower()
        try:
            if e == ".pdf":
                m = {"pages": len(PdfReader(str(p)).pages)}
                try:
                    m["text"] = text_preview(pdfminer_extract_text(str(p)))
                except Exception as ex:
                    # Text extraction is best effort; keep the page count.
                    m["text_err"] = str(ex)
                return m
            if e == ".docx":
                d = DocxDocument(str(p))
                return {"text": text_preview("\n".join(x.text for x in d.paragraphs))}
            if e in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".gif", ".webp"):
                with Image.open(str(p)) as im:
                    m = {
                        "format": im.format,
                        "mode": im.mode,
                        "size": im.size,
                        "exif": exif_dict(im),
                    }
                    if OCR_ENABLED:
                        try:
                            txt = pytesseract.image_to_string(im)
                            # NOTE(review): the OCR file is named after the stem
                            # only, so same-named images in different folders
                            # overwrite each other's text.
                            (self.ocr / (p.stem + ".txt")).write_text(
                                txt, encoding="utf-8", errors="ignore"
                            )
                            m["ocr"] = text_preview(txt)
                        except Exception as ex:
                            m["ocr_err"] = str(ex)
                    return m
            if e in (
                ".txt",
                ".log",
                ".md",
                ".json",
                ".jsonl",
                ".csv",
                ".ini",
                ".conf",
                ".yml",
                ".yaml",
            ):
                # Only the first 400 kB are inspected for the preview.
                b = p.read_bytes()[:400_000]
                return {"text": text_preview(b.decode(detect_encoding(b), "ignore"))}
        except Exception as ex:
            return {"err": str(ex)}
        return {}

    def place(self, src: Path, sha: str):
        """Place *src* at ORGANIZED/<sha[:2]>/<sha><ext> according to ACTION.

        An existing destination is left untouched.  If the configured action
        fails (for example a hard link across filesystems) a plain copy is
        attempted instead.  Returns the destination path.
        """
        dst = self.org / sha[:2] / (sha + src.suffix.lower())
        ensure_dir(dst.parent)
        if not dst.exists():
            try:
                if ACTION == "move":
                    shutil.move(src, dst)
                elif ACTION == "link":
                    os.link(src, dst)
                else:
                    # "copy" and any unrecognised action
                    shutil.copy2(src, dst)
            except Exception:
                shutil.copy2(src, dst)
        return dst

    def run(self):
        """Walk every root, process each file, then write the DB, CSV and manifest."""
        recs = []
        try:
            for root in self.roots:
                if not Path(root).is_dir():
                    # os.walk would silently yield nothing for a missing root.
                    print("[warn] scan root not found:", root)
                    continue
                for dp, _, files in os.walk(root):
                    for name in files:
                        p = Path(dp) / name
                        # Never re-ingest our own output tree.
                        if self.out in p.parents:
                            continue
                        try:
                            sha = sha256_file(p)
                            meta = self.meta(p)
                            dst = self.place(p, sha)
                            bid = ""
                            if BATES_ENABLED and p.suffix.lower() == ".pdf":
                                bid = f"{BATES_PREFIX}-{self.bates:07d}"
                                self.bates += 1
                                shutil.copy2(dst, self.bates_dir / f"{sha}_{bid}.pdf")
                            self.db.execute(
                                "INSERT OR REPLACE INTO files VALUES(?,?,?,?,?)",
                                (
                                    sha,
                                    str(p),
                                    str(dst),
                                    bid,
                                    json.dumps(meta, ensure_ascii=False),
                                ),
                            )
                            recs.append(
                                {
                                    "path": str(p),
                                    "organized": str(dst),
                                    "sha": sha,
                                    "bates": bid,
                                    **meta,
                                }
                            )
                        except Exception as e:
                            print("fail:", p, e)
            self.db.commit()
        finally:
            self.db.close()

        df = pd.DataFrame(recs)
        if df.empty:
            # Keep a header row so the CSV stays parseable when nothing was scanned.
            df = pd.DataFrame(columns=["path", "organized", "sha", "bates"])
        df.to_csv(self.out / "evidence.csv", index=False)

        first_root = str(self.roots[0]) if self.roots else ""
        (self.out / "MANIFEST.json").write_text(
            json.dumps(
                {
                    "roots": [str(r) for r in self.roots],
                    "scanned_root": first_root,
                    "outdir": str(self.out),
                    "files": len(recs),
                    "utc_finished": datetime.now(timezone.utc).isoformat(),
                },
                indent=2,
            )
        )
        print("[scan root]", first_root)
        print("[done]", self.out)


# Run the scan immediately with the module-level configuration above.
Org().run()

[scan root] /content/drive/MyDrive/litgation_OS$
[done] /content/OUTPUT_20250908_170554


In [None]:
import pandas as pd
import os

# Find the most recent OUTPUT_* directory under /content.  The names embed a
# UTC timestamp (OUTPUT_YYYYmmdd_HHMMSS), so lexicographic order is chronological.
output_dirs = [d for d in os.listdir("/content") if d.startswith("OUTPUT_")]
latest_output_dir = sorted(output_dirs)[-1] if output_dirs else None

if latest_output_dir:
    csv_path = os.path.join("/content", latest_output_dir, "evidence.csv")
    if os.path.exists(csv_path):
        try:
            df_evidence = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A run that recorded no files can leave a CSV without a header row.
            df_evidence = pd.DataFrame()
            print(f"Error: {csv_path} is empty; the scan recorded no files.")
        else:
            display(df_evidence)
    else:
        print(f"Error: {csv_path} not found.")
else:
    print("Error: No OUTPUT directory found.")

EmptyDataError: No columns to parse from file

In [None]:
#!/usr/bin/env python3
# EVIDENCE_ORGANIZER.py
# Purpose: Forensically ingest, organize, and process evidence files.
# Outputs:
#   OUTPUT/
#     evidence.db                  (SQLite)
#     evidence.csv                 (flat table)
#     evidence.jsonl               (full JSON lines)
#     MANIFEST.json                (summary + run config)
#     LEDGER.jsonl                 (chain-of-custody style events)
#     ORGANIZED/                   (copy|move|link by type/date/hash)
#     UNPACKED/                    (safe archive extraction)
#     THUMBNAILS/                  (image thumbs)
#     OCR/                         (OCR text for images and image-only PDFs)
#     BATES/                       (Bates-stamped PDF copies if enabled)
#
# Dependencies auto-install (if allowed): pip, pandas, pillow, python-docx, PyPDF2, pikepdf, pytesseract, pdfminer.six, chardet, tqdm
# Optional external tools: tesseract-ocr, ffprobe (from ffmpeg), exiftool (fallback)
#
# Usage:
#   python EVIDENCE_ORGANIZER.py --root "F:\Evidence" --action copy --unpack --bates "AJP" --bates-start 1
#   python EVIDENCE_ORGANIZER.py --root "/path/in" "/other/path" --threads 8 --no-ocr --no-bates
#
# Notes:
# - Original file contents are never modified; the organized tree is separate. Note that "--action move" relocates the originals into ORGANIZED/.
# - Bates stamping overlays page footer if ReportLab present; else filename-only Bates copies with mapping are created.
# - OCR requires Tesseract installed and on PATH. Without it, OCR is skipped gracefully.
# - Archive extraction is safe-mode: blocks absolute paths and .. traversal.
# - Duplicate detection by SHA256. Duplicates listed and can be linked/copied to ORGANIZED/DEDUP but not deleted.

import os, sys, json, sqlite3, hashlib, zipfile, tarfile, io, shutil, subprocess, argparse, re, time
from pathlib import Path
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed

# ---------- Dependency management ----------
# Each entry is (importable module name, pip distribution name); the two differ
# for several packages, e.g. PIL/Pillow and docx/python-docx.
REQUIRED_PY = [
    ("pandas", "pandas"),
    ("PIL", "Pillow"),
    ("docx", "python-docx"),
    ("PyPDF2", "PyPDF2"),
    ("pikepdf", "pikepdf"),
    ("pytesseract", "pytesseract"),
    ("pdfminer", "pdfminer.six"),
    ("chardet", "chardet"),
    ("tqdm", "tqdm"),
]


def ensure_deps():
    """Install any package from REQUIRED_PY whose module cannot be imported.

    Every module is probed with ``importlib.import_module``; the pip names of
    those that fail are installed in a single ``pip install`` call.  The
    function never raises: any failure is printed as a warning and the
    module-level imports that follow will surface what is still missing.
    """
    try:
        import importlib

        missing = []
        for mod, pkg in REQUIRED_PY:
            try:
                importlib.import_module(mod)
            except Exception:
                missing.append(pkg)
        if missing:
            # Try installing
            print(f"[deps] Installing: {missing}")
            subprocess.check_call([sys.executable, "-m", "pip", "install", *missing])
            # Let the import system see the packages installed a moment ago.
            importlib.invalidate_caches()
    except Exception as e:
        print(f"[deps] Warning: failed auto-install: {e}")


# Must run before the third-party imports below so missing packages get installed first.
ensure_deps()

# Now import installed modules
import pandas as pd
from PIL import Image, ExifTags
from docx import Document as DocxDocument
from PyPDF2 import PdfReader, PdfWriter
import pytesseract
from pdfminer.high_level import extract_text as pdfminer_extract_text
import chardet
from tqdm import tqdm

# ---------- Helpers ----------
# Version string recorded in the run manifest.
APP_VERSION = "1.7.0"
# UTC timestamp (second precision, "Z" suffix) captured once at import time.
NOW_ISO = datetime.utcnow().replace(microsecond=0).isoformat() + "Z"
# Archive extensions considered for unpacking.  Multi-part entries such as
# ".tar.gz" only match when compared against the full file name; Path.suffix
# yields just the last component (".gz").
SAFE_ARCHIVE_EXT = {
    ".zip",
    ".tar",
    ".tgz",
    ".tar.gz",
    ".tar.bz2",
    ".tar.xz",
    ".7z",
}  # 7z read via external if present
# Extension sets used to classify files into doc / pdf / img / txt / av.
DOC_EXT = {".docx", ".doc", ".rtf", ".odt", ".pages"}
PDF_EXT = {".pdf"}
IMG_EXT = {".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".gif", ".webp", ".heic"}
TXT_EXT = {
    ".txt",
    ".md",
    ".csv",
    ".log",
    ".json",
    ".jsonl",
    ".yml",
    ".yaml",
    ".ini",
    ".conf",
}
# Audio and video containers.
AV_EXT = {
    ".mp3",
    ".wav",
    ".m4a",
    ".aac",
    ".flac",
    ".ogg",
    ".mp4",
    ".mkv",
    ".mov",
    ".avi",
    ".wmv",
    ".webm",
}


def sha256_file(p: Path, buf_size=1024 * 1024) -> str:
    """Stream file *p* through SHA-256 in *buf_size*-byte reads and return the hex digest."""
    hasher = hashlib.sha256()
    with p.open("rb") as stream:
        for block in iter(lambda: stream.read(buf_size), b""):
            hasher.update(block)
    return hasher.hexdigest()


def bytes_safe_read(p: Path, limit=5_000_000):
    """Return at most *limit* bytes from the start of file *p*."""
    handle = p.open("rb")
    try:
        return handle.read(limit)
    finally:
        handle.close()


def detect_encoding(data: bytes):
    """Guess the text encoding of *data* with chardet.

    Returns "utf-8" for empty input, when chardet reports no encoding, or when
    detection raises.
    """
    if not data:
        return "utf-8"
    try:
        d = chardet.detect(data)
        return d.get("encoding") or "utf-8"
    except Exception:
        # Best effort, but let KeyboardInterrupt/SystemExit propagate.
        return "utf-8"


def run_ffprobe(path: Path):
    """Run ``ffprobe`` on *path* and return its parsed JSON report.

    Returns None when ffprobe is unavailable, fails, times out (30 s) or
    produces output that is not valid JSON.
    """
    try:
        argv = ["ffprobe", "-v", "error", "-print_format", "json"]
        argv += ["-show_format", "-show_streams", str(path)]
        raw = subprocess.check_output(argv, stderr=subprocess.STDOUT, timeout=30)
        report = json.loads(raw.decode("utf-8", "ignore"))
    except Exception:
        return None
    return report


def have_reportlab():
    """Return True when the optional ``reportlab`` package can be imported."""
    try:
        import reportlab  # noqa: F401  (imported only to test availability)
    except Exception:
        # A missing or broken install both mean "not usable"; unlike a bare
        # except this does not swallow KeyboardInterrupt/SystemExit.
        return False
    return True


def text_preview(s: str, n=1200):
    """Return the first *n* characters of *s* after squeezing whitespace runs to one space.

    A falsy *s* is treated as the empty string.
    """
    return re.sub(r"\s+", " ", s or "")[:n]


def _exif_json_safe(value):
    """Convert one EXIF value into something ``json.dumps`` can serialize."""
    if isinstance(value, bytes):
        return value.decode("utf-8", "ignore")
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, (list, tuple)):
        return [_exif_json_safe(item) for item in value]
    if isinstance(value, dict):
        return {str(k): _exif_json_safe(v) for k, v in value.items()}
    # Rational numbers and other library-specific types.
    return str(value)


def exif_dict(img: Image.Image):
    """Return the EXIF data of *img* as a JSON-serializable ``{tag_name: value}`` dict.

    Tag ids without a known name are keyed by their number as a string.
    Values that ``json.dumps`` cannot handle (bytes, rationals, nested
    containers of those) are converted, because the result is embedded in
    metadata that is later serialized to JSON.  Any failure yields ``{}``.
    """
    try:
        raw = img.getexif()
        if not raw:
            return {}
        return {
            ExifTags.TAGS.get(tag, str(tag)): _exif_json_safe(val)
            for tag, val in raw.items()
        }
    except Exception:
        return {}


def is_image_only_pdf(pdf_path: Path):
    """Heuristically decide whether *pdf_path* has no extractable text layer.

    The file must first open with PdfReader; pdfminer's extracted text is then
    checked.  Returns True only when that text is empty or whitespace, and
    False when any text is found or when either step raises.
    """
    try:
        # Opened only to confirm the file parses as a PDF.
        PdfReader(str(pdf_path))
        extracted = pdfminer_extract_text(str(pdf_path))
        has_text = bool(extracted and extracted.strip())
        return not has_text
    except Exception:
        return False


def safe_extract_zip(zp: Path, outdir: Path):
    """Extract ZIP archive *zp* into *outdir*, skipping unsafe members.

    Skipped: members with an absolute name, with a ``..`` path component, or
    whose resolved destination falls outside *outdir*.  Directory entries are
    created as directories; previously they were written as empty files, which
    made extraction of their contents fail.
    """
    base = Path(outdir).resolve()
    with zipfile.ZipFile(zp, "r") as z:
        for m in z.infolist():
            name = m.filename
            if name.startswith("/") or ".." in Path(name).parts:
                continue
            dest = outdir / name
            # Second line of defence: the final location must stay inside outdir.
            resolved = dest.resolve()
            if resolved != base and base not in resolved.parents:
                continue
            if m.is_dir():
                dest.mkdir(parents=True, exist_ok=True)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            with z.open(m, "r") as src, dest.open("wb") as dst:
                shutil.copyfileobj(src, dst)


def safe_extract_tar(tp: Path, outdir: Path):
    """Extract tar archive *tp* (optionally gz/bz2/xz compressed) into *outdir*.

    Only directories and regular files are materialized.  Skipped: members
    with an absolute name, with a ``..`` path component, whose resolved
    destination falls outside *outdir*, and every non-regular member
    (symlinks, hard links, devices, FIFOs).  Those previously either crashed
    the extraction or had their link target's content copied.
    """
    base = Path(outdir).resolve()
    # Mode "r" lets tarfile detect the compression itself.
    with tarfile.open(tp, "r") as t:
        for m in t.getmembers():
            name = m.name
            if name.startswith("/") or ".." in Path(name).parts:
                continue
            dest = outdir / name
            resolved = dest.resolve()
            if resolved != base and base not in resolved.parents:
                continue
            if m.isdir():
                dest.mkdir(parents=True, exist_ok=True)
                continue
            if not m.isfile():
                continue
            src = t.extractfile(m)
            if src is None:
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            with src, dest.open("wb") as dst:
                shutil.copyfileobj(src, dst)

def ensure_dir(p: Path):
    """Create directory *p* together with any missing parents, then return *p*."""
    if not p.is_dir():
        p.mkdir(parents=True, exist_ok=True)
    return p


def bates_id(prefix: str, n: int) -> str:
    """Format a Bates identifier: *prefix*, a dash, then *n* zero-padded to 7 digits."""
    return "{}-{:07d}".format(prefix, n)


# ---------- Core processor ----------
class EvidenceOrganizer:
    def __init__(
        self,
        roots,
        outdir,
        threads=6,
        action="copy",
        do_ocr=True,
        do_unpack=False,
        do_bates=True,
        bates_prefix="BATES",
        bates_start=1,
    ):
        """Resolve input/output paths, create the output tree and open the database.

        roots        -- iterable of directories to scan
        outdir       -- directory that receives every output artifact
        threads      -- worker count for the processing pool
        action       -- "copy", "move" or "link" for the organized tree
        do_ocr       -- OCR images and image-only PDFs
        do_unpack    -- extract supported archives
        do_bates     -- produce Bates-numbered PDF copies
        bates_prefix -- prefix of each Bates id
        bates_start  -- first Bates sequence number
        """
        self.roots = [Path(r).resolve() for r in roots]
        out = Path(outdir).resolve()
        self.outdir = out

        # Flat output files.
        self.db_path = out / "evidence.db"
        self.csv_path = out / "evidence.csv"
        self.jsonl_path = out / "evidence.jsonl"
        self.manifest_path = out / "MANIFEST.json"
        self.ledger_path = out / "LEDGER.jsonl"

        # Output sub-directories, created eagerly.
        organized = ensure_dir(out / "ORGANIZED")
        self.org_dir = organized
        self.unpack_dir = ensure_dir(out / "UNPACKED")
        self.thumb_dir = ensure_dir(out / "THUMBNAILS")
        self.ocr_dir = ensure_dir(out / "OCR")
        self.bates_dir = ensure_dir(out / "BATES")
        self.dup_dir = ensure_dir(organized / "DEDUP")

        # Run options.
        self.threads = threads
        self.action = action  # copy|move|link
        self.do_ocr = do_ocr
        self.do_unpack = do_unpack
        self.do_bates = do_bates
        self.bates_prefix = bates_prefix
        self.bates_counter = bates_start

        self.db = None
        self._init_db()

    # ----- DB -----
    def _init_db(self):
        ensure_dir(self.outdir)
        self.db = sqlite3.connect(self.db_path)
        self.db.execute(
            """
        CREATE TABLE IF NOT EXISTS files(
            id INTEGER PRIMARY KEY,
            path TEXT NOT NULL,
            relpath TEXT NOT NULL,
            root TEXT NOT NULL,
            size INTEGER,
            mtime REAL,
            ctime REAL,
            ext TEXT,
            mime TEXT,
            sha256 TEXT UNIQUE,
            type TEXT,      -- class: pdf/doc/img/av/txt/other
            meta_json TEXT, -- extracted metadata
            ocr_path TEXT,  -- OCR output text file if any
            organized_path TEXT, -- copy/move/link target
            bates_id TEXT   -- if stamped
        );"""
        )
        self.db.execute(
            "CREATE UNIQUE INDEX IF NOT EXISTS idx_sha256 ON files(sha256);"
        )
        self.db.execute("CREATE INDEX IF NOT EXISTS idx_type ON files(type);")
        self.db.commit()

    def _insert_or_update(self, rec: dict):
        cols = [
            "path",
            "relpath",
            "root",
            "size",
            "mtime",
            "ctime",
            "ext",
            "mime",
            "sha256",
            "type",
            "meta_json",
            "ocr_path",
            "organized_path",
            "bates_id",
        ]
        vals = [rec.get(k) for k in cols]
        try:
            self.db.execute(
                f"""
                INSERT INTO files({",".join(cols)}) VALUES ({",".join(["?"]*len(cols))})
                ON CONFLICT(sha256) DO UPDATE SET
                    path=excluded.path,
                    relpath=excluded.relpath,
                    root=excluded.root,
                    size=excluded.size,
                    mtime=excluded.mtime,
                    ctime=excluded.ctime,
                    ext=excluded.ext,
                    mime=excluded.mime,
                    type=excluded.type,
                    meta_json=excluded.meta_json,
                    ocr_path=excluded.ocr_path,
                    organized_path=excluded.organized_path,
                    bates_id=excluded.bates_id
            """,
                vals,
            )
            self.db.commit()
        except sqlite3.Error as e:
            print(f"[db] {e} for {rec.get('path')}")

    # ----- Chain-of-custody ledger -----
    def log_event(self, ev: dict):
        ev = dict(ev)
        ev["ts"] = datetime.utcnow().isoformat() + "Z"
        with self.ledger_path.open("a", encoding="utf-8") as f:
            f.write(json.dumps(ev, ensure_ascii=False) + "\n")

    # ----- File classification -----
    def classify(self, p: Path):
        """Map *p*'s lower-cased extension to "pdf", "doc", "img", "txt", "av" or "other"."""
        suffix = p.suffix.lower()
        # Checked in this order; the first family containing the suffix wins.
        families = (
            ("pdf", PDF_EXT),
            ("doc", DOC_EXT),
            ("img", IMG_EXT),
            ("txt", TXT_EXT),
            ("av", AV_EXT),
        )
        for label, extensions in families:
            if suffix in extensions:
                return label
        return "other"

    # ----- Metadata extractors -----
    def meta_pdf(self, p: Path):
        """Collect page count, document-info fields and a text preview for PDF *p*.

        A failure while reading the PDF is reported under "error"; a failure
        of text extraction alone leaves "text_preview" empty.
        """
        result = {}
        try:
            pdf = PdfReader(str(p))
            doc_info = pdf.metadata or {}
            result["pages"] = len(pdf.pages)
            result["pdf_info"] = {key: str(val) for key, val in doc_info.items()}
            # Quick text preview (best effort).
            try:
                preview = text_preview(pdfminer_extract_text(str(p)))
            except Exception:
                preview = ""
            result["text_preview"] = preview
        except Exception as e:
            result["error"] = f"{e}"
        return result

    def meta_doc(self, p: Path):
        """Extract metadata for a word-processing document.

        For .docx: the public, non-callable core properties ("docx_props") and
        a text preview built from the first 200 paragraphs.  Property values
        are converted to JSON-friendly types (dates become ISO-8601 strings)
        because the result is later passed to ``json.dumps``.  Every other
        extension gets a best-effort "raw_preview" of its first 1.5 MB decoded
        with the detected encoding.  Failures are reported under "error".
        """
        d = {}
        try:
            if p.suffix.lower() == ".docx":
                doc = DocxDocument(str(p))
                props = doc.core_properties
                docx_props = {}
                for k in dir(props):
                    if k.startswith("_"):
                        continue
                    val = getattr(props, k)
                    if callable(val):
                        continue
                    if hasattr(val, "isoformat"):
                        # datetime/date values are not JSON serializable.
                        val = val.isoformat()
                    elif val is not None and not isinstance(
                        val, (str, int, float, bool)
                    ):
                        val = str(val)
                    docx_props[k] = val
                d["docx_props"] = docx_props
                # text preview
                chunks = [para.text for para in doc.paragraphs[:200] if para.text]
                d["text_preview"] = text_preview("\n".join(chunks))
            else:
                # best effort using bytes read + encoding detect
                data = bytes_safe_read(p, limit=1_500_000)
                enc = detect_encoding(data)
                d["raw_preview"] = text_preview(data.decode(enc, "ignore"))
        except Exception as e:
            d["error"] = f"{e}"
        return d

    def meta_img(self, p: Path):
        """Extract image metadata, write a thumbnail and optionally OCR the image.

        Returns format, mode, size and EXIF data, the path of an at most
        800x800 JPEG thumbnail and, when OCR is enabled, the OCR text file
        path plus a text preview.  OCR failures are reported under
        "ocr_error"; any other failure under "error".
        """
        d = {}
        try:
            with Image.open(str(p)) as im:
                d["format"] = im.format
                d["mode"] = im.mode
                d["size"] = im.size
                d["exif"] = exif_dict(im)
                # thumbnail
                # NOTE(review): thumbnail and OCR files are named after the stem
                # only, so same-named images from different folders overwrite
                # each other.
                th = self.thumb_dir / (p.stem + "_thumb.jpg")
                im2 = im.copy()
                im2.thumbnail((800, 800))
                if im2.mode not in ("RGB", "L"):
                    # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...).
                    im2 = im2.convert("RGB")
                im2.save(th, "JPEG", quality=85)
                d["thumbnail"] = str(th)
                # OCR
                if self.do_ocr:
                    try:
                        txt = pytesseract.image_to_string(im2)
                        ocrp = self.ocr_dir / (p.stem + ".txt")
                        ocrp.write_text(txt, encoding="utf-8", errors="ignore")
                        d["ocr_path"] = str(ocrp)
                        d["text_preview"] = text_preview(txt)
                    except Exception as e:
                        d["ocr_error"] = str(e)
        except Exception as e:
            d["error"] = f"{e}"
        return d

    def meta_txt(self, p: Path):
        """Return the detected encoding and a 4000-character preview of text file *p*.

        Only the first 5,000,000 bytes are read.  Failures are reported under "error".
        """
        info = {}
        try:
            blob = bytes_safe_read(p, limit=5_000_000)
            codec = detect_encoding(blob)
            decoded = blob.decode(codec, "ignore")
            info["encoding"] = codec
            info["text_preview"] = text_preview(decoded, n=4000)
        except Exception as e:
            info["error"] = f"{e}"
        return info

    def meta_av(self, p: Path):
        """Return the ffprobe report for media file *p*, or a note when none was produced."""
        report = run_ffprobe(p)
        if not report:
            return {"note": "ffprobe not available"}
        return {"ffprobe": report}

    # ----- Organize copy/move/link -----
    def organize_target(self, rec: dict):
        """Compute where *rec*'s file belongs in the organized tree and create its folder.

        Layout: ORGANIZED/<type>/<YYYY>/<MM>/<DD>/<first 12 sha256 chars><ext>,
        with the date taken from the record's mtime in UTC (the current time
        is used when mtime is missing or zero).
        """
        kind = rec["type"] or "other"
        digest = rec["sha256"]
        suffix = rec["ext"] or ""
        stamp = rec["mtime"] or time.time()
        day_path = datetime.utcfromtimestamp(stamp).strftime("%Y/%m/%d")
        target = self.org_dir / kind / day_path / f"{digest[:12]}{suffix}"
        ensure_dir(target.parent)
        return target

    def place_file(self, src: Path, dst: Path, mode: str):
        if dst.exists():
            return
        if mode == "copy":
            shutil.copy2(src, dst)
        elif mode == "move":
            shutil.move(src, dst)
        elif mode == "link":
            try:
                os.link(src, dst)
            except Exception:
                shutil.copy2(src, dst)
        else:
            shutil.copy2(src, dst)
        self.log_event({"op": "place", "mode": mode, "src": str(src), "dst": str(dst)})

    # ----- Bates stamping -----
    def bates_pdf_overlay(self, pdf_in: Path, pdf_out: Path, bates: str):
        """Write a Bates-stamped copy of *pdf_in* to *pdf_out*.

        With ReportLab available, a one-page letter-size overlay carrying
        *bates* near the bottom-left corner is merged onto every page and
        "overlay" is returned.  Without ReportLab the file is copied unchanged
        and "filename_only" is returned.  The temporary overlay file is
        removed even when stamping fails.
        """
        if not have_reportlab():
            # Fallback: copy only; rename includes bates id
            shutil.copy2(pdf_in, pdf_out)
            return "filename_only"
        # Build 1-page footer overlay and merge on each page
        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        from reportlab.lib.units import inch

        tmp_overlay = pdf_out.parent / (pdf_out.stem + "_overlay.pdf")
        try:
            # Create overlay
            c = canvas.Canvas(str(tmp_overlay), pagesize=letter)
            c.setFont("Helvetica", 10)
            c.drawString(0.7 * inch, 0.5 * inch, f"{bates}")
            c.save()
            # Merge
            reader = PdfReader(str(pdf_in))
            over_pg = PdfReader(str(tmp_overlay)).pages[0]
            writer = PdfWriter()
            for pg in reader.pages:
                pg.merge_page(over_pg)
                writer.add_page(pg)
            with pdf_out.open("wb") as f:
                writer.write(f)
        finally:
            tmp_overlay.unlink(missing_ok=True)
        return "overlay"

    # ----- Archive unpack -----
    def maybe_unpack(self, p: Path):
        if not self.do_unpack:
            return
        ext = p.suffix.lower()
        try:
            if ext == ".zip":
                out = self.unpack_dir / (p.stem + "_unzipped")
                ensure_dir(out)
                safe_extract_zip(p, out)
                self.log_event({"op": "unpack_zip", "src": str(p), "dst": str(out)})
            elif ext in {
                ".tar",
                ".tgz",
                ".gz",
                ".bz2",
                ".xz",
                ".tar.gz",
                ".tar.bz2",
                ".tar.xz",
            }:
                out = self.unpack_dir / (p.stem + "_untarred")
                ensure_dir(out)
                safe_extract_tar(p, out)
                self.log_event({"op": "unpack_tar", "src": str(p), "dst": str(out)})
            elif ext == ".7z":
                # requires 7z in PATH
                out = self.unpack_dir / (p.stem + "_7z")
                ensure_dir(out)
                try:
                    subprocess.check_call(
                        ["7z", "x", "-y", str(p), f"-o{out}"], timeout=120
                    )
                    self.log_event(
                        {"op": "unpack_7z_fail", "src": str(p), "err": str(e)}
                    )
                except Exception as e:
                    self.log_event(
                        {"op": "unpack_7z_fail", "src": str(p), "err": str(e)}
                    )
        except Exception as e:
            self.log_event({"op": "unpack_fail", "src": str(p), "err": str(e)})

    # ----- Per-file pipeline -----
    def process_one(self, p: Path, root: Path):
        """Run the full pipeline for one file and return its record dict.

        Steps: stat and hash the file, extract type-specific metadata (with
        OCR for image-only PDFs when enabled), place it in the organized tree,
        optionally unpack archives and Bates-stamp PDFs, then upsert the
        record into the database.  Returns None, after logging a
        "process_fail" event, when anything raises.
        """
        import threading  # local import: needed only for the Bates counter lock

        try:
            stat = p.stat()
            ext = p.suffix.lower()
            ftype = self.classify(p)
            sha = sha256_file(p)
            mime = None  # minimized; Python-magic optional
            meta = {}
            ocr_path = None

            if ftype == "pdf":
                meta = self.meta_pdf(p)
                # OCR image-only PDFs
                if self.do_ocr and is_image_only_pdf(p):
                    # NOTE(review): the tesseract CLI is given the PDF itself as
                    # input; confirm the installed build accepts PDF input.
                    out_txt = self.ocr_dir / (p.stem + "_pdf_ocr.txt")
                    try:
                        subprocess.check_call(
                            [
                                "tesseract",
                                str(p),
                                str(out_txt.with_suffix("")),
                                "pdf",
                                "txt",
                            ],
                            timeout=240,
                        )
                        # Tesseract writes .txt automatically
                        if out_txt.exists():
                            ocr_txt = out_txt.read_text(
                                encoding="utf-8", errors="ignore"
                            )
                            ocr_path = str(out_txt)
                            if ocr_txt:
                                meta["ocr_text_preview"] = text_preview(ocr_txt)
                    except Exception as e:
                        # OCR stays best effort, but the reason is recorded.
                        meta["ocr_error"] = str(e)

            elif ftype == "doc":
                meta = self.meta_doc(p)
            elif ftype == "img":
                meta = self.meta_img(p)
                ocr_path = meta.get("ocr_path")
            elif ftype == "txt":
                meta = self.meta_txt(p)
            elif ftype == "av":
                meta = self.meta_av(p)
            else:
                # try small preview
                try:
                    data = bytes_safe_read(p, limit=512_000)
                    enc = detect_encoding(data)
                    s = data.decode(enc, "ignore")
                    meta = {"raw_preview": text_preview(s)}
                except Exception:
                    meta = {}

            # organize
            rec = {
                "path": str(p),
                "relpath": str(p.relative_to(root)),
                "root": str(root),
                "size": stat.st_size,
                "mtime": stat.st_mtime,
                "ctime": stat.st_ctime,
                "ext": ext,
                "mime": mime or "",
                "sha256": sha,
                "type": ftype,
                # default=str is deliberate: extracted metadata may contain
                # dates or library-specific values, and one such value must
                # not fail the whole file.
                "meta_json": json.dumps(meta, ensure_ascii=False, default=str),
                "ocr_path": ocr_path or "",
                "organized_path": "",
                "bates_id": "",
            }

            # place organized copy
            dst = self.organize_target(rec)
            self.place_file(p, dst, self.action)
            rec["organized_path"] = str(dst)

            # maybe unpack archives; match on the full name because Path.suffix
            # is only the last component (".gz" for "x.tar.gz")
            if p.name.lower().endswith(tuple(SAFE_ARCHIVE_EXT)):
                # After a "move" the original path no longer exists.
                self.maybe_unpack(p if p.exists() else dst)

            # maybe bates-stamp PDFs
            if self.do_bates and ftype == "pdf":
                # Several workers run this method concurrently; the lock keeps
                # Bates numbers unique.  dict.setdefault is atomic, so every
                # thread obtains the same lock object.
                lock = self.__dict__.setdefault("_bates_lock", threading.Lock())
                with lock:
                    bid = bates_id(self.bates_prefix, self.bates_counter)
                    self.bates_counter += 1
                outpdf = self.bates_dir / f"{dst.stem}_{bid}.pdf"
                mode = self.bates_pdf_overlay(dst, outpdf, bid)
                rec["bates_id"] = bid
                self.log_event(
                    {
                        "op": "bates",
                        "src": str(dst),
                        "dst": str(outpdf),
                        "bates_id": bid,
                        "mode": mode,
                    }
                )

            # write db
            self._insert_or_update(rec)
            return rec

        except Exception as e:
            self.log_event({"op": "process_fail", "path": str(p), "err": str(e)})
            return None

    # ----- Walk -----
    def iter_files(self):
        """Yield ``(path, root)`` for every file found under the scan roots.

        Entries of ``self.roots`` that are not directories are ignored, at most
        20 directory roots are walked, and any file located somewhere beneath
        ``self.outdir`` is left out of the results.
        """
        max_roots = 20  # cap on how many directory roots get walked
        walked = 0
        for root in self.roots:
            if walked >= max_roots:
                break
            if not root.is_dir():
                continue
            walked += 1
            for folder, _subdirs, names in os.walk(root):
                folder_path = Path(folder)
                for fname in names:
                    candidate = folder_path / fname
                    # never re-ingest files that live in our own output tree
                    if self.outdir in candidate.parents:
                        continue
                    yield candidate, root

    # ----- Run -----
    def run(self):
        """Scan every root, process each file, and write all run artifacts.

        Steps, in order:

        1. Write the initial manifest to ``self.manifest_path``.
        2. Submit ``self.process_one`` for every ``(path, root)`` pair from
           ``self.iter_files()`` to a pool of ``self.threads`` workers and
           collect the records they return (falsy results are dropped).
        3. Write each record as one line of ``self.jsonl_path`` and, when at
           least one record exists, a flattened table to ``self.csv_path``.
        4. Group records by ``sha256``; groups with more than one member are
           dumped to ``duplicates.json`` and hard-linked (copied when linking
           fails) into ``self.dup_dir/<first 8 hex chars>/``.
        5. Rewrite the manifest with the finish time and counts, commit the
           database and print where every artifact was written.

        The database handle is closed even when a step raises, so a failed run
        does not leave the connection open. Exceptions still propagate.
        """
        # Local import: only this method needs the name, and the rest of the
        # file is not guaranteed to import it.
        from datetime import timezone

        try:
            # manifest start
            manifest = {
                "version": APP_VERSION,
                "started": NOW_ISO,
                "roots": [str(x) for x in self.roots],
                "outdir": str(self.outdir),
                "threads": self.threads,
                "action": self.action,
                "ocr": self.do_ocr,
                "unpack": self.do_unpack,
                "bates": self.do_bates,
                "bates_prefix": self.bates_prefix,
            }
            self.manifest_path.write_text(
                json.dumps(manifest, indent=2), encoding="utf-8"
            )

            # scan and process
            records = []
            with ThreadPoolExecutor(max_workers=self.threads) as ex:
                # Materialized so the first progress bar knows its total.
                file_iterator = list(self.iter_files())
                futures = [
                    ex.submit(self.process_one, p, root)
                    for p, root in tqdm(file_iterator, desc="Processing")
                ]
                for fu in tqdm(
                    as_completed(futures), total=len(futures), desc="Processing"
                ):
                    rec = fu.result()
                    if rec:
                        records.append(rec)

            # export csv/jsonl
            rows = []
            with self.jsonl_path.open("w", encoding="utf-8") as jf:
                for r in records:
                    jf.write(json.dumps(r, ensure_ascii=False) + "\n")
                    mj = json.loads(r["meta_json"]) if r.get("meta_json") else {}
                    rows.append(
                        {
                            "path": r["path"],
                            "organized_path": r.get("organized_path", ""),
                            "type": r.get("type", ""),
                            "ext": r.get("ext", ""),
                            "size": r.get("size", 0),
                            "sha256": r.get("sha256", ""),
                            "mtime": r.get("mtime", 0),
                            "bates_id": r.get("bates_id", ""),
                            # first non-empty preview wins, capped at 1000 chars
                            "text_preview": (
                                mj.get("text_preview")
                                or mj.get("raw_preview")
                                or mj.get("ocr_text_preview")
                                or ""
                            )[:1000],
                            "pages": mj.get("pages", ""),
                        }
                    )
            if rows:
                pd.DataFrame(rows).to_csv(self.csv_path, index=False)

            # summary + duplicates: bucket records by content hash
            dup_map = {}
            for r in records:
                sha = r.get("sha256")
                if not sha:
                    continue
                dup_map.setdefault(sha, []).append(r)
            duplicates = {k: v for k, v in dup_map.items() if len(v) > 1}

            # write duplicate list, and link copies into DEDUP
            dups_out = self.outdir / "duplicates.json"
            dups_out.write_text(json.dumps(duplicates, indent=2), encoding="utf-8")
            for sha, items in duplicates.items():
                base_dir = ensure_dir(self.dup_dir / sha[:8])
                for idx, item in enumerate(items, 1):
                    src = Path(item.get("organized_path") or item.get("path"))
                    # A Path is always truthy, so existence is the only check
                    # that can actually skip an entry.
                    if not src.exists():
                        continue
                    dst = base_dir / f"{idx:02d}_{src.name}"
                    if not dst.exists():
                        try:
                            os.link(src, dst)
                        except Exception:
                            # best effort: fall back to a real copy when a
                            # hard link cannot be created
                            shutil.copy2(src, dst)
                    self.log_event(
                        {
                            "op": "dup_collect",
                            "sha256": sha,
                            "src": str(src),
                            "dst": str(dst),
                        }
                    )

            # finalize manifest; timezone-aware replacement for the deprecated
            # datetime.utcnow(), rendered in the same "...Z" format
            manifest["finished"] = datetime.now(timezone.utc).strftime(
                "%Y-%m-%dT%H:%M:%SZ"
            )
            manifest["files_total"] = len(records)
            manifest["duplicates"] = len(duplicates)
            self.manifest_path.write_text(
                json.dumps(manifest, indent=2), encoding="utf-8"
            )

            # also export minimal DB snapshot to ensure write success
            self.db.commit()
        finally:
            self.db.close()

        print(f"[done] DB: {self.db_path}")
        print(f"[done] CSV: {self.csv_path}")
        print(f"[done] JSONL: {self.jsonl_path}")
        print(f"[done] MANIFEST: {self.manifest_path}")
        print(f"[done] LEDGER: {self.ledger_path}")
        print(f"[done] ORGANIZED: {self.org_dir}")
        print(f"[done] BATES: {self.bates_dir}  (if enabled)")
        print(f"[done] OCR: {self.ocr_dir}      (if any)")
        print(f"[done] UNPACKED: {self.unpack_dir} (if enabled)")
        print(f"[done] DEDUP: {self.dup_dir}")


# ---------- CLI ----------
def parse_args(argv=None):
    """Parse command-line options for the evidence organizer.

    Args:
        argv: Optional list of argument strings. When ``None`` argparse reads
            ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``root``, ``out``,
        ``threads``, ``action``, ``no_ocr``, ``unpack``, ``no_bates``,
        ``bates`` and ``bates_start``.

    Unrecognized arguments are ignored rather than rejected, so the extra
    ``-f <connection file>`` arguments a notebook kernel is launched with do
    not abort the run. With no recognized options, the result carries the same
    values that were previously hard-coded for Colab execution.
    """
    ap = argparse.ArgumentParser(
        description="Forensic evidence organizer and processor"
    )
    # NOTE(review): "litgation" may be a typo of "litigation" — confirm against
    # the actual Drive folder name before changing this path.
    ap.add_argument(
        "--root",
        nargs="+",
        default=["/content/drive/MyDrive/litgation_OS$"],
        help="Root folder(s) to scan",
    )
    ap.add_argument(
        "--out", default="/content/drive/MyDrive/OUTPUT", help="Output directory"
    )
    ap.add_argument("--threads", type=int, default=6, help="Concurrency level")
    ap.add_argument(
        "--action",
        choices=["copy", "move", "link"],
        default="copy",
        help="Place files into ORGANIZED with this mode",
    )
    ap.add_argument("--no-ocr", action="store_true", help="Disable OCR")
    ap.add_argument("--unpack", action="store_true", help="Extract archives safely")
    ap.add_argument("--no-bates", action="store_true", help="Disable Bates stamping")
    ap.add_argument(
        "--bates", default="BATES", help="Bates prefix, e.g., AJP, EXH, CASE2025"
    )
    ap.add_argument("--bates-start", type=int, default=1, help="Starting Bates number")

    # parse_known_args (not parse_args) so foreign arguments are tolerated.
    args, _unknown = ap.parse_known_args(argv)
    return args


# Run the pipeline when this file is executed directly (as a script or as a
# notebook cell, where __name__ is also "__main__"), but not when it is merely
# imported by other code.
if __name__ == "__main__":
    main()

  NOW_ISO = datetime.utcnow().replace(microsecond=0).isoformat()+"Z"
Processing: 0it [00:00, ?it/s]
Processing: 0it [00:00, ?it/s]

[done] DB: /content/drive/MyDrive/OUTPUT/evidence.db
[done] CSV: /content/drive/MyDrive/OUTPUT/evidence.csv
[done] JSONL: /content/drive/MyDrive/OUTPUT/evidence.jsonl
[done] MANIFEST: /content/drive/MyDrive/OUTPUT/MANIFEST.json
[done] LEDGER: /content/drive/MyDrive/OUTPUT/LEDGER.jsonl
[done] ORGANIZED: /content/drive/MyDrive/OUTPUT/ORGANIZED
[done] BATES: /content/drive/MyDrive/OUTPUT/BATES  (if enabled)
[done] OCR: /content/drive/MyDrive/OUTPUT/OCR      (if any)
[done] UNPACKED: /content/drive/MyDrive/OUTPUT/UNPACKED (if enabled)
[done] DEDUP: /content/drive/MyDrive/OUTPUT/ORGANIZED/DEDUP



  manifest["finished"] = datetime.utcnow().replace(microsecond=0).isoformat()+"Z"
