**Block 1: Initialization and Imports**

In [None]:
import os
import sys
import time
import warnings
import traceback

import pandas as pd
import numpy as np

import re, unicodedata, csv, datetime as _dt

# COM interface for Outlook
import win32com.client
from win32com.client import constants
from win32com.client import Dispatch

# Optional: ensure proper Unicode output in Windows console/Jupyter
import locale
try:
    locale.setlocale(locale.LC_ALL, '')
except locale.Error as exc:
    # An unsupported system locale must not abort the run; the locale is left unchanged.
    print(f"[Init] Could not apply system locale: {exc}")

# Suppress common warnings (optional)
warnings.filterwarnings("ignore")

# Working directory setup: each run writes into its own timestamped folder on the Desktop.
USER_DESKTOP = os.path.join(os.path.expanduser("~"), "Desktop")
# The datetime module is imported above under the alias `_dt`; a bare `dt` is undefined.
RUN_DATE = _dt.datetime.now().strftime("%Y%m%d_%H%M%S")
REPORT_DIR = os.path.join(USER_DESKTOP, f"Outlook_DupeRun_{RUN_DATE}")
os.makedirs(REPORT_DIR, exist_ok=True)

print(f"[Init] Working directory: {REPORT_DIR}")
print(f"[Init] Time: {RUN_DATE}")

**Block 2: Connect to Outlook (MAPI) and inventory Stores / Root Folders**

In [None]:
# Open the Outlook COM application object and its MAPI namespace; any failure is
# re-raised as a RuntimeError with a user-facing hint, keeping the COM error as cause.
try:
    outlook = Dispatch("Outlook.Application")
    ns = outlook.GetNamespace("MAPI")
except Exception as exc:
    raise RuntimeError(
        "Cannot connect to Outlook MAPI. Is Outlook installed and profile configured?"
    ) from exc

def list_stores(ns):
    """Print all stores of the MAPI namespace and save them to ``stores.csv``.

    Args:
        ns: MAPI namespace object exposing a 1-based ``Stores`` collection.

    Returns:
        pandas.DataFrame with columns idx, DisplayName, FilePath, IsDefaultStore.
        The same table is written to ``REPORT_DIR/stores.csv``.
    """
    records = []
    print("=== STORES ===")
    store_count = ns.Stores.Count
    for pos in range(1, store_count + 1):
        store = ns.Stores.Item(pos)
        record = {
            "idx": pos,
            "DisplayName": getattr(store, "DisplayName", ""),
            # PST/OST path (may be empty for some accounts)
            "FilePath": getattr(store, "FilePath", ""),
            "IsDefaultStore": bool(getattr(store, "IsDefaultStore", False)),
        }
        records.append(record)
        print(f"{pos}. DisplayName: {record['DisplayName']} | FilePath: {record['FilePath']} | default={record['IsDefaultStore']}")
    inventory = pd.DataFrame(records)
    inventory.to_csv(os.path.join(REPORT_DIR, "stores.csv"), index=False, encoding="utf-8-sig")
    return inventory

def list_root_folders(ns):
    """Print the top-level folders of the MAPI namespace and save them to ``root_folders.csv``.

    Args:
        ns: MAPI namespace object exposing a 1-based ``Folders`` collection.

    Returns:
        pandas.DataFrame with columns idx, Name, FolderPath. The same table is
        written to ``REPORT_DIR/root_folders.csv``.
    """
    records = []
    print("\n=== ROOT FOLDERS ===")
    for pos in range(1, ns.Folders.Count + 1):
        root = ns.Folders.Item(pos)  # MAPIFolder (root of each store/account)
        name, path = root.Name, root.FolderPath
        records.append({"idx": pos, "Name": name, "FolderPath": path})
        print(f"{pos}. Name: {name} | FolderPath: {path}")
    inventory = pd.DataFrame(records)
    inventory.to_csv(os.path.join(REPORT_DIR, "root_folders.csv"), index=False, encoding="utf-8-sig")
    return inventory

# Default Inbox of the default store (6 = olFolderInbox).
# Resolution is best-effort: on any failure the variable stays None and a warning is printed.
inbox_default = None
try:
    inbox_default = ns.DefaultStore.GetDefaultFolder(6)
    print("\nDefaultStore Inbox folder path:", inbox_default.FolderPath)
except Exception:
    inbox_default = None
    print("\n[Warn] Could not resolve DefaultStore Inbox.")

# Run inventories (each helper prints its table and writes its CSV into REPORT_DIR)
stores_df = list_stores(ns)
roots_df  = list_root_folders(ns)
print(f"\n[Saved] {os.path.join(REPORT_DIR, 'stores.csv')}")
print(f"[Saved] {os.path.join(REPORT_DIR, 'root_folders.csv')}")


**Block 3: Duplicate scan (read-only) with strict criteria**

This block performs a diagnostic scan of a selected Outlook data store (PST/OST) and folder to detect potential duplicate messages.
No items are moved or deleted — only analyzed and logged.

The scan uses strict criteria to identify duplicates based on message metadata.

Detection Criteria

* Time – exact match up to the minute (configurable via `DATE_PRECISION`)
* Recipients – normalized display names from `To`, `CC`, and `BCC`
* Subject – case-normalized, invisible characters stripped; *Re:/Fwd:* prefixes are kept
* Size – absolute difference ≤ 4 KB (configurable via `SIZE_ABS_TOL`)
* Identifiers – `PR_INTERNET_MESSAGE_ID` and `PR_SEARCH_KEY` (if available) act as a veto: when both messages carry an identifier and the values differ, the pair is not reported as a duplicate

Main Options

| Parameter                  | Meaning                                                | Default                           |
| -------------------------- | ------------------------------------------------------ | --------------------------------- |
| `TARGET_PST_FRAGMENT`      | Part of PST/OST file path to identify the target store | `"Outlook.pst"`                   |
| `TARGET_FOLDER_PATH`       | Full MAPI path (e.g. `\\Outlook\Sent Items`)           | empty → use `DEFAULT_FOLDER_CODE` |
| `DEFAULT_FOLDER_CODE`      | Fallback folder code (6 = Inbox, 5 = Sent)             | `6`                               |
| `ITEM_LIMIT`               | Maximum number of items to analyze                     | `200`                             |
| `ADDRESS_ROLE`             | `"recipient"` or `"sender"` (defines comparison key)   | `"recipient"`                     |
| `RECIPIENT_INCLUDE_CC_BCC` | Include CC/BCC fields in comparison                    | `True`                            |
| `TIME_FIELD`               | `"sent"`, `"received"`, or `"none"`                    | `"sent"`                          |
| `DATE_PRECISION`           | `"minute"` or `"second"`                               | `"minute"`                        |
| `STRICT_TIME`              | Require exact timestamp equality                       | `True`                            |
| `SIZE_ABS_TOL`             | Allowed absolute difference in message size (bytes)    | `4096`                            |

Outputs

* Console / Notebook output — brief summary of duplicates found.
* `dupe_scan.log` — detailed log in `REPORT_DIR` (Desktop → `Outlook_DupeRun_YYYYMMDD_HHMMSS`).
* `dupe_scan_results.csv` — structured list of suspected duplicates (InternetMessageId, subject, size, time, etc.).

Typical Workflow

1. Run Block 1 + Block 2 first (imports + Outlook connection).
2. Adjust parameters (`TARGET_PST_FRAGMENT`, `TARGET_FOLDER_PATH`, etc.).
3. Run this block.
4. Review results in CSV and log before running the full duplicate-removal phase (Block 4).


In [None]:
# Uses existing: ns (MAPI namespace), REPORT_DIR
# -------------------------
# Parameters (adjust here)
# -------------------------
TARGET_PST_FRAGMENT = "Outlook.pst"   # part of Store FilePath to match (case-insensitive)
TARGET_FOLDER_PATH  = r""             # full MAPI path, single backslash between folders, e.g. r"\\Outlook\Outbox"; empty -> DEFAULT_FOLDER_CODE
DEFAULT_FOLDER_CODE = 6               # 6=Inbox, 5=Sent Items, 11=Calendar, 12=Contacts, 15=Tasks
ITEM_LIMIT          = 200             # max items to scan (raise for larger samples)

# Subject / recipients normalization
SUBJECT_CASE_INSENSITIVE = True    # normalize subject to lowercase before comparison
SUBJECT_STRIP_INVISIBLES = True    # remove zero-width and control characters from subject
KEEP_RE_FWD_PREFIXES     = True    # keep "Re:" / "Fwd:" prefixes (do not strip)
RECIPIENT_INCLUDE_CC_BCC = True    # include CC/BCC recipients in matching
ADDRESS_ROLE             = "recipient"  # use recipients ("sender" also possible)

# Time / size matching
TIME_FIELD    = "sent"                # "sent" | "received" | "none"
DATE_PRECISION= "minute"              # "minute" | "second"
STRICT_TIME   = True                  # require exact match at chosen precision
SIZE_ABS_TOL  = 4096                  # ±4KB

# Output
LOG_PATH = os.path.join(REPORT_DIR, "dupe_scan.log")          # overwritten at the start of every scan
CSV_PATH = os.path.join(REPORT_DIR, "dupe_scan_results.csv")  # written only when duplicates are found
SHOW_ROWS = 30                                                # rows of the result table shown in the notebook

# MAPI props
PR_DISPLAY_TO  = "http://schemas.microsoft.com/mapi/proptag/0x0E04001E"
PR_DISPLAY_CC  = "http://schemas.microsoft.com/mapi/proptag/0x0E03001E"
PR_DISPLAY_BCC = "http://schemas.microsoft.com/mapi/proptag/0x0E02001E"

PR_CLIENT_SUBMIT_TIME    = "http://schemas.microsoft.com/mapi/proptag/0x00390040"  # Sent
PR_MESSAGE_DELIVERY_TIME = "http://schemas.microsoft.com/mapi/proptag/0x0E060040"  # Received
PR_INTERNET_MESSAGE_ID   = "http://schemas.microsoft.com/mapi/proptag/0x1035001E"  # string
PR_SEARCH_KEY            = "http://schemas.microsoft.com/mapi/proptag/0x300B0102"  # binary

# Timestamp fallbacks, tried in this order by get_time_repr when a value is empty:
FALLBACK_RECEIVED_IF_MISSING = True   # use the other of SentOn / ReceivedTime
FALLBACK_MAPI_IF_MISSING     = True   # read the submit/delivery time property via PropertyAccessor
FALLBACK_CREATION_IF_MISSING = True   # use CreationTime, then LastModificationTime

# -------------------------
# Helpers
# -------------------------
# pywin32's time type is used for isinstance() checks in safe_dt_to_str.
# If the import fails, a placeholder class is substituted so those checks never match.
try:
    from pywintypes import TimeType as _PyWinTime
except Exception:
    _PyWinTime = type("Dummy", (), {})

_SURROGATES_RE = re.compile(r'[\ud800-\udfff]')

def sanitize_text(x):
    """Return ``x`` as a string that can always be encoded as UTF-8.

    ``None`` becomes ``""``; non-strings are passed through ``str()``. Text that
    already encodes cleanly is returned unchanged; otherwise lone surrogate code
    points are replaced before a final encode/decode round trip with
    ``errors="replace"``.
    """
    if x is None:
        return ""
    text = x if isinstance(x, str) else str(x)
    try:
        text.encode("utf-8")
    except UnicodeEncodeError:
        cleaned = _SURROGATES_RE.sub("�", text)
        return cleaned.encode("utf-8", "replace").decode("utf-8")
    return text

def log(msg: str):
    """Print a sanitized message and append the same line to the file at LOG_PATH."""
    line = sanitize_text(msg)
    print(line, flush=True)
    with open(LOG_PATH, "a", encoding="utf-8") as sink:
        sink.write(f"{line}\n")

def get_folder_by_path(namespace, path: str):
    """Resolve a backslash-separated MAPI folder path starting at the namespace root.

    Args:
        namespace: object whose ``Folders`` collection can be indexed by name;
            every resolved folder must expose the same kind of ``Folders`` collection.
        path: e.g. ``\\\\Store Name\\Inbox\\Sub``. Leading, trailing and doubled
            backslashes are ignored, so paths copied from ``FolderPath`` work as-is.

    Returns:
        The folder object, or ``None`` when ``path`` is empty or contains no names.

    Raises:
        Whatever the ``Folders`` lookup raises when a name does not exist.
    """
    # Drop empty segments: a stray double backslash would otherwise become a lookup of "".
    segments = [seg for seg in (path or "").split("\\") if seg]
    if not segments:
        return None
    folder = namespace.Folders[segments[0]]
    for name in segments[1:]:
        folder = folder.Folders[name]
    return folder

def _fmt_choice():
    """Return the strftime pattern for DATE_PRECISION: minutes for "minute", seconds otherwise."""
    if DATE_PRECISION == "minute":
        return "%Y-%m-%d %H:%M"
    return "%Y-%m-%d %H:%M:%S"

def safe_dt_to_str(dt):
    """Format a timestamp-like value at the configured precision; never raises on bad values.

    Args:
        dt: a pywin32 time object, a ``datetime.datetime``, a string, or anything else.

    Returns:
        The formatted timestamp; a non-blank string input is returned stripped and
        otherwise unchanged; ``""`` for every other value or when formatting fails.
    """
    fmt = _fmt_choice()
    if isinstance(dt, _PyWinTime):
        # Prefer the UTC-based rendering; fall back to plain Format() when UTC() is unavailable.
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            return dt.UTC().Format(fmt)
        except Exception:
            pass
        try:
            return dt.Format(fmt)
        except Exception:
            return ""
    if isinstance(dt, _dt.datetime):
        return dt.strftime(fmt)
    if isinstance(dt, str) and dt.strip():
        return dt.strip()
    return ""

def normalize_subject(subj: str) -> str:
    """Normalize a subject line for duplicate comparison.

    Steps: NFKC normalization; removal of zero-width characters and the BOM
    (always); with SUBJECT_STRIP_INVISIBLES, NBSP/NUL become spaces and
    whitespace runs collapse; with SUBJECT_CASE_INSENSITIVE, lower-casing;
    and, when KEEP_RE_FWD_PREFIXES is False, one leading reply/forward marker
    is removed.

    Returns:
        The normalized subject, or ``""`` for an empty/None subject.
    """
    if not subj:
        return ""
    s = unicodedata.normalize("NFKC", str(subj))
    # Zero-width space/joiners and the BOM are removed regardless of the flags below.
    s = re.sub(r"[\u200B-\u200D\uFEFF]", "", s)
    if SUBJECT_STRIP_INVISIBLES:
        s = s.replace("\xa0", " ").replace("\x00", " ")
        s = re.sub(r"\s+", " ", s.strip())
    if SUBJECT_CASE_INSENSITIVE:
        s = s.lower()
    if not KEEP_RE_FWD_PREFIXES:
        # Honour the documented switch; same pattern as the cleanup block uses.
        s = re.sub(r"^(re(\[\d+\])?|fw|fwd|ответ|пересылка)[:\-\s]+", "", s, flags=re.I)
    return s

def _display_recipients_via_mapi(mail):
    """Read the To/CC/BCC display strings through the item's PropertyAccessor.

    Each property is read on its own, so a failure on one field (for example a
    property that is absent on the item) does not discard the others.

    Returns:
        Tuple ``(to, cc, bcc)`` of lower-cased, whitespace-collapsed strings;
        ``""`` for any field that could not be read.
    """
    def _norm(s):
        s = (s or "").replace("\xa0", " ").replace("\x00", " ")
        return re.sub(r"\s+", " ", s.strip()).lower()

    def _read(accessor, tag):
        try:
            return accessor.GetProperty(tag) or ""
        except Exception:
            return ""

    try:
        pa = getattr(mail, "PropertyAccessor", None)
    except Exception:
        pa = None
    if not pa:
        return "", "", ""
    return (
        _norm(_read(pa, PR_DISPLAY_TO)),
        _norm(_read(pa, PR_DISPLAY_CC)),
        _norm(_read(pa, PR_DISPLAY_BCC)),
    )

def get_party(mail) -> str:
    """Build the counterparty part of the comparison key.

    With ADDRESS_ROLE == "sender" this is the lower-cased, whitespace-collapsed
    ``SenderEmailAddress``. Otherwise it is the sorted, de-duplicated,
    comma-joined list of recipient display names (To, plus CC/BCC when
    RECIPIENT_INCLUDE_CC_BCC is set).
    """
    if ADDRESS_ROLE == "sender":
        sender = (getattr(mail, "SenderEmailAddress", "") or "").strip()
        sender = sender.replace("\xa0", " ").replace("\x00", " ")
        return re.sub(r"\s+", " ", sender).lower()

    to_s, cc_s, bcc_s = _display_recipients_via_mapi(mail)
    fields = [to_s, cc_s, bcc_s] if RECIPIENT_INCLUDE_CC_BCC else [to_s]
    names = set()
    for field in fields:
        for piece in re.split(r"[;,]", field):
            piece = piece.strip()
            if piece:
                names.add(piece)
    return ",".join(sorted(names))

def get_time_repr(mail) -> str:
    """Return the message timestamp as text, following TIME_FIELD and the FALLBACK_* flags.

    Order of attempts: the preferred Outlook attribute (ReceivedTime for
    "received", SentOn otherwise), then the other of the two, then the MAPI
    submit/delivery time property, then CreationTime / LastModificationTime.
    Returns ``""`` when TIME_FIELD is "none" or nothing usable is found.
    """
    if TIME_FIELD == "none":
        return ""

    if TIME_FIELD == "received":
        preferred, alternate = "ReceivedTime", "SentOn"
    else:
        preferred, alternate = "SentOn", "ReceivedTime"
    try:
        stamp = getattr(mail, preferred, None)
        if not stamp:
            stamp = getattr(mail, alternate, None) if FALLBACK_RECEIVED_IF_MISSING else None
    except Exception:
        stamp = None

    text = safe_dt_to_str(stamp)
    if text:
        return text

    if FALLBACK_MAPI_IF_MISSING:
        try:
            accessor = getattr(mail, "PropertyAccessor", None)
            if accessor:
                raw = accessor.GetProperty(PR_CLIENT_SUBMIT_TIME if TIME_FIELD=="sent" else PR_MESSAGE_DELIVERY_TIME)
                text = safe_dt_to_str(raw)
                if text:
                    return text
        except Exception:
            pass

    if FALLBACK_CREATION_IF_MISSING:
        try:
            stamp = getattr(mail, "CreationTime", None) or getattr(mail, "LastModificationTime", None)
            text = safe_dt_to_str(stamp)
            if text:
                return text
        except Exception:
            pass
    return ""

def size_ok(a: int, b: int) -> bool:
    """Return True when the two sizes differ by at most SIZE_ABS_TOL bytes.

    Values that cannot be converted with ``int()`` (or any other error) yield False.
    """
    try:
        delta = abs(int(a) - int(b))
        within = delta <= SIZE_ABS_TOL
    except Exception:
        return False
    return within

def get_ids(mail):
    """Return ``(InternetMessageId, SearchKeyHex)`` for an item; ``""`` for anything unavailable.

    The two properties are read independently, so an item without an Internet
    Message-ID still yields its search key.
    """
    inet_id = ""
    search_hex = ""
    try:
        pa = getattr(mail, "PropertyAccessor", None)
    except Exception:
        pa = None
    if not pa:
        return inet_id, search_hex

    try:
        inet_id = pa.GetProperty(PR_INTERNET_MESSAGE_ID) or ""
    except Exception:
        inet_id = ""

    try:
        raw = pa.GetProperty(PR_SEARCH_KEY)
        # Binary values may arrive as a memoryview rather than bytes, depending on the COM layer.
        if isinstance(raw, (bytes, bytearray, memoryview)):
            search_hex = bytes(raw).hex()
    except Exception:
        search_hex = ""
    return inet_id, search_hex

def build_key(mail):
    """Collect the comparison tuple for one mail item.

    Returns:
        ``(internet_id, search_key_hex, party, subject, size, time_text)`` —
        the layout that keys_match unpacks.
    """
    counterparty = get_party(mail)
    subject = normalize_subject(getattr(mail, "Subject", ""))
    raw_size = getattr(mail, "Size", 0)
    size = int(raw_size or 0)
    when = get_time_repr(mail)
    inet_id, search_hex = get_ids(mail)
    # Strong identifiers first; others as fallback dimensions
    return (inet_id or "", search_hex or "", counterparty, subject, size, when)

def keys_match(k1, k2) -> bool:
    """Decide whether two build_key tuples describe the same message.

    A pair matches only if no strong identifier present on both sides differs,
    party and subject are equal, the timestamps are both present and equal
    (when STRICT_TIME is set), and the sizes are within tolerance.
    """
    inet1, sk1, p1, s1, sz1, t1 = k1
    inet2, sk2, p2, s2, sz2, t2 = k2

    # An identifier only vetoes a match when both sides carry it and the values differ.
    inet_conflict = inet1 and inet2 and inet1 != inet2
    search_conflict = sk1 and sk2 and sk1 != sk2
    if inet_conflict or search_conflict:
        return False

    if (p1, s1) != (p2, s2):
        return False

    if STRICT_TIME and (not t1 or not t2 or t1 != t2):
        return False

    return True if size_ok(sz1, sz2) else False

# -------------------------
# Main scan
# -------------------------
# Reuse existing ns from Block 2 — checked first so a missing session fails before any file is written
if "ns" not in globals():
    raise RuntimeError("MAPI namespace 'ns' not found. Run Block 2 first.")

# Start a fresh log for this run
with open(LOG_PATH, "w", encoding="utf-8") as f:
    f.write(f"=== Dupe scan started { _dt.datetime.now() } ===\n")

# Locate target store: first store whose FilePath contains the fragment (case-insensitive)
target = None
for i in range(1, ns.Stores.Count + 1):
    s = ns.Stores.Item(i)
    if TARGET_PST_FRAGMENT.lower() in (getattr(s, "FilePath", "") or "").lower():
        target = s
        break
if not target:
    log(f"[ERR] Store with fragment '{TARGET_PST_FRAGMENT}' not found.")
    raise SystemExit

# Resolve folder: by explicit path or default folder code
folder = get_folder_by_path(ns, TARGET_FOLDER_PATH) if TARGET_FOLDER_PATH else target.GetDefaultFolder(DEFAULT_FOLDER_CODE)
if folder is None:
    # get_folder_by_path returns None for a path that contains no folder names (e.g. only backslashes)
    log(f"[ERR] Folder path '{TARGET_FOLDER_PATH}' could not be resolved.")
    raise SystemExit

log(f"[Store] {target.DisplayName} | {target.FilePath}")
log(f"[Folder] {folder.FolderPath} | Items: {folder.Items.Count}")
log(f"[Params] role={ADDRESS_ROLE}, CC/BCC={RECIPIENT_INCLUDE_CC_BCC}, time={TIME_FIELD}/{DATE_PRECISION}, "
    f"strict_time={STRICT_TIME}, size_tol={SIZE_ABS_TOL}, limit={ITEM_LIMIT}")

olMailItem = 43
items = folder.Items
total = min(items.Count, ITEM_LIMIT)
log(f"[Scan] Taking first {total} items…")

# `seen` keeps every key accepted as an original; `seen_by_party_subject` indexes the same
# keys by (party, subject). keys_match always requires those two fields to be equal, so
# comparing only within one bucket gives the same result as comparing against all of `seen`.
seen, dups = [], []
seen_by_party_subject = {}
for idx in range(1, total + 1):
    try:
        it = items.Item(idx)
    except Exception as e:
        log(f"[WARN] Fetch #{idx} failed: {sanitize_text(e)}")
        continue
    if getattr(it, "Class", None) != olMailItem:
        continue
    mc = getattr(it, "MessageClass", "") or ""
    # Only IPM.Note* items are scanned; report and meeting classes do not start with
    # "IPM.Note", so this single test already excludes them.
    if not mc.startswith("IPM.Note"):
        continue
    try:
        key = build_key(it)
        inet_id, search_hex, party, subj, size, trepr = key
        candidates = seen_by_party_subject.setdefault((party, subj), [])
        if any(keys_match(key, prev) for prev in candidates):
            dups.append({
                "InternetMessageId": sanitize_text(inet_id),
                "SearchKeyHex":      sanitize_text(search_hex),
                "Counterparty":      sanitize_text(party),
                "Subject":           sanitize_text(subj),
                "Size":              size,
                "Time":              sanitize_text(trepr),
            })
        else:
            candidates.append(key)
            seen.append(key)
    except Exception as e:
        log(f"[WARN] Item #{idx} error: {sanitize_text(e)}")

log(f"[Result] Duplicates found: {len(dups)}")

if dups:
    df = pd.DataFrame(dups)
    try:
        display(df.head(SHOW_ROWS))
    except NameError:
        # `display` exists only inside IPython/Jupyter; fall back to plain text elsewhere.
        print(df.head(SHOW_ROWS).to_string())
    df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
    log(f"[Saved] {CSV_PATH}")
else:
    log("[Saved] No duplicates; nothing to write.")

log(f"=== Dupe scan finished { _dt.datetime.now() } ===")
print(f"\nReports written to: {REPORT_DIR}")


**Block 4: Duplicate move/cleanup**

Cross-folder duplicates:
When RECURSIVE = True, the script scans all subfolders under the selected root.
The parameter KEEP_ORIGINAL_IN defines where to keep the canonical message:
* "root" → keeps the copy in the top-level folder;
* "deep" → keeps the copy in the deepest subfolder where duplicates occur.

In [None]:
# Recommended: always run with DRY_RUN = True first to verify results before enabling actual moves.
# Moves detected duplicate emails into an auto-created "Duplicates_YYYYMMDD" folder under the selected root.
# Uses strong identifiers when available and keeps one canonical copy per duplicate group.
# KEEP_ORIGINAL_IN = "root" or "deep" — defines where to keep the canonical copy
# when duplicates are found across subfolders (requires RECURSIVE = True).

# -------- Parameters (documented) --------
TARGET_PST_FRAGMENT = "Outlook.pst"      # substring of Store FilePath to select the target PST/OST
TARGET_FOLDER_PATH  = r"\Inbox"         # path relative to the store root, e.g. r"\Inbox" or r"\Outlook\Outbox"; empty -> root of target store
DEFAULT_FOLDER_CODE = 6                # 6=Inbox, 5=Sent, 3=Deleted, 16=Drafts; NOTE: not read by this block — an empty TARGET_FOLDER_PATH resolves to the store root
RECURSIVE           = True             # recurse into subfolders
DRY_RUN             = True            # True = report only; False = actually move duplicates
KEEP_ORIGINAL_IN    = "deep"           # "root" -> keep the top-level copy; "deep" -> keep the deepest copy

# Subject / recipients normalization
SUBJECT_CASE_INSENSITIVE = True    # normalize subject to lowercase before comparison
SUBJECT_STRIP_INVISIBLES = True    # remove zero-width/control characters from subject
KEEP_RE_FWD_PREFIXES     = True    # keep "Re:" / "Fwd:" prefixes (no stripping)
ADDRESS_ROLE             = "sender"    # use "sender" or "recipient" to build the party key
RECIPIENT_INCLUDE_CC_BCC = True        # if ADDRESS_ROLE=="recipient", include CC/BCC in key

# Time / size matching
DATE_PRECISION = "minute"           # "minute" or "second"
TIME_FIELD     = "sent"             # "sent" | "received" | "none"
STRICT_TIME    = True               # require exact timestamp match at chosen precision
SIZE_ABS_TOL   = 4096               # allowed absolute size difference (bytes)

# Fallbacks for time extraction, tried in this order by get_time_repr when a value is empty
FALLBACK_RECEIVED_IF_MISSING = True   # use the other of SentOn / ReceivedTime
FALLBACK_MAPI_IF_MISSING     = True   # read the submit/delivery time property via PropertyAccessor
FALLBACK_CREATION_IF_MISSING = True   # use CreationTime, then LastModificationTime

# Strong identifiers (MAPI)
PR_CLIENT_SUBMIT_TIME    = "http://schemas.microsoft.com/mapi/proptag/0x00390040"
PR_MESSAGE_DELIVERY_TIME = "http://schemas.microsoft.com/mapi/proptag/0x0E060040"
PR_INTERNET_MESSAGE_ID   = "http://schemas.microsoft.com/mapi/proptag/0x1035001E"  # string
PR_SEARCH_KEY            = "http://schemas.microsoft.com/mapi/proptag/0x300B0102"  # binary

# Safe display address props (no Outlook Guard prompts)
PR_DISPLAY_TO  = "http://schemas.microsoft.com/mapi/proptag/0x0E04001E"
PR_DISPLAY_CC  = "http://schemas.microsoft.com/mapi/proptag/0x0E03001E"
PR_DISPLAY_BCC = "http://schemas.microsoft.com/mapi/proptag/0x0E02001E"

# Skip these folders by name (case-sensitive as Outlook shows).
# Applied only while recursing into subfolders; an explicitly targeted folder is still resolved.
SKIP_FOLDERS = {
    "Deleted Items","Удаленные","Удалённые",
    "Drafts","Черновики",
    "Junk E-mail","Нежелательная почта"
}

# Output paths (reuse REPORT_DIR from Block 1)
if "REPORT_DIR" not in globals():
    raise RuntimeError("REPORT_DIR not found. Run Block 1 first.")
LOG_PATH  = os.path.join(REPORT_DIR, "dupe_move.log")   # overwritten at the start of every run
CSV_MOVED = os.path.join(REPORT_DIR, "dupe_moved.csv")  # one row per detected duplicate

# -------- Helpers --------
# pywin32's time type is used for isinstance() checks in _safe_dt_to_str.
# If the import fails, a placeholder class is substituted so those checks never match.
try:
    from pywintypes import TimeType as _PyWinTime
except Exception:
    _PyWinTime = type("Dummy", (), {})

_SURROGATES_RE = re.compile(r'[\ud800-\udfff]')

def _sanitize(x):
    """Return ``str(x)`` in a form that can always be encoded as UTF-8 (``""`` for None).

    Text that already encodes cleanly is returned unchanged; otherwise lone
    surrogates are replaced and the result is round-tripped with ``errors="replace"``.
    """
    if x is None:
        return ""
    text = str(x)
    try:
        text.encode("utf-8")
    except UnicodeEncodeError:
        repaired = _SURROGATES_RE.sub("�", text)
        return repaired.encode("utf-8", "replace").decode("utf-8")
    return text

def log(msg):
    """Print a sanitized message and append it, prefixed with a timestamp, to LOG_PATH."""
    text = _sanitize(msg)
    print(text, flush=True)
    with open(LOG_PATH, "a", encoding="utf-8") as sink:
        stamp = _dt.datetime.now().isoformat(sep=" ", timespec="seconds")
        sink.write(f"{stamp} | {text}\n")

def get_folder_by_path_in_store(store, rel_path: str):
    r"""Resolve a path relative to the store's root folder, e.g. r"\Inbox" or r"\Folder\Subfolder".

    Args:
        store: object providing ``GetRootFolder()``; every folder on the way must
            expose a ``Folders`` collection that can be indexed by name.
        rel_path: backslash-separated folder names. Leading, trailing and doubled
            backslashes are ignored.

    Returns:
        The resolved folder; the store root when ``rel_path`` contains no names.

    Raises:
        Whatever the ``Folders`` lookup raises when a name does not exist.
    """
    folder = store.GetRootFolder()
    # SKIP_FOLDERS is deliberately not consulted: an explicitly requested path is always
    # resolved; skipping applies only while iterating subfolders.
    for name in (rel_path or "").split("\\"):
        if name:
            folder = folder.Folders[name]
    return folder

def iter_folders_recursive(folder):
    """Yield ``folder`` first, then every subfolder depth-first.

    Subfolders whose ``Name`` is in SKIP_FOLDERS are not yielded and not descended into.
    """
    yield folder
    position = 1
    last = folder.Folders.Count
    while position <= last:
        child = folder.Folders.Item(position)  # Outlook collections are 1-based
        position += 1
        if child.Name not in SKIP_FOLDERS:
            yield from iter_folders_recursive(child)

def _fmt_choice():
    """Return the strftime pattern for DATE_PRECISION: minutes for "minute", seconds otherwise."""
    to_the_minute = DATE_PRECISION == "minute"
    if to_the_minute:
        return "%Y-%m-%d %H:%M"
    return "%Y-%m-%d %H:%M:%S"

def _safe_dt_to_str(dt):
    """Format a timestamp-like value at the configured precision; never raises on bad values.

    Args:
        dt: a pywin32 time object, a ``datetime.datetime``, a string, or anything else.

    Returns:
        The formatted timestamp; a non-blank string input is returned stripped and
        otherwise unchanged; ``""`` for every other value or when formatting fails.
    """
    fmt = _fmt_choice()
    if isinstance(dt, _PyWinTime):
        # Prefer the UTC-based rendering; fall back to plain Format() when UTC() is unavailable.
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        try:
            return dt.UTC().Format(fmt)
        except Exception:
            pass
        try:
            return dt.Format(fmt)
        except Exception:
            return ""
    if isinstance(dt, _dt.datetime):
        return dt.strftime(fmt)
    if isinstance(dt, str) and dt.strip():
        return dt.strip()
    return ""

def normalize_subject(subj):
    """Normalize a subject for matching.

    Applies NFKC, removes zero-width characters and the BOM, optionally cleans
    NBSP/NUL and collapses whitespace, optionally lower-cases, and removes one
    leading reply/forward marker unless KEEP_RE_FWD_PREFIXES is set.
    Returns ``""`` for an empty/None subject.
    """
    if not subj:
        return ""
    text = unicodedata.normalize("NFKC", str(subj))
    text = re.sub(r"[\u200B-\u200D\uFEFF]", "", text)
    if SUBJECT_STRIP_INVISIBLES:
        text = text.replace("\xa0"," ").replace("\x00","")
        text = re.sub(r"\s+"," ", text.strip())
    if SUBJECT_CASE_INSENSITIVE:
        text = text.lower()
    if KEEP_RE_FWD_PREFIXES:
        return text
    return re.sub(r"^(re(\[\d+\])?|fw|fwd|ответ|пересылка)[:\-\s]+","", text, flags=re.I)

def _display_recipients_via_mapi(mail):
    """Read display strings for To/CC/BCC via PropertyAccessor to avoid Guard prompts.

    Each property is read on its own, so a failure on one field (for example a
    property that is absent on the item) does not discard the others.

    Returns:
        Tuple ``(to, cc, bcc)`` of lower-cased, whitespace-collapsed strings;
        ``""`` for any field that could not be read.
    """
    def _norm(s):
        s = (s or "").replace("\xa0", " ").replace("\x00", " ")
        return re.sub(r"\s+", " ", s.strip()).lower()

    def _read(accessor, tag):
        try:
            return accessor.GetProperty(tag) or ""
        except Exception:
            return ""

    try:
        pa = getattr(mail, "PropertyAccessor", None)
    except Exception:
        pa = None
    if not pa:
        return "", "", ""
    return (
        _norm(_read(pa, PR_DISPLAY_TO)),
        _norm(_read(pa, PR_DISPLAY_CC)),
        _norm(_read(pa, PR_DISPLAY_BCC)),
    )

def get_party(mail):
    """Build the party key: the sender address, or the normalized recipient display list.

    Recipient mode returns the sorted, de-duplicated, comma-joined display names
    from To (plus CC/BCC when RECIPIENT_INCLUDE_CC_BCC is set).
    """
    if ADDRESS_ROLE == "sender":
        sender = (getattr(mail, "SenderEmailAddress", "") or "").strip()
        sender = sender.replace("\xa0", " ").replace("\x00", " ")
        return re.sub(r"\s+", " ", sender).lower()

    to_s, cc_s, bcc_s = _display_recipients_via_mapi(mail)
    if not RECIPIENT_INCLUDE_CC_BCC:
        cc_s = bcc_s = ""
    names = {
        piece.strip()
        for field in (to_s, cc_s, bcc_s)
        if field
        for piece in re.split(r"[;,]", field)
        if piece.strip()
    }
    return ",".join(sorted(names))

def get_time_repr(mail):
    """Return the message timestamp as text according to TIME_FIELD, with fallbacks.

    Order of attempts: the preferred Outlook attribute (SentOn for "sent",
    ReceivedTime otherwise), then the other of the two, then the MAPI
    submit/delivery time property, then CreationTime / LastModificationTime.
    Returns ``""`` when TIME_FIELD is "none" or nothing usable is found.
    """
    if TIME_FIELD == "none":
        return ""

    if TIME_FIELD == "sent":
        preferred, alternate = "SentOn", "ReceivedTime"
    else:
        preferred, alternate = "ReceivedTime", "SentOn"
    try:
        stamp = getattr(mail, preferred, None)
        if not stamp:
            stamp = getattr(mail, alternate, None) if FALLBACK_RECEIVED_IF_MISSING else None
    except Exception:
        stamp = None

    text = _safe_dt_to_str(stamp)
    if text:
        return text

    if FALLBACK_MAPI_IF_MISSING:
        try:
            accessor = getattr(mail,"PropertyAccessor",None)
            if accessor:
                raw = accessor.GetProperty(PR_CLIENT_SUBMIT_TIME if TIME_FIELD=="sent" else PR_MESSAGE_DELIVERY_TIME)
                text = _safe_dt_to_str(raw)
                if text:
                    return text
        except Exception:
            pass

    if FALLBACK_CREATION_IF_MISSING:
        try:
            stamp = getattr(mail,"CreationTime",None) or getattr(mail,"LastModificationTime",None)
            text = _safe_dt_to_str(stamp)
            if text:
                return text
        except Exception:
            pass
    return ""

def size_ok(a, b):
    """Return True when the two sizes differ by at most SIZE_ABS_TOL bytes.

    Values that cannot be converted with ``int()`` (or any other error) yield False.
    """
    try:
        gap = abs(int(a) - int(b))
        verdict = gap <= SIZE_ABS_TOL
    except Exception:
        verdict = False
    return verdict

def get_ids(mail):
    """Return ``(InternetMessageId, SearchKeyHex)``; ``""`` for anything unavailable.

    The two properties are read independently, so an item without an Internet
    Message-ID still yields its search key.
    """
    inet_id, search_hex = "", ""
    try:
        pa = getattr(mail, "PropertyAccessor", None)
    except Exception:
        pa = None
    if not pa:
        return inet_id, search_hex

    try:
        inet_id = pa.GetProperty(PR_INTERNET_MESSAGE_ID) or ""
    except Exception:
        inet_id = ""

    try:
        raw = pa.GetProperty(PR_SEARCH_KEY)
        # Binary values may arrive as a memoryview rather than bytes, depending on the COM layer.
        if isinstance(raw, (bytes, bytearray, memoryview)):
            search_hex = bytes(raw).hex()
    except Exception:
        search_hex = ""
    return inet_id, search_hex

def _folder_depth(folder_path: str) -> int:
    r"""Return the number of backslashes in the path minus one, never below zero.

    Example: ``\Root\A\B`` -> 2. The value is only compared between folders.
    """
    separators = folder_path.count("\\")
    return separators - 1 if separators > 1 else 0

# -------- Main logic --------
# Start a fresh log for this run
with open(LOG_PATH, "w", encoding="utf-8") as f:
    f.write(f"=== MOVE RUN { _dt.datetime.now().isoformat(sep=' ', timespec='seconds') } ===\n")

# reuse MAPI namespace from Block 2, or connect if missing
if "ns" not in globals():
    from win32com.client import Dispatch, gencache
    try:
        gencache.EnsureDispatch("Outlook.Application")
    except Exception:
        # Best-effort only; the Dispatch call below is what actually connects.
        pass
    app = Dispatch("Outlook.Application")
    ns  = app.GetNamespace("MAPI")

# locate target store: first store whose FilePath contains the fragment (case-insensitive)
target = None
for i in range(1, ns.Stores.Count + 1):
    s = ns.Stores.Item(i)
    if TARGET_PST_FRAGMENT.lower() in (getattr(s, "FilePath", "") or "").lower():
        target = s
        break
if not target:
    log(f"[ERR] Store with fragment '{TARGET_PST_FRAGMENT}' not found.")
    raise SystemExit

# resolve root folder
root_folder = get_folder_by_path_in_store(target, TARGET_FOLDER_PATH or "\\")
root_path = root_folder.FolderPath
log(f"[Store] {target.DisplayName} | Root: {root_folder.FolderPath} | KEEP_ORIGINAL_IN={KEEP_ORIGINAL_IN} | DRY_RUN={DRY_RUN}")

# Folders created by this tool are named "Duplicates_<date>" and live under the scanned root.
DUPE_FOLDER_PREFIX = "Duplicates_"

def _is_quarantine_folder(folder_path):
    """True when a path segment below the scan root is a Duplicates_* folder from a previous run."""
    rel = folder_path[len(root_path):] if folder_path.startswith(root_path) else folder_path
    return any(seg.startswith(DUPE_FOLDER_PREFIX) for seg in rel.split("\\"))

# collect mail items (read-only pass)
olMailItem = 43
folders = list(iter_folders_recursive(root_folder)) if RECURSIVE else [root_folder]
all_items = []
for fld in folders:
    try:
        # Never rescan earlier quarantine folders: their copies could otherwise be chosen as the
        # canonical message (KEEP_ORIGINAL_IN="deep") and the real original moved away.
        if _is_quarantine_folder(fld.FolderPath):
            log(f"[SKIP] {fld.FolderPath}: holds duplicates moved by a previous run")
            continue
        items = fld.Items
        for idx in range(1, items.Count + 1):
            try:
                it = items.Item(idx)
            except Exception as e:
                log(f"[WARN] Fetch {fld.FolderPath} #{idx} failed: {_sanitize(e)}")
                continue
            if getattr(it, "Class", None) != olMailItem:
                continue
            mc = getattr(it, "MessageClass", "") or ""
            # Only IPM.Note* items are considered; report and meeting classes do not start
            # with "IPM.Note", so this single test already excludes them.
            if not mc.startswith("IPM.Note"):
                continue
            party = get_party(it)
            subj  = normalize_subject(getattr(it,"Subject",""))
            size  = int(getattr(it,"Size",0) or 0)
            tstr  = get_time_repr(it)
            if STRICT_TIME and not tstr:
                continue
            inet_id, search_hex = get_ids(it)
            entry_id = _sanitize(getattr(it,"EntryID",""))
            all_items.append({
                "FolderPath": _sanitize(fld.FolderPath),
                "Party": party, "Subject": subj, "Time": tstr, "Size": size,
                "InetId": inet_id, "SearchKeyHex": search_hex, "EntryID": entry_id
            })
    except Exception as e:
        log(f"[FOLDER FAIL] {fld.FolderPath}: {_sanitize(e)}")

log(f"[SCAN DONE] Collected messages: {len(all_items)}")

# group by composite key (strong IDs first)
def make_group_key(r):
    """Grouping key of a collected record: identifiers, party, subject and timestamp text."""
    return (r["InetId"] or "", r["SearchKeyHex"] or "", r["Party"], r["Subject"], r["Time"])

buckets = {}
for r in all_items:
    buckets.setdefault(make_group_key(r), []).append(r)

# prepare destination folder
dupe_folder_name = DUPE_FOLDER_PREFIX + _dt.datetime.now().strftime("%Y%m%d")
def ensure_dupe_folder(parent):
    """Return the child of ``parent`` named ``dupe_folder_name``, creating it if absent."""
    for i in range(1, parent.Folders.Count + 1):
        f = parent.Folders.Item(i)
        if f.Name == dupe_folder_name:
            return f
    return parent.Folders.Add(dupe_folder_name)

# Created lazily right before the first real move, so a dry run leaves the mailbox untouched.
dupe_folder = None
# Passing the store id lets GetItemFromID look the entry up in the target store explicitly.
target_store_id = getattr(target, "StoreID", None)

# choose canonical + move others
moved_rows, moved_total = [], 0
root_depth = _folder_depth(root_path)
for key, records in buckets.items():
    if len(records) < 2:
        continue
    # order by depth then by size (stable)
    recs = sorted(records, key=lambda r: (_folder_depth(r["FolderPath"]), r["Size"]))

    if KEEP_ORIGINAL_IN == "root":
        root_recs = [r for r in recs if _folder_depth(r["FolderPath"]) == root_depth]
        base = root_recs[0] if root_recs else recs[0]
    else:  # "deep"
        base = max(recs, key=lambda r: _folder_depth(r["FolderPath"]))

    base_sz = base["Size"]
    base_id = base["EntryID"]

    for r in recs:
        if r["EntryID"] == base_id:
            continue
        # size tolerance as final check
        if not size_ok(r["Size"], base_sz):
            continue
        moved_rows.append({
            "FolderPath": r["FolderPath"], "Party": r["Party"], "Subject": r["Subject"], "Time": r["Time"],
            "Size_Base": base_sz, "Size_Dupe": r["Size"], "EntryID_Dupe": r["EntryID"],
            "InetId": r["InetId"], "SearchKeyHex": r["SearchKeyHex"]
        })
        if not DRY_RUN:
            try:
                if dupe_folder is None:
                    dupe_folder = ensure_dupe_folder(root_folder)
                if target_store_id:
                    mail = ns.GetItemFromID(r["EntryID"], target_store_id)
                else:
                    mail = ns.GetItemFromID(r["EntryID"])
                mail.Move(dupe_folder)
                moved_total += 1
            except Exception as e:
                log(f"[MOVE ERR] {r['EntryID']}: {_sanitize(e)}")

# write CSV and summary (explicit columns keep the header even when nothing was found)
CSV_COLUMNS = [
    "FolderPath", "Party", "Subject", "Time",
    "Size_Base", "Size_Dupe", "EntryID_Dupe",
    "InetId", "SearchKeyHex",
]
pd.DataFrame(moved_rows, columns=CSV_COLUMNS).to_csv(CSV_MOVED, index=False, encoding="utf-8-sig")
log(f"[CSV] {CSV_MOVED}")
log(f"[RESULT] Duplicates detected: {len(moved_rows)} | Moved: {moved_total if not DRY_RUN else 0}")
log("=== DONE ===")
print(f"\nReports written to: {REPORT_DIR}")