In [243]:
import numpy as np

# ============================================================
# NumPy backend for QSVT + imag-extractor
# - avoids qiskit transpile + gate decomposition entirely
# - works great when dimU <= 256 or so (P <= ~16 in your setup)
# ============================================================

def _apply_rz_on_anc_inplace(psi: np.ndarray, N: int, phi: float):
    """
    Apply qiskit Rz(theta) on anc with theta = -2*phi.
    Rz(-2phi) = diag(e^{+i phi}, e^{-i phi}) on anc |0>,|1>.
    anc is assumed MSB, so anc=0 indices [0:N), anc=1 indices [N:2N).
    """
    # anc=0 half
    psi[:N] *= np.exp(1j * phi)
    # anc=1 half
    psi[N:] *= np.exp(-1j * phi)

def _apply_z_on_anc_inplace(psi: np.ndarray, N: int):
    """Apply Z on anc (MSB): anc=1 half flips sign."""
    psi[N:] *= -1.0

def _apply_W_inplace(psi: np.ndarray, U_block: np.ndarray, N: int, tmp: np.ndarray):
    """
    W = Z_anc * U_block  (since in circuit: apply U_block then Z)
    """
    # tmp = U_block @ psi
    tmp[:] = U_block @ psi
    psi[:] = tmp
    _apply_z_on_anc_inplace(psi, N)

def _apply_W_dagger_inplace(psi: np.ndarray, U_block_dag: np.ndarray, N: int, tmp: np.ndarray):
    """
    W† = (Z U)† = U† Z
    So apply Z first, then apply U†.
    """
    _apply_z_on_anc_inplace(psi, N)
    tmp[:] = U_block_dag @ psi
    psi[:] = tmp

def apply_qsvt_to_state_numpy(
    psi: np.ndarray,
    U_block: np.ndarray,
    phases: np.ndarray,
) -> np.ndarray:
    """
    Apply the same unitary as build_qsvt_from_phases(...) but directly to a state vector.
    Assumes:
      - U_block shape = (2N,2N)
      - anc is MSB => N = dimU//2
    Circuit order in your code:
      for phi in phases[:-1]:
         Rz(-2phi) on anc
         W
      Rz(-2*phases[-1]) on anc
    So unitary is: U = Rz_last · (W Rz_{L-2}) · ... · (W Rz_0)
    """
    psi = np.asarray(psi, dtype=np.complex128).copy()
    phases = np.asarray(phases, dtype=float).reshape(-1)

    dimU = U_block.shape[0]
    assert U_block.shape == (dimU, dimU)
    assert dimU % 2 == 0
    N = dimU // 2

    tmp = np.zeros_like(psi)

    # loop: Rz then W
    for phi in phases[:-1]:
        _apply_rz_on_anc_inplace(psi, N, float(phi))
        _apply_W_inplace(psi, U_block, N, tmp)

    # final Rz
    _apply_rz_on_anc_inplace(psi, N, float(phases[-1]))
    return psi

def apply_qsvt_dagger_to_state_numpy(
    psi: np.ndarray,
    U_block: np.ndarray,
    phases: np.ndarray,
) -> np.ndarray:
    """
    Apply U_qsvt† to a state vector, matching qsvt.inverse() in your code.
    Inverse sequence:
      Rz_last†, then for k=L-2..0: (W† then Rz_k†)
    Since Rz(-2phi)† = Rz(+2phi), we implement by using -phi in _apply_rz helper:
      _apply_rz_on_anc_inplace with phi -> -phi applies Rz(+2phi).
    """
    psi = np.asarray(psi, dtype=np.complex128).copy()
    phases = np.asarray(phases, dtype=float).reshape(-1)

    dimU = U_block.shape[0]
    assert dimU % 2 == 0
    N = dimU // 2

    U_block_dag = U_block.conj().T
    tmp = np.zeros_like(psi)

    # Rz_last† = Rz(+2*phi_last) => use (-phi_last) in our helper
    _apply_rz_on_anc_inplace(psi, N, float(-phases[-1]))

    # then for k = L-2..0: apply W† then Rz_k†
    for phi in phases[-2::-1]:
        _apply_W_dagger_inplace(psi, U_block_dag, N, tmp)
        _apply_rz_on_anc_inplace(psi, N, float(-phi))

    return psi

def imag_extractor_numpy(
    bHp: np.ndarray,
    U_block: np.ndarray,
    phases: np.ndarray,
) -> tuple[np.ndarray, float]:
    """
    Numpy equivalent of build_imag_extractor + Statevector sim + postselect ctrl=1, anc=0.

    Starting state: |anc=0> ⊗ |bHp>
      (dimU = 2N, anc MSB so first half is anc=0)

    After the interferometer:
      ctrl=1 component = (U|psi> - U†|psi>)/2

    Then postselect anc=0 => take first N amps.
    Returns:
      y_vec (length N), p_succ = ||y_vec||^2
    """
    bHp = np.asarray(bHp, dtype=np.complex128).reshape(-1)
    bHp = bHp / (np.linalg.norm(bHp) + 1e-18)

    dimU = U_block.shape[0]
    assert dimU % 2 == 0
    N = dimU // 2
    assert bHp.shape[0] == N, f"bHp len={len(bHp)} but expected N={N}"

    psi = np.zeros((dimU,), dtype=np.complex128)
    psi[:N] = bHp  # anc=0

    upsi   = apply_qsvt_to_state_numpy(psi, U_block, phases)
    upsid  = apply_qsvt_dagger_to_state_numpy(psi, U_block, phases)

    y_total = 0.5 * (upsi - upsid)    # ctrl=1 branch (up to global phase)
    y_vec   = y_total[:N].copy()      # anc=0 postselect

    p_succ = float(np.vdot(y_vec, y_vec).real)
    return y_vec, p_succ


In [244]:
# =========================
# QSVT pipeline (clean)
# - classical tuning (no pyqsp)
# - QSP feasibility check (sym_qsp)
# - final QSVT with odd(Im(U00)) extractor via LCU (U - U^\dagger)/2
# =========================

import numpy as np
from dataclasses import dataclass
from typing import Callable, Iterable, Optional, Tuple, List

from qiskit import QuantumCircuit, transpile
from qiskit.circuit.library import UnitaryGate, StatePreparation
from qiskit.quantum_info import Statevector, Operator

from pyqsp.angle_sequence import QuantumSignalProcessingPhases
from pyqsp.poly import PolyTaylorSeries


# ---------- A construction ----------
def make_A7_strict(
    P: int,
    a1_cos: float,
    a_diag_cos: np.ndarray,
    a_tail_cos: np.ndarray,
    a1_sin: float,
    a_diag_sin: np.ndarray,
    a_tail_sin: np.ndarray,
) -> np.ndarray:
    a_diag_cos = np.asarray(a_diag_cos, dtype=float).reshape(-1)
    a_tail_cos = np.asarray(a_tail_cos, dtype=float).reshape(-1)
    a_diag_sin = np.asarray(a_diag_sin, dtype=float).reshape(-1)
    a_tail_sin = np.asarray(a_tail_sin, dtype=float).reshape(-1)

    assert a_diag_cos.shape[0] == P - 1
    assert a_tail_cos.shape[0] == P - 1
    assert a_diag_sin.shape[0] == P - 1
    assert a_tail_sin.shape[0] == P - 1

    A_cos = np.zeros((P - 1, P + 1), dtype=float)
    A_sin = np.zeros((P - 1, P + 1), dtype=float)

    for i in range(P - 1):
        A_cos[i, 0] = a1_cos
        A_sin[i, 0] = a1_sin

        A_cos[i, i + 1] = -a_diag_cos[i]
        A_sin[i, i + 1] = -a_diag_sin[i]

        A_cos[i, P] = a_tail_cos[i]
        A_sin[i, P] = a_tail_sin[i]

    return np.vstack([A_cos, A_sin]).astype(np.complex128)


# ---------- Hermitian embedding ----------
def hermitian_embedding(A: np.ndarray) -> np.ndarray:
    A = np.asarray(A, dtype=np.complex128)
    m, n = A.shape
    Zm = np.zeros((m, m), dtype=np.complex128)
    Zn = np.zeros((n, n), dtype=np.complex128)
    return np.block([[Zm, A], [A.conj().T, Zn]])

def make_consistent_b(A: np.ndarray, rng: np.random.Generator):
    A = np.asarray(A, dtype=np.complex128)
    x_true = rng.normal(size=A.shape[1]) + 0j
    b = A @ x_true
    return b, x_true

def embed_linear_system(A: np.ndarray, b: np.ndarray):
    A = np.asarray(A, dtype=np.complex128)
    b = np.asarray(b, dtype=np.complex128).reshape(-1)
    m, n = A.shape
    H = hermitian_embedding(A)
    bH = np.concatenate([b, np.zeros(n, dtype=np.complex128)], axis=0)
    return H, bH


# ---------- padding ----------
def next_pow2(n: int) -> int:
    return 1 << (n - 1).bit_length()

def pad_to_pow2_hermitian(M: np.ndarray, pad_diag: float = 1.0) -> np.ndarray:
    M = np.asarray(M, dtype=np.complex128)
    n = M.shape[0]
    N = next_pow2(n)
    if N == n:
        return M
    Mp = np.zeros((N, N), dtype=np.complex128)
    Mp[:n, :n] = M
    Mp[n:, n:] = pad_diag * np.eye(N - n, dtype=np.complex128)
    return Mp

def pad_vec_to_len(v: np.ndarray, N: int) -> np.ndarray:
    v = np.asarray(v, dtype=np.complex128).reshape(-1)
    out = np.zeros((N,), dtype=np.complex128)
    out[:len(v)] = v
    return out


# ---------- Halmos block-encoding with forced alpha ----------
def halmos_block_encode_hermitian_forced_alpha(H: np.ndarray, alpha: float):
    H = np.asarray(H, dtype=np.complex128)
    H = (H + H.conj().T) / 2
    N = H.shape[0]
    Atil = H / alpha

    w, V = np.linalg.eigh(Atil)
    w = np.clip(w, -1.0, 1.0)
    s = np.sqrt(np.clip(1.0 - w*w, 0.0, None))
    B = V @ np.diag(s) @ V.conj().T

    U = np.block([[Atil, B],
                  [B,   -Atil]])

    unit_err = np.linalg.norm(U.conj().T @ U - np.eye(2*N))
    tl_err = np.linalg.norm(U[:N, :N] - Atil)
    return U, float(unit_err), float(tl_err)


# ---------- target g(x): odd regularized inverse with safe global scaling ----------
def _raw_reginv(x: np.ndarray, mu: float) -> np.ndarray:
    x = np.asarray(x, dtype=float)
    return x / (x*x + mu*mu)

def target_regularized_inverse_fixedscale_safe(
    x: np.ndarray,
    mu: float,
    max_scale: float,
    L: float = 0.9,
    safety_grid: int = 4096,
) -> np.ndarray:
    """
    g(x) = c * x/(x^2+mu^2), with a *global* c chosen so that max_{|x|<=max_scale}|g(x)| <= L.
    Extra safety: numerically re-check on dense grid and shrink if needed.
    Also enforce oddness numerically: (g(x)-g(-x))/2.
    """
    x = np.asarray(x, dtype=float)

    # analytic max of |x/(x^2+mu^2)| on [-max_scale,max_scale]
    if max_scale >= mu:
        max_abs = 1.0 / (2.0 * mu)
    else:
        max_abs = max_scale / (max_scale*max_scale + mu*mu)

    c = (L / max_abs) if max_abs > 0 else 1.0
    y = c * _raw_reginv(x, mu)

    # numeric "parity + safety" pass
    # parity enforce:
    y = 0.5 * (y - (c * _raw_reginv(-x, mu)))

    # safety check on grid (avoid weird overshoot due to float / edge cases)
    xs = np.linspace(-max_scale, max_scale, safety_grid)
    ys = c * _raw_reginv(xs, mu)
    ys = 0.5 * (ys - (c * _raw_reginv(-xs, mu)))
    m = float(np.max(np.abs(ys)))
    if m > L * (1.0 + 1e-6):
        c2 = (L / m)
        y *= c2  # shrink globally
    return y


def make_g(mu: float, max_scale: float, L: float) -> Callable[[np.ndarray], np.ndarray]:
    def g(x):
        return target_regularized_inverse_fixedscale_safe(x, mu=mu, max_scale=max_scale, L=L)
    return g


# ---------- classical apply f(H)b where f(λ)=g(λ/alpha) ----------
def classical_apply_fHb(Hp: np.ndarray, bHp: np.ndarray, alpha: float, g_func) -> np.ndarray:
    Hp = np.asarray(Hp, dtype=np.complex128)
    bHp = np.asarray(bHp, dtype=np.complex128).reshape(-1)

    w, V = np.linalg.eigh(Hp)
    lam_tilde = (w / alpha).real
    g_vals = g_func(lam_tilde)  # real
    y = V @ (g_vals * (V.conj().T @ bHp))
    return y

from typing import Any
# ---------- QSP phase synthesis wrapper (feasibility check) ----------
@dataclass
class QSPResult:
    ok: bool
    reason: str
    phases: Optional[np.ndarray] = None
    g: Optional[Callable[[np.ndarray], np.ndarray]] = None
    poly: Optional[Any] = None          # ★ 타입 힌트 추가
    domain_err: Optional[float] = None
    poly_max: Optional[float] = None

def try_synthesize_qsp_phases(
    mu: float,
    degree: int,
    max_scale: float,
    L: float,
    poly_max_tol: float = 1.02,     # allow a little slack; if huge, sym_qsp often dies
    domain_err_tol: float = 5e-3,   # if approximation is bad, skip
    grid: int = 1201,
) -> QSPResult:
    g = make_g(mu=mu, max_scale=max_scale, L=L)

    # build poly approximation in Chebyshev basis on [-max_scale, max_scale]
    try:
        poly = PolyTaylorSeries().taylor_series(
            func=g,
            degree=degree,
            max_scale=max_scale,
            chebyshev_basis=True,
        )
    except Exception as e:
        return QSPResult(False, f"PolyTaylorSeries failed: {type(e).__name__}: {e}")

    # evaluate on grid
    xs = np.linspace(-max_scale, max_scale, grid)
    gx = g(xs)
    px = poly(xs)
    domain_err = float(np.max(np.abs(gx - px)))
    poly_max = float(np.max(np.abs(px)))

    # quick sanity filters (prevents wasting sym_qsp calls)
    if (not np.isfinite(domain_err)) or (not np.isfinite(poly_max)):
        return QSPResult(False, "non-finite domain_err/poly_max", g=g, poly=poly, domain_err=domain_err, poly_max=poly_max)

    # If poly blows up (like 5, 10, 12...), sym_qsp 거의 확실히 발산/실패함
    if poly_max > poly_max_tol:
        return QSPResult(False, f"poly_max too large ({poly_max:.3g} > {poly_max_tol})", g=g, poly=poly, domain_err=domain_err, poly_max=poly_max)

    if domain_err > domain_err_tol:
        return QSPResult(False, f"domain_err too large ({domain_err:.3g} > {domain_err_tol})", g=g, poly=poly, domain_err=domain_err, poly_max=poly_max)

    # sym_qsp attempt
    try:
        phases, _, _ = QuantumSignalProcessingPhases(
            poly,
            method="sym_qsp",
            chebyshev_basis=True,
        )
    except Exception as e:
        return QSPResult(False, f"sym_qsp failed: {type(e).__name__}: {e}", g=g, poly=poly, domain_err=domain_err, poly_max=poly_max)

    phases = np.array(phases, dtype=float).reshape(-1)
    if len(phases) != (degree + 1) and len(phases) != (degree + 2):
        # pyqsp 버전에 따라 길이가 다를 수 있는데, 너무 이상하면 경고
        pass

    return QSPResult(True, "ok", phases=phases, g=g, poly=poly, domain_err=domain_err, poly_max=poly_max)


# ---------- QSVT circuit ----------
def build_qubiterate(U_block: np.ndarray, anc: int) -> QuantumCircuit:
    dim = U_block.shape[0]
    n_qubits = int(np.log2(dim))
    assert 2**n_qubits == dim, f"U_block dim={dim} not power-of-2"

    qc = QuantumCircuit(n_qubits)
    qc.append(UnitaryGate(U_block, label="U_A", check_input=False), list(range(n_qubits)))
    qc.z(anc)  # W = (2Π-I)U  with Π=|0><0| on anc
    return qc

def build_qsvt_from_phases(U_block: np.ndarray, phases: np.ndarray, anc: int) -> QuantumCircuit:
    W_gate = build_qubiterate(U_block, anc).to_gate(label="W")
    n = int(np.log2(U_block.shape[0]))
    qc = QuantumCircuit(n)

    for phi in phases[:-1]:
        qc.rz(-2.0 * float(phi), anc)  # Rz(-2phi) = e^{+i phi Z}
        qc.append(W_gate, list(range(n)))
    qc.rz(-2.0 * float(phases[-1]), anc)
    return qc


# ---------- postselect util ----------
def postselect_bits(sv: np.ndarray, n_qubits: int, fixed: dict[int, int]):
    keep = [q for q in range(n_qubits) if q not in fixed]
    out = np.zeros((2**len(keep),), dtype=np.complex128)

    for idx, amp in enumerate(sv):
        ok = True
        for q, v in fixed.items():
            if ((idx >> q) & 1) != v:
                ok = False
                break
        if not ok:
            continue

        j = 0
        for k, q in enumerate(keep):
            j |= (((idx >> q) & 1) << k)
        out[j] = amp

    p = float(np.vdot(out, out).real)
    return out, p, keep


# ---------- odd-target extractor: (U - U†)/2 via LCU ----------
def build_imag_extractor(U_circ: QuantumCircuit, sys_state: np.ndarray, sys_qubits: list[int]):
    """
    Append ctrl qubit (index = nq).
    Prepare sys_state on sys_qubits (anc remains |0> implicitly).
    Apply controlled-U on ctrl=0 and controlled-U† on ctrl=1,
    then H on ctrl. Postselect ctrl=1 gives (U - U†)/2.
    Then also postselect anc=0 => i*Im(U00) acting on sys.
    """
    nq = U_circ.num_qubits
    ctrl = nq
    qc = QuantumCircuit(nq + 1)

    b = np.asarray(sys_state, dtype=np.complex128).reshape(-1)
    b = b / (np.linalg.norm(b) + 1e-18)
    qc.append(StatePreparation(b), sys_qubits)

    Ug = U_circ.to_gate(label="Uqsvt")
    Ugd = U_circ.inverse().to_gate(label="Uqsvt†")

    qc.h(ctrl)

    # ctrl=0 -> apply U (X-control trick)
    qc.x(ctrl)
    qc.append(Ug.control(1), [ctrl] + list(range(nq)))
    qc.x(ctrl)

    # ctrl=1 -> apply U†
    qc.append(Ugd.control(1), [ctrl] + list(range(nq)))

    qc.h(ctrl)
    return qc, ctrl


# ---------- debug helpers ----------
def report_gate_stats(qc, basis_gates=None, opt=1):
    if basis_gates is None:
        tqc = transpile(qc, optimization_level=opt)
    else:
        tqc = transpile(qc, basis_gates=basis_gates, optimization_level=opt)
    return {"depth": tqc.depth(), "count_ops": dict(tqc.count_ops()), "num_qubits": tqc.num_qubits}

def debug_embedding(A: np.ndarray, H: np.ndarray):
    A = np.asarray(A, dtype=np.complex128)
    H = np.asarray(H, dtype=np.complex128)
    s = np.linalg.svd(A, compute_uv=False)
    w = np.linalg.eigvalsh((H + H.conj().T) / 2)
    w_sorted = np.sort(w)

    print(">>> Embedding debug")
    print("singular values σ(A):", np.round(s, 6))
    print("eigenvalues λ(H):", np.round(w_sorted, 6))
    print("min/max |λ(H)|:", float(np.min(np.abs(w_sorted))), float(np.max(np.abs(w_sorted))))
    print()


# ---------- candidate + tuning ----------
@dataclass
class Candidate:
    score: float
    objective: str
    max_scale: float
    mu: float
    pad_diag: float
    alpha: float
    rel_err_class: float
    resid_class: float

def tune_params_classical(
    A: np.ndarray,
    b: np.ndarray,
    x_ls: np.ndarray,
    max_scale_grid: Iterable[float],
    mu_grid: Iterable[float],
    pad_diag_grid: Iterable[float],
    L: float = 0.9,
    objective: str = "resid",
    top_k: int = 8,
    verbose: bool = True,
) -> Tuple[List[Candidate], np.ndarray, np.ndarray, np.ndarray]:
    """
    returns: (top_candidates_sorted, H, bH, x_ls)
    """
    m, n = A.shape
    H, bH = embed_linear_system(A, b)
    H = (H + H.conj().T) / 2

    cands: List[Candidate] = []

    for pad_diag in pad_diag_grid:
        Hp = pad_to_pow2_hermitian(H, pad_diag=pad_diag)
        N = Hp.shape[0]
        bHp = pad_vec_to_len(bH, N)
        bHp = bHp / (np.linalg.norm(bHp) + 1e-18)

        w = np.linalg.eigvalsh(Hp)
        rho = float(np.max(np.abs(w))) + 1e-18

        y_target = np.concatenate([np.zeros(m, dtype=np.complex128), x_ls], axis=0)
        y_target = pad_vec_to_len(y_target, N)

        for max_scale in max_scale_grid:
            alpha = rho / max_scale  # ensures spectrum(Hp/alpha) within [-max_scale, max_scale]

            for mu in mu_grid:
                g = make_g(mu=mu, max_scale=max_scale, L=L)
                y_class = classical_apply_fHb(Hp, bHp, alpha, g)

                sc = (np.vdot(y_class, y_target) /
                      (np.vdot(y_class, y_class) + 1e-18))
                y_hat = sc * y_class
                x_hat = y_hat[m:m+n]

                rel_err = float(np.linalg.norm(x_hat - x_ls) /
                                (np.linalg.norm(x_ls) + 1e-18))
                resid   = float(np.linalg.norm(A @ x_hat - b) /
                                (np.linalg.norm(b) + 1e-18))

                if objective == "resid":
                    score = resid
                elif objective == "rel_err":
                    score = rel_err
                else:
                    score = resid + 0.5 * rel_err

                cand = Candidate(
                    score=float(score),
                    objective=objective,
                    max_scale=float(max_scale),
                    mu=float(mu),
                    pad_diag=float(pad_diag),
                    alpha=float(alpha),
                    rel_err_class=float(rel_err),
                    resid_class=float(resid),
                )
                cands.append(cand)

    cands.sort(key=lambda c: c.score)
    return cands[:top_k], H, bH, x_ls

# =====================================================
# NUMPY MODE: apply QSVT by directly multiplying U_block
# =====================================================
def qsvt_apply_numpy(U_block: np.ndarray, phases: np.ndarray, bHp: np.ndarray, anc: int):
    # anc 인덱스는 현재 구현에서 MSB로 고정되어 있으므로 사용하지 않음
    y_vec, p_succ = imag_extractor_numpy(
        bHp=bHp,
        U_block=U_block,
        phases=phases,
    )
    return y_vec, p_succ

 


# =====================================================
# GATE MODE: Qiskit circuit execution
# =====================================================
def qsvt_apply_gate(U_block, phases, anc, bHp, sys_qubits):
    qsvt = build_qsvt_from_phases(U_block, phases, anc)
    qc_im, ctrl = build_imag_extractor(qsvt, bHp, sys_qubits)
    sv = Statevector.from_instruction(qc_im).data
    y_vec, p_succ, _ = postselect_bits(sv, qc_im.num_qubits,
                                       fixed={ctrl: 1, anc: 0})
    return y_vec, p_succ, qc_im


# =====================================================
# Unified interface
# =====================================================
def run_qsvt(U_block, phases, bHp, anc, sys_qubits, mode="gate"):
    """
    mode="gate"  → Qiskit 회로 실행
    mode="numpy" → numpy 유니터리 곱으로 QSVT 빠르게 시뮬레이션
    """
    if mode == "gate":
        y_vec, p_succ, qc = qsvt_apply_gate(U_block, phases, anc, bHp, sys_qubits)
        return {
            "y": y_vec,
            "p_succ": p_succ,
            "qc": qc,
            "mode": "gate",
        }
    elif mode == "numpy":
        y_vec, p_succ = qsvt_apply_numpy(U_block, phases, bHp, anc)
        return {
            "y": y_vec,
            "p_succ": p_succ,
            "qc": None,
            "mode": "numpy",
        }
    else:
        raise ValueError("mode must be 'gate' or 'numpy'")



# ---------- main pipeline ----------
def qsvt_solve_from_Ab(
    A: np.ndarray,
    b: np.ndarray,
    *,
    degree=120,
    max_scale_grid=(0.4, 0.5, 0.6, 0.7, 0.8, 0.9),
    mu_grid=(0.02, 0.04, 0.06, 0.08, 0.12),
    pad_diag_grid=(1.0, 2.0),
    L=0.6,
    objective="resid",
    top_k=8,
    poly_max_tol=1.5,
    domain_err_tol=2e-1,
    spec_err_tol=2e-1,
    mode="numpy",      # "numpy" or "gate"
):
    A = np.asarray(A, dtype=np.complex128)
    b = np.asarray(b, dtype=np.complex128).reshape(-1)
    m, n = A.shape
    assert b.shape[0] == m

    print(">>> [NLOS] A,b injected")
    print("A shape:", A.shape, "b shape:", b.shape)
    s = np.linalg.svd(A, compute_uv=False)
    print("singular values σ(A):", np.round(s, 4))
    print("cond(A) =", float(s[0] / (s[-1] + 1e-18)))
    print()

    # classical LS reference
    x_ls = np.linalg.lstsq(A, b, rcond=None)[0]

    # 1) Hermitian embedding
    H, bH = embed_linear_system(A, b)
    H = (H + H.conj().T) / 2
    debug_embedding(A, H)

    # 2) Classical tuning (grid search)
    print(">>> Tuning (classical) start")
    top_cands, H, bH, x_ls = tune_params_classical(
        A=A, b=b, x_ls=x_ls,
        max_scale_grid=max_scale_grid,
        mu_grid=mu_grid,
        pad_diag_grid=pad_diag_grid,
        L=L,
        objective=objective,
        top_k=top_k,
        verbose=True,
    )

    # 3) QSP feasibility on top candidates
    feasible = None
    feasible_qsp = None
    feasible_spec_err = None

    for i, c in enumerate(top_cands):
        print(f">>> QSP try cand[{i}] pad={c.pad_diag}, max_scale={c.max_scale}, mu={c.mu}, degree={degree}")

        qsp_res = try_synthesize_qsp_phases(
            mu=c.mu, degree=degree, max_scale=c.max_scale, L=L,
            poly_max_tol=poly_max_tol, domain_err_tol=domain_err_tol,
        )
        if not qsp_res.ok:
            print("   -> FAIL:", qsp_res.reason)
            if qsp_res.poly_max is not None or qsp_res.domain_err is not None:
                print(f"      poly_max={qsp_res.poly_max} domain_err={qsp_res.domain_err}")
            print()
            continue

        Hp = pad_to_pow2_hermitian(H, pad_diag=c.pad_diag)
        w_Hp = np.linalg.eigvalsh(Hp)
        lam_tilde = (w_Hp / c.alpha).real

        gx = qsp_res.g(lam_tilde)
        px = qsp_res.poly(lam_tilde)
        spec_err = float(np.max(np.abs(gx - px)))

        print(f"   ok: domain_err={qsp_res.domain_err:.3e}, poly_max={qsp_res.poly_max:.6f}, spec_err={spec_err:.3e}")
        if spec_err > spec_err_tol:
            print(f"   -> SKIP: spec_err too large ({spec_err:.3g} > {spec_err_tol})\n")
            continue

        feasible = c
        feasible_qsp = qsp_res
        feasible_spec_err = spec_err
        print("   -> SELECTED\n")
        break

    if feasible is None:
        raise RuntimeError("No feasible candidate. Try larger mu_grid, smaller L, or smaller max_scale.")

    # 4) Build Hp/bHp for chosen candidate
    pad_diag = feasible.pad_diag
    alpha = feasible.alpha
    Hp = pad_to_pow2_hermitian(H, pad_diag=pad_diag)
    N = Hp.shape[0]
    bHp = pad_vec_to_len(bH, N)
    bHp = bHp / (np.linalg.norm(bHp) + 1e-18)

    print(">>> Selected params")
    print(f"pad_diag={pad_diag}, max_scale={feasible.max_scale}, mu={feasible.mu}, alpha={alpha:.6e}")
    print(f"QSP: domain_err={feasible_qsp.domain_err:.3e}, poly_max={feasible_qsp.poly_max:.6f}, spec_err={feasible_spec_err:.3e}")
    print()

    # 5) Halmos block encoding
    U_block, unit_err, tl_err = halmos_block_encode_hermitian_forced_alpha(Hp, alpha=alpha)
    print(">>> Halmos debug")
    print(f"[halmos] ||U†U-I||={unit_err:.3e}, ||tl-H/α||={tl_err:.3e}")
    if unit_err > 1e-6:
        raise RuntimeError(f"U_block not unitary enough: {unit_err:.3e}")
    print()

    dimU = U_block.shape[0]
    nq = int(np.log2(dimU))
    anc = nq - 1
    sys_qubits = list(range(nq - 1))

    # 6) Run QSVT (numpy or gate)
    out = run_qsvt(
        U_block=U_block,
        phases=feasible_qsp.phases,
        bHp=bHp,
        anc=anc,
        sys_qubits=sys_qubits,
        mode=mode,
    )
    y_vec = out["y"]
    p_succ = out["p_succ"]

    # 7) Extract x_hat (same 방식)
    y_target = np.concatenate([np.zeros(m, dtype=np.complex128), x_ls], axis=0)
    y_target = pad_vec_to_len(y_target, N)

    sc_q = (np.vdot(y_vec, y_target) / (np.vdot(y_vec, y_vec) + 1e-18))
    y_hat = sc_q * y_vec
    x_hat = y_hat[m:m+n]

    rel_err = float(np.linalg.norm(x_hat - x_ls) / (np.linalg.norm(x_ls) + 1e-18))
    resid   = float(np.linalg.norm(A @ x_hat - b) / (np.linalg.norm(b) + 1e-18))

    print(">>> QSVT vs LS")
    print("p_succ =", p_succ)
    print("rel_err(x_hat vs x_ls) =", rel_err)
    print("resid(A x_hat - b)/||b|| =", resid)
    print()

    return {
        "x_hat": x_hat,
        "x_ls": x_ls,
        "p_succ": p_succ,
        "rel_err": rel_err,
        "resid": resid,
        "chosen": feasible,
        "qsp_domain_err": feasible_qsp.domain_err,
        "qsp_poly_max": feasible_qsp.poly_max,
        "qsp_spec_err": feasible_spec_err,
        "mode": mode,
    }




# ---- run ----
# if __name__ == "__main__":
#     debug_qsvt_pipeline_clean(
#     P=16,
#     seed=0,
#     degree=200,                   # 120 → 80 정도로 더 낮춤 (sym_qsp 부담 줄이기)
#     max_scale_grid=(0.7,0.8),
#     mu_grid=(0.02,0.03,0.04,0.06),   # μ를 더 크게: 덜 뾰족한 g
#     pad_diag_grid=(1,),
#     L=0.8,                       # 0.7 → 0.6 (더 작게)
#     objective="resid",
#     top_k=5,

#     # ★ 여기 완화 포인트들
#     poly_max_tol=1.2,            # 1.01 → 1.5 정도로 완화
#     domain_err_tol=1e-1,         # 5e-3 → 2e-2
#     spec_err_tol=1e-1,           # 5e-3 → 2e-2

#     show_small_block_diag=True,
# )




In [281]:
def dist(a: np.ndarray, b: np.ndarray):
    if a.shape != b.shape:
        raise ValueError("inconsistent point shape")
    return np.sqrt(np.sum((a - b) ** 2))

In [282]:
class NLOStestcase:
    """
    One randomized test case (NLOS geometry). Does not enforce NLOS by itself,
    just generates geometry.
    """
    def __init__(self: "NLOStestcase", sample_point_cnt: int, HV_r_min: float, HV_r_max: float, 
                 sample_r_min: float, sample_r_max: float, HV_orientation: float | None = None,
                 HV: tuple[float, float, float] | None = None, sample_points: tuple | list | None = None):
        
        if HV_r_min < 0 or HV_r_max < 0 or HV_r_min > HV_r_max:
            raise ValueError("HV radius restriction inappropriate")
        if HV_orientation is not None and (HV_orientation < 0 or HV_orientation > 2 * np.pi):
            raise ValueError("Inappropriate HV orientation")
        if sample_r_min < 0 or sample_r_max < 0 or sample_r_max < sample_r_min:
            raise ValueError("sample point distance restriction inappropriate")
        if sample_point_cnt <= 0:
            raise ValueError(f"Too small sample_point_cnt: {sample_point_cnt}")
        if (sample_points is not None and len(sample_points) != sample_point_cnt):
            raise ValueError("Sample point count/actual list mismatch")

        self.SV = np.zeros(3)

        self.sample_r_min = sample_r_min
        self.sample_r_max = sample_r_max
        
        if HV is None:
            self.HV = self._sample_point_in_shell(HV_r_min, HV_r_max)
            print(f"HV position set randomly: {self.HV}")
        else:
            self.HV = HV
            print(f"HV position set to given parameter: {self.HV}")
        
        if HV_orientation is None:
            self.HV_orientation = np.random.random() * 2 * np.pi
        else:
            self.HV_orientation = HV_orientation

        # if dist between HV and SV is greater than sample_r_max * 2, throw
        print("SV: {0}, HV: {1}".format(self.HV, self.SV))
        if dist(self.HV, self.SV) > sample_r_max * 2.5:
            raise ValueError("sample_r_max is too small")
        
        # set the reflection sample points
        self.sample_point_cnt = sample_point_cnt
        if sample_points is None:
            points = set()
            while len(points) < self.sample_point_cnt:
                point = self._sample_point_in_shell(sample_r_min, sample_r_max)
                if sample_r_min < dist(point, self.HV) < sample_r_max:
                    points.add(tuple(point))
            self.sample_points = [np.array(x) for x in points]
        else:
            self.sample_points = sample_points

        # convert each point to NLOS-used spherical coordinates
        self.sample_spherical_points = []
        for x in self.sample_points:
            self.sample_spherical_points.append(np.array(self._to_NLOS_coordinates(x)))
        
        # add rho * c(time difference) to last array member
        self._add_rho()

        # sort by dist
        self.sample_spherical_points.sort(key=lambda x: x[0])
        print([x[6] for x in self.sample_spherical_points])

        # initialize
        self.A = np.zeros((3 * (self.sample_point_cnt - 1), (self.sample_point_cnt + 1)))
        self.b = np.zeros((3 * (self.sample_point_cnt - 1),))

    def _random_direction_3d(self) -> np.ndarray:
        """
        Sample a random direction uniformly on the unit sphere S^2.
        """
        v = np.random.normal(size=3)
        norm = np.linalg.norm(v)
        if norm < 1e-12:
            # extremely unlikely; resample if it happens
            return self._random_direction_3d()
        return v / norm
    
    def _sample_point_in_shell(self, r_min: float, r_max: float) -> np.ndarray:
        """
        Sample a random 3D point whose distance from the origin is in [r_min, r_max].
        Direction is uniform; radius is uniform in [r_min, r_max] (not volume-uniform).
        """
        direction = self._random_direction_3d()
        r = r_min + (r_max - r_min) * np.random.rand()
        return r * direction
    
    def _cartesian_to_spherical(self, point: np.ndarray):
        r = dist(np.array([0, 0, 0]), point)
        rho = np.sqrt(np.sum(point[:2] ** 2))
        phi = np.arctan2(point[1], point[0])
        psi = np.arctan2(rho, point[2])
        return r, phi, psi
    
    def _to_NLOS_coordinates(self, point: np.ndarray):
        """returns d, v, theta, vartheta, phi, psi"""
        v, theta, var_theta = self._cartesian_to_spherical(point)
        v_p, phi_p, psi = self._cartesian_to_spherical(point - self.HV)
        
        phi = phi_p - self.HV_orientation
        if phi < 0: 
            phi += 2 * np.pi

        return (v + v_p, v, theta, var_theta, phi, psi, 0)
    
    def _add_rho(self):
        d_1 = min([x[0] for x in self.sample_spherical_points])
        for x in self.sample_spherical_points:
            x[-1] = x[0] - d_1

    def set_A(self, omega):
        # self sphereical points:
        # d, v, theta, var_theta, phi, psi, rho * c
        # 0, 1, 2,     3,         4,   5,   6
        
        # first point
        p1 = self.sample_spherical_points[0]

        # x direction matrix: assume point list is sorted
        for i, x in enumerate(self.sample_spherical_points[1:]):
            # - ( sin(var_theta) * cos(theta) + sin(psi) cos(phi+w) )
            self.A[i, 0] = -(np.sin(p1[3]) * np.cos(p1[2]) + np.sin(p1[5]) * np.cos(p1[4] + omega))

            # sin(var_theta) * cos(theta) + sin(psi) cos(phi+w)
            self.A[i, i + 1] = (np.sin(x[3]) * np.cos(x[2]) + np.sin(x[5]) * np.cos(x[4] + omega))
            
            # sin(psi_1) cos(phi_1 + omega) - sin(psi_p) cos(phi_p + omega)
            self.A[i, self.sample_point_cnt] = np.sin(p1[5]) * np.cos(p1[4] + omega) - np.sin(x[5]) * np.cos(x[4] + omega)
        
        # y direction matrix
        for i, x in enumerate(self.sample_spherical_points[1:]):
            self.A[(self.sample_point_cnt - 1) + i, 0] = -(np.sin(p1[3]) * np.sin(p1[2]) + np.sin(p1[5]) * np.sin(p1[4] + omega))

            self.A[(self.sample_point_cnt - 1) + i, i + 1] = (np.sin(x[3]) * np.sin(x[2]) + np.sin(x[5]) * np.sin(x[4] + omega))

            self.A[(self.sample_point_cnt - 1) + i, self.sample_point_cnt] = np.sin(p1[5]) * np.sin(p1[4] + omega) - np.sin(x[5]) * np.sin(x[4] + omega)
        
        # z direction matrix
        for i, x in enumerate(self.sample_spherical_points[1:]):
            self.A[(self.sample_point_cnt - 1) * 2 + i, 0] = -(np.cos(p1[3]) + np.cos(p1[5]))

            self.A[(self.sample_point_cnt - 1) * 2 + i, i + 1] = (np.cos(x[3]) + np.cos(x[5]))
            
            self.A[(self.sample_point_cnt - 1) * 2 + i, self.sample_point_cnt] = np.cos(p1[5]) - np.cos(x[5])
        
        print("A successfully set")

    def set_b(self, omega):
        # self sphereical points:
        # d, v, theta, var_theta, phi, psi, rho * c
        # 0, 1, 2,     3,         4,   5,   6

        # x direction
        for i, x in enumerate(self.sample_spherical_points[1:]):
            self.b[i] = x[6] * np.sin(x[5]) * np.cos(x[4] + omega)
        
        # y direction
        for i, x in enumerate(self.sample_spherical_points[1:]):
            self.b[(self.sample_point_cnt - 1) + i] = x[6] * np.sin(x[5]) * np.sin(x[4] + omega)
        
        # z direction
        for i, x in enumerate(self.sample_spherical_points[1:]):
            self.b[(self.sample_point_cnt - 1) * 2 + i] = x[6] * np.cos(x[5])

        print("b successfully mounted")

In [283]:
def make_nlos_Ab(
    sample_point_cnt=8,
    HV_r_min=10, HV_r_max=15,
    sample_r_min=8, sample_r_max=12,
    omega=None,
    seed=None,
):
    """
    NLOStestcase로부터 (A,b)를 생성해서 반환.
    omega=None이면 testcase.HV_orientation를 그대로 사용.
    """
    if seed is not None:
        np.random.seed(seed)

    testcase = NLOStestcase(sample_point_cnt, HV_r_min, HV_r_max, sample_r_min, sample_r_max)
    if omega is None:
        omega = testcase.HV_orientation

    testcase.set_A(omega)
    testcase.set_b(omega)

    A = np.asarray(testcase.A, dtype=np.complex128)
    b = np.asarray(testcase.b, dtype=np.complex128).reshape(-1)
    return A, b, testcase


def rescale_Ab_by_smax(A: np.ndarray, b: np.ndarray, eps=1e-18):
    """
    A,b를 A의 최대 특이값(smax)로 나눔. (x_ls는 변하지 않음)
    """
    s = np.linalg.svd(A, compute_uv=False)
    smax = float(np.max(s))
    scale = max(smax, eps)
    return A / scale, b / scale, scale


In [284]:

# 1) NLOS에서 A,b 만들기
A, b, tc = make_nlos_Ab(
    sample_point_cnt=8,
    HV_r_min=10, HV_r_max=15,
    sample_r_min=8, sample_r_max=12,
    seed=1
)

# 2) (추천) 스케일 안정화
A2, b2, scale = rescale_Ab_by_smax(A, b)
print(">>> rescale by smax:", scale)

# 3) QSVT로 LS 풀기 (빠른 검증은 numpy, 정확도 cross-check는 gate)
res = qsvt_solve_from_Ab(
    A2, b2,
    degree=200,
    max_scale_grid=(0.7,0.8),
    mu_grid=(0.01,0.015,0.02,0.04),
    pad_diag_grid=(1.0,),
    L=0.7,
    top_k=8,
    poly_max_tol=1.3,
    domain_err_tol=2e-1,
    spec_err_tol=2e-1,
    mode="numpy",   # 여기 "gate"로 바꾸면 느리지만 검증됨
)

print("x_hat =", res["x_hat"])
print("x_ls  =", res["x_ls"])


HV position set randomly: [10.72910638 -4.04076607 -3.48867368]
SV: [10.72910638 -4.04076607 -3.48867368], HV: [0. 0. 0.]
[0.0, 1.4901858537628634, 1.632662239720883, 2.0639953347297, 2.690401792704339, 3.2433349381747796, 3.9423598184865547, 4.412604601495076]
A successfully set
b successfully mounted
>>> rescale by smax: 4.999157013473087
>>> [NLOS] A,b injected
A shape: (21, 9) b shape: (21,)
singular values σ(A): [1.     0.4801 0.337  0.3359 0.3293 0.326  0.3245 0.1792 0.0271]
cond(A) = 36.926856403923615

>>> Embedding debug
singular values σ(A): [1.       0.480139 0.336971 0.335899 0.329328 0.325988 0.324494 0.179156
 0.027081]
eigenvalues λ(H): [-1.       -0.480139 -0.336971 -0.335899 -0.329328 -0.325988 -0.324494
 -0.179156 -0.027081 -0.       -0.       -0.       -0.       -0.
 -0.        0.        0.        0.        0.        0.        0.
  0.027081  0.179156  0.324494  0.325988  0.329328  0.335899  0.336971
  0.480139  1.      ]
min/max |λ(H)|: 5.554230320225925e-19 1.000000