# In [None]:
import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score

from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector

from sklearn.preprocessing import StandardScaler


# 1) Load QM7, Flatten 23x23 Coulomb Matrices => 529 features


def load_qm7_coulomb_flat(matfile='qm7.mat'):
    """
    Load the QM7 dataset from `matfile` and flatten each Coulomb matrix.

    Args:
      matfile: path to a MATLAB file containing the keys 'X' (one matrix per
               sample along the first axis) and 'T' (target energies).

    Returns:
      X_flat: (N, F) array, one flattened matrix per row
              (F = 529 for the 23x23 QM7 Coulomb matrices)
      Y:      (N,) array of target energies

    Raises:
      KeyError: if 'X' or 'T' is missing from the file.
    """
    data = scipy.io.loadmat(matfile)
    C_matrices = data['X']          # expected shape: (N, 23, 23)
    energies = data['T'].ravel()    # shape: (N,)

    # One reshape replaces the per-sample flatten/append loop. The feature
    # count is computed explicitly (instead of using -1) so that an empty
    # dataset still yields a well-formed (0, F) array.
    N = C_matrices.shape[0]
    n_features = int(np.prod(C_matrices.shape[1:]))
    X_flat = C_matrices.reshape(N, n_features)
    return X_flat, energies


# 2) Amplitude Encoding Helper


def amplitude_encoding_circuit(vec, num_qubits):
    """
    Create a circuit that amplitude-encodes 'vec' into 'num_qubits' qubits.

    Steps:
      - Reject vectors longer than 2^num_qubits
      - Zero-pad to length 2^num_qubits
      - Normalize to unit L2 norm (a numerically-zero vector is mapped to
        the basis state |0...0>)
      - Prepare the state with qc.initialize(...)

    Args:
      vec:        1-D sequence of real or complex feature values.
      num_qubits: number of qubits in the returned circuit.

    Returns:
      A QuantumCircuit on `num_qubits` qubits.

    Raises:
      ValueError: if len(vec) > 2^num_qubits.
    """
    dim = 2**num_qubits
    length = len(vec)
    # Validate before allocating anything so bad input fails fast.
    if length > dim:
        raise ValueError(f"Cannot encode {length} features in {num_qubits} qubits (max {dim}).")

    padded = np.zeros(dim, dtype=complex)
    padded[:length] = vec

    norm = np.linalg.norm(padded)
    if norm < 1e-9:
        # Nothing meaningful to normalize: fall back to the exact |0...0>
        # state, clearing any tiny residual amplitudes so the vector has
        # unit norm exactly.
        padded[:] = 0.0
        padded[0] = 1.0
    else:
        padded /= norm

    qc = QuantumCircuit(num_qubits)
    qc.initialize(padded, range(num_qubits))
    return qc

def build_statevectors(X, num_qubits):
    """
    Amplitude-encode every row of X and collect the resulting statevectors.

    Each row X[i] is turned into a circuit by amplitude_encoding_circuit,
    and the circuit's statevector is stored as row i of the result.

    Returns a complex array of shape (N, 2^num_qubits), where N = X.shape[0].
    """
    n_samples = X.shape[0]
    hilbert_dim = 2**num_qubits
    states = np.zeros((n_samples, hilbert_dim), dtype=complex)
    for row in range(n_samples):
        circuit = amplitude_encoding_circuit(X[row], num_qubits)
        states[row] = Statevector.from_instruction(circuit).data
    return states

def compute_kernel_statevector(sv_data):
    """
    Compute the symmetric fidelity kernel of a batch of statevectors.

    Args:
      sv_data: array of shape (N, D), one statevector per row.

    Returns:
      K: real (N, N) array with K[i, j] = |<sv_i | sv_j>|^2.
    """
    N = sv_data.shape[0]
    K = np.zeros((N, N))
    # Conjugate once up front; row i of sv_conj is the "bra" <sv_i|.
    sv_conj = sv_data.conj()
    for i in range(N):
        # All overlaps <sv_i | sv_j> for j >= i in one matrix-vector product,
        # instead of one Python-level np.vdot call per (i, j) pair.
        overlaps = sv_data[i:] @ sv_conj[i]
        fidelities = np.abs(overlaps) ** 2
        # Fill the upper-triangle row and mirror it, so K is exactly symmetric.
        K[i, i:] = fidelities
        K[i:, i] = fidelities
    return K


# 3) Main: Feature + Target Scaling + Amplitude Encoding


def _cross_kernel_statevector(sv_rows, sv_cols):
    """Return K[i, j] = |<sv_rows[i] | sv_cols[j]>|^2 for two statevector batches."""
    # conj(rows) @ cols.T evaluates every inner product in one matrix product.
    return np.abs(sv_rows.conj() @ sv_cols.T) ** 2


def main_qsvr_scaled_coulomb_matrix_demo(
    matfile='qm7.mat',
    subset_size=600,
    test_size=0.2,
    num_qubits=10,
    random_seed=42
):
    """
    Run the amplitude-encoded kernel SVR demo on a random QM7 subset.

    Pipeline:
      1) Load the dataset, flattened to 529 features per molecule
      2) Draw `subset_size` molecules without replacement
      3) Train/test split
      4) Standardize features and targets using TRAINING statistics only,
         so nothing about the test set leaks into the fitted model
      5) Amplitude-encode -> direct statevector overlap kernel
      6) Fit an SVR on the precomputed training kernel
      7) Predict, map predictions back to the original target units,
         and print MAE and R^2 against the unscaled test targets

    Args:
      matfile:     path to the QM7 .mat file.
      subset_size: number of molecules to sample.
      test_size:   forwarded to train_test_split.
      num_qubits:  qubits used for amplitude encoding (2^num_qubits must be
                   at least the number of features).
      random_seed: seed for both the subsampling and the split.

    Raises:
      ValueError: if subset_size exceeds the number of molecules loaded.
    """
    # Load flattened data
    X_all, Y_all = load_qm7_coulomb_flat(matfile)
    Ntotal = len(X_all)
    print(f"Loaded {Ntotal} molecules with flattened 23x23 => 529 features.")

    # Subsample. A private RandomState draws the same indices as
    # np.random.seed(random_seed) + np.random.choice would, without
    # reseeding NumPy's global generator as a side effect.
    if subset_size > Ntotal:
        raise ValueError(
            f"subset_size={subset_size} exceeds the {Ntotal} molecules available."
        )
    rng = np.random.RandomState(random_seed)
    idxs = rng.choice(Ntotal, size=subset_size, replace=False)
    X_sub = X_all[idxs]
    Y_sub = Y_all[idxs]

    # Split BEFORE fitting any scaler so test rows never influence the
    # statistics used to transform the training data.
    X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
        X_sub,
        Y_sub,
        test_size=test_size,
        random_state=random_seed
    )

    # (1) Scale features: fit on train, apply the same transform to test
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # (2) Scale targets with training mean/std
    Y_mean = np.mean(Y_train)
    Y_std = np.std(Y_train)
    if Y_std == 0:
        # Constant targets: avoid dividing by zero, leave them centered only.
        Y_std = 1.0
    Y_train_scaled = (Y_train - Y_mean) / Y_std

    print(f"Train size={X_train.shape[0]}, Test size={X_test.shape[0]}")
    print(f"Amplitude encoding with {num_qubits} qubits => dimension=2^{num_qubits} >= 529.\n")

    # Build statevectors for train
    print("Building amplitude-encoded statevectors for training set...")
    sv_train = build_statevectors(X_train, num_qubits=num_qubits)
    print("Computing training kernel (direct overlap) ...")
    K_train = compute_kernel_statevector(sv_train)

    # Fit SVR with precomputed kernel. `gamma` is intentionally not passed:
    # it only parameterizes scikit-learn's built-in kernels and has no
    # effect when kernel='precomputed'.
    svr = SVR(kernel='precomputed', C=1e4, epsilon=0.01)
    svr.fit(K_train, Y_train_scaled)

    # Build statevectors for test
    print("\nBuilding amplitude-encoded statevectors for test set...")
    sv_test = build_statevectors(X_test, num_qubits=num_qubits)

    # Cross-kernel test x train, shape (N_test, N_train)
    K_test = _cross_kernel_statevector(sv_test, sv_train)

    # Predict in scaled space, then invert the target transform
    Y_pred_scaled = svr.predict(K_test)
    Y_pred = Y_pred_scaled * Y_std + Y_mean

    # Evaluate against the untouched test targets
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    print("\nQuantum SVR (Amp Encoding, Full Coulomb, Scaled) Results:")
    print(f"  Subset size: {subset_size}")
    print(f"  MAE = {mae:.3f}")
    print(f"  R^2 = {r2:.3f}")


if __name__ == "__main__":
    # Example run: the settings are collected in one mapping and unpacked
    # into the demo entry point.
    demo_settings = dict(
        matfile='qm7.mat',
        subset_size=7165,
        test_size=0.2,
        num_qubits=10,
        random_seed=42,
    )
    main_qsvr_scaled_coulomb_matrix_demo(**demo_settings)


# In [None]:
# “best” support-vector indices for each representation

import numpy as np
import scipy.io
from scipy.linalg import eigh
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

from qiskit import QuantumCircuit
from qiskit.quantum_info import Statevector

# Ordered list of 300 row indices (30 rows of 10) used to pick samples for
# the 529-feature (flattened Coulomb matrix) representation.
# main_qsvr_best300_coulomb takes the first `subset_size` entries and uses
# them to index the loaded feature and target arrays, so the order matters.
# NOTE(review): every value is < 400, which suggests these may have been
# derived from a 400-sample subset rather than the full dataset — confirm
# that indexing the full arrays with them is intended.
BEST_IDS_529 = [
    193, 19, 94, 158, 370, 0, 148, 281, 218, 214,
    52, 87, 186, 35, 337, 244, 267, 1, 342, 79,
    232, 183, 364, 348, 139, 369, 64, 188, 338, 304,
    128, 11, 172, 92, 393, 97, 149, 386, 350, 222,
    398, 181, 213, 164, 41, 143, 67, 271, 210, 202,
    275, 372, 141, 258, 247, 105, 211, 23, 130, 166,
    86, 124, 171, 132, 47, 208, 104, 243, 324, 399,
    336, 159, 51, 365, 46, 305, 390, 257, 196, 119,
    233, 12, 242, 274, 144, 106, 394, 215, 341, 366,
    382, 251, 250, 288, 272, 317, 8, 209, 63, 137,
    295, 111, 110, 53, 238, 54, 69, 253, 45, 109,
    167, 206, 195, 173, 17, 343, 371, 216, 322, 40,
    121, 308, 355, 312, 20, 198, 353, 294, 36, 321,
    160, 204, 284, 282, 248, 161, 225, 234, 347, 15,
    201, 279, 185, 140, 7, 96, 200, 177, 65, 133,
    169, 254, 349, 120, 344, 117, 155, 95, 127, 175,
    346, 71, 178, 33, 162, 77, 352, 392, 179, 88,
    240, 184, 118, 345, 311, 81, 262, 91, 359, 306,
    190, 291, 18, 276, 231, 326, 269, 24, 199, 31,
    189, 146, 174, 135, 157, 145, 287, 325, 70, 389,
    59, 300, 101, 285, 122, 277, 339, 21, 273, 358,
    333, 100, 226, 180, 299, 237, 142, 297, 309, 80,
    14, 10, 116, 5, 363, 203, 114, 72, 197, 212,
    377, 256, 89, 223, 307, 289, 4, 50, 316, 43,
    235, 27, 268, 42, 56, 129, 375, 255, 266, 246,
    303, 22, 85, 385, 354, 82, 278, 368, 29, 138,
    25, 361, 383, 207, 310, 236, 13, 150, 192, 290,
    388, 75, 245, 315, 125, 263, 313, 397, 49, 351,
    153, 239, 228, 296, 194, 379, 205, 230, 2, 298,
    131, 113, 261, 314, 32, 384, 252, 37, 98, 84
]

#1) Load QM7, Flatten 23×23 Coulomb Matrices → 529 features 
def load_qm7_coulomb_flat(matfile='qm7.mat'):
    """Read `matfile` and return (flattened matrices, targets).

    The 'X' entry is iterated along its first axis and each item is
    flattened into one row of the returned 2-D array; the 'T' entry is
    returned as a 1-D array.
    """
    contents = scipy.io.loadmat(matfile)
    matrices = contents['X']            # one matrix per sample, e.g. (N, 23, 23)
    targets = contents['T'].ravel()     # (N,)
    flattened_rows = [matrix.flatten() for matrix in matrices]
    return np.stack(flattened_rows, axis=0), targets

#2) Amplitude-encoding helper
def amplitude_encoding_circuit(vec, num_qubits):
    """Build a circuit whose state amplitudes are the normalized, zero-padded `vec`.

    Args:
        vec: 1-D sequence of real or complex feature values.
        num_qubits: number of qubits; the state has 2**num_qubits amplitudes.

    Returns:
        A QuantumCircuit on `num_qubits` qubits prepared via `initialize`.

    Raises:
        ValueError: if `vec` has more than 2**num_qubits entries.
    """
    dim = 2**num_qubits
    length = len(vec)
    # Explicit check with a readable message; without it an oversized vector
    # only fails later with an opaque NumPy broadcasting error.
    if length > dim:
        raise ValueError(f"Cannot encode {length} features in {num_qubits} qubits (max {dim}).")

    padded = np.zeros(dim, dtype=complex)
    padded[:length] = vec

    norm = np.linalg.norm(padded)
    if norm < 1e-9:
        # Numerically-zero input cannot be normalized: use the exact |0...0>
        # basis state and drop any tiny residual amplitudes.
        padded[:] = 0.0
        padded[0] = 1.0
    else:
        padded /= norm

    qc = QuantumCircuit(num_qubits)
    qc.initialize(padded, range(num_qubits))
    return qc

def build_statevectors(X, num_qubits):
    """Return the amplitude-encoded statevector of every row of X.

    Row i of the (X.shape[0], 2**num_qubits) complex result holds the
    statevector of the circuit built from X[i].
    """
    n_rows = X.shape[0]
    hilbert_dim = 2**num_qubits
    amplitudes = np.zeros((n_rows, hilbert_dim), dtype=complex)
    for idx in range(n_rows):
        encoded = Statevector.from_instruction(
            amplitude_encoding_circuit(X[idx], num_qubits)
        )
        amplitudes[idx] = encoded.data
    return amplitudes

def compute_kernel_statevector(sv):
    """Return the N x N matrix K with K[i, j] = |<sv[i] | sv[j]>|^2.

    Only pairs with j >= i are evaluated; each value is mirrored into the
    lower triangle.
    """
    size = sv.shape[0]
    K = np.zeros((size, size))
    for row, bra in enumerate(sv):
        for col in range(row, size):
            fidelity = abs(np.vdot(bra, sv[col]))**2
            K[row, col] = fidelity
            K[col, row] = K[row, col]
    return K

#3) Main: QSVR on top-300 by BEST_IDS_529
def _test_train_fidelity_kernel(sv_test, sv_train):
    """Return K[i, j] = |<sv_test[i] | sv_train[j]>|^2 as an (N_test, N_train) array."""
    # One matrix product replaces the nested per-pair np.vdot comprehension.
    return np.abs(sv_test.conj() @ sv_train.T) ** 2


def main_qsvr_best300_coulomb(
    matfile='qm7.mat',
    subset_size=300,    # must be ≤ len(BEST_IDS_529)
    test_size=0.2,
    num_qubits=10,      # ⌈log2(529)⌉=10
    random_seed=42
):
    """Run the amplitude-encoded kernel SVR on the first `subset_size` BEST_IDS_529 samples.

    Steps: load the flattened data, select the rows listed in BEST_IDS_529,
    split into train/test, standardize features and targets with training
    statistics only, build the statevector overlap kernels, fit an SVR on
    the precomputed kernel, and print MAE and R² in original target units.

    Args:
        matfile: path to the QM7 .mat file.
        subset_size: how many leading entries of BEST_IDS_529 to use.
        test_size: forwarded to train_test_split.
        num_qubits: qubits used for amplitude encoding.
        random_seed: seed for the train/test split.

    Raises:
        ValueError: if subset_size exceeds len(BEST_IDS_529).
    """
    # 1) Load flattened data
    X_all, Y_all = load_qm7_coulomb_flat(matfile)
    N = len(X_all)
    print(f"Loaded {N} molecules with 529-D features.")

    # 2) Take top subset_size IDs
    if subset_size > len(BEST_IDS_529):
        raise ValueError("subset_size exceeds BEST_IDS_529 length")
    idxs = BEST_IDS_529[:subset_size]
    X_sub, Y_sub = X_all[idxs], Y_all[idxs]
    print(f"Subsampled top {subset_size} via BEST_IDS_529.")

    # 3) Train/Test split first, so the scalers below never see test rows
    X_train_raw, X_test_raw, y_train_raw, y_true = train_test_split(
        X_sub, Y_sub, test_size=test_size, random_state=random_seed
    )

    # 4) Scale features & targets using training statistics only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    Y_mean, Y_std = y_train_raw.mean(), y_train_raw.std()
    if Y_std == 0:
        # Constant targets: avoid dividing by zero, leave them centered only.
        Y_std = 1.0
    y_train = (y_train_raw - Y_mean) / Y_std
    print(f"Train={len(X_train)}, Test={len(X_test)}, Qubits={num_qubits}")

    # 5) Build train kernel
    sv_train = build_statevectors(X_train, num_qubits)
    K_train = compute_kernel_statevector(sv_train)

    # 6) Fit SVR. `gamma` is intentionally not passed: it only parameterizes
    #    scikit-learn's built-in kernels and is unused with kernel='precomputed'.
    svr = SVR(kernel='precomputed', C=1e4, epsilon=0.01)
    svr.fit(K_train, y_train)

    # 7) Build test kernel (rows: test samples, columns: train samples)
    sv_test = build_statevectors(X_test, num_qubits)
    K_test = _test_train_fidelity_kernel(sv_test, sv_train)

    # 8) Predict in scaled space & invert the target transform
    y_pred_s = svr.predict(K_test)
    y_pred = y_pred_s * Y_std + Y_mean

    # 9) Evaluate against the untouched test targets; the header reports the
    #    subset size actually used instead of a hard-coded 300.
    print(f"\nQSVR on top-{subset_size} Coulomb (amp-encoded):")
    print(f"  MAE = {mean_absolute_error(y_true, y_pred):.3f}")
    print(f"  R²  = {r2_score(y_true, y_pred):.3f}")

#4) Command-line entrypoint
if __name__ == "__main__":
    # Script entry point: settings are gathered in one mapping and unpacked.
    run_settings = {
        "matfile": "qm7.mat",
        "subset_size": 300,  # works up to 300
        "test_size": 0.2,
        "num_qubits": 10,
        "random_seed": 42,
    }
    main_qsvr_best300_coulomb(**run_settings)