In [None]:
import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


# 1) Load QM7, Flatten 23x23 Coulomb Matrices => 529 features


def load_qm7_coulomb_flat(matfile='qm7.mat'):
    """
    Load the QM7 dataset from `matfile` and flatten every Coulomb matrix.

    The file is read with `scipy.io.loadmat`; its 'X' entry supplies the
    matrices and its 'T' entry the target energies.

    Args:
      matfile: path to the MATLAB .mat file.

    Returns:
      X_flat: (N, F) array with one row-major flattened matrix per row
              (F = 529 for 23x23 Coulomb matrices)
      Y:      (N,) array of target energies
    """
    data = scipy.io.loadmat(matfile)
    C_matrices = data['X']          # expected shape: (N, 23, 23)
    energies = data['T'].ravel()    # shape: (N,)

    n_samples = C_matrices.shape[0]
    # Explicit feature count (instead of -1) keeps the reshape well defined
    # even when the file holds zero matrices.
    n_features = int(np.prod(C_matrices.shape[1:]))

    # A single C-order reshape is equivalent to calling .flatten() on each
    # matrix and stacking the rows, without the Python-level loop.
    X_flat = np.ascontiguousarray(C_matrices.reshape(n_samples, n_features))
    return X_flat, energies


# 2) Classical SVR with Feature + Target Scaling


def main_classical_svr_scaled_coulomb_matrix_demo(
    matfile='qm7.mat',
    subset_size=500,
    test_size=0.2,
    random_seed=42,
    use_eigenvalues=False
):
    """
    Train and evaluate an RBF-kernel SVR on a random subset of QM7.

    Steps:
    1) Loads dataset, flattens each 23x23 => 529 features
       (optionally replaced by the 23 eigenvalues of each matrix)
    2) Subsamples `subset_size` molecules without replacement
    3) Train/test split
    4) Scales features with StandardScaler fitted on the training split
    5) Scales targets with the training mean/std
    6) Fits a classical SVR (RBF)
    7) Inverts target transform, prints final MAE, R^2

    Args:
      matfile:         path to the QM7 .mat file.
      subset_size:     number of molecules to sample.
      test_size:       forwarded to `train_test_split`.
      random_seed:     seed for both the subsampling and the split.
      use_eigenvalues: if True, use the sorted eigenvalues of each
                       symmetrized 23x23 matrix as features instead of the
                       flattened matrix. Defaults to False (flat features).

    Returns:
      None. Results are printed.

    Raises:
      ValueError: from the sampler if `subset_size` exceeds the number of
                  molecules in the file.
    """
    # 1) Load data
    X_all, Y_all = load_qm7_coulomb_flat(matfile)
    Ntotal = len(X_all)
    print(f"Loaded {Ntotal} molecules with flattened 23x23 => 529 features.")

    # 1b) (Optional) switch to 23-dimensional eigenvalue features.
    # This used to be a mis-indented triple-quoted string that had to be
    # edited by hand; it is now controlled by `use_eigenvalues`.
    if use_eigenvalues:
        print("Computing 23 eigenvalues for each matrix…")
        X_eig = []
        for M_flat in X_all:
            M = M_flat.reshape(23, 23)              # back to 23x23
            M = 0.5 * (M + M.T)                     # symmetrize
            e_vals = np.linalg.eigvalsh(M)          # ascending order
            X_eig.append(e_vals)
        X_all = np.array(X_eig)                     # shape (Ntotal, 23)
        print("Converted to 23-dimensional eigenvalue features.")

    # 2) Subsample
    # A local RandomState seeded with `random_seed` draws the same indices as
    # np.random.seed(...) + np.random.choice(...) without touching the
    # global NumPy random state.
    rng = np.random.RandomState(random_seed)
    idxs = rng.choice(Ntotal, size=subset_size, replace=False)
    X_sub = X_all[idxs]
    Y_sub = Y_all[idxs]

    # 3) Train/test split (done BEFORE scaling so that no test-set statistics
    #    leak into the preprocessing)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_sub, Y_sub,
        test_size=test_size,
        random_state=random_seed
    )
    print(f"Train size={X_train.shape[0]}, Test size={X_test.shape[0]}")

    # 4) Scale features using training statistics only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 5) Scale targets using training statistics only
    Y_mean = np.mean(Y_train)
    Y_std = np.std(Y_train)
    Y_train_scaled = (Y_train - Y_mean) / Y_std

    # 6) Fit classical SVR with RBF kernel
    svr = SVR(kernel='rbf', C=1e4, gamma=1e-3, epsilon=0.01)
    svr.fit(X_train_scaled, Y_train_scaled)

    # Predict (in scaled space)
    Y_pred_scaled = svr.predict(X_test_scaled)

    # 7) Invert target transform; Y_test was never scaled, so it is used as-is
    Y_pred = Y_pred_scaled * Y_std + Y_mean

    # Evaluate
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    n_features = X_all.shape[1]
    print(f"\nClassical SVR (RBF) on {n_features}D Coulomb + Scaled Features/Targets:")
    print(f"  Subset size: {subset_size}")
    print(f"  MAE = {mae:.3f}")
    print(f"  R^2 = {r2:.3f}")

if __name__ == "__main__":
    # Script entry point: run the demo with a subset of 7165 molecules
    # (assumes qm7.mat holds at least that many — the sampler raises otherwise).
    main_classical_svr_scaled_coulomb_matrix_demo(
        subset_size=7165, matfile='qm7.mat', random_seed=42, test_size=0.2,
    )


In [None]:
#FINDS THE BEST 300 QUANTUM SUPPORT VECTORS AND RANKS THEM

import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


#Loaders 
def load_flat(matfile='qm7.mat'):
    """
    Read `matfile` and return its matrices as flat feature rows.

    Returns a tuple (X_flat, Y): X_flat stacks the flattened 'X' matrices
    row by row, Y is the 'T' entry raveled to one dimension.
    """
    mat = scipy.io.loadmat(matfile)
    targets = mat['T'].ravel()

    rows = []
    for matrix in mat['X']:         # expected shape (N,23,23)
        rows.append(matrix.flatten())

    return np.stack(rows, axis=0), targets

def load_eig(matfile='qm7.mat'):
    """
    Read `matfile` and return the eigenvalue spectrum of each 'X' matrix.

    Every matrix is symmetrized as 0.5*(M + M.T) before its eigenvalues are
    computed; each spectrum is returned in ascending order.
    Returns a tuple (spectra, Y) with Y the raveled 'T' entry.
    """
    mat = scipy.io.loadmat(matfile)
    matrices, targets = mat['X'], mat['T'].ravel()

    def _sorted_spectrum(matrix):
        # Symmetrize first so eigvalsh sees an exactly symmetric input.
        symmetric = 0.5*(matrix + matrix.T)
        return np.sort(np.linalg.eigvalsh(symmetric))

    spectra = [_sorted_spectrum(matrix) for matrix in matrices]
    return np.array(spectra), targets


#SVR + Support‐Vector Inspection
def inspect_svr_support(loader, feature_name, subset_size=500, test_size=0.2,
                        random_seed=42, top_k=300, matfile='qm7.mat'):
    """
    Trains an SVR on a random subset of the provided features,
    then prints the top-`top_k` support vectors by |dual_coef_|.

    Args:
      loader:       callable taking a .mat path and returning (X, Y).
      feature_name: label used in the printed header.
      subset_size:  number of samples drawn (without replacement).
      test_size:    forwarded to `train_test_split`.
      random_seed:  seed for both the subsampling and the split.
      top_k:        how many support vectors to report (default 300). Fewer
                    are reported if the model has fewer support vectors.
      matfile:      path handed to `loader` (default 'qm7.mat').

    Returns:
      None. The ranking and the test metrics are printed.
    """
    # 1) Load & subsample
    X_all, Y_all = loader(matfile)
    # Local RandomState: same draws as np.random.seed(...) followed by
    # np.random.choice(...), but the global NumPy RNG is left untouched.
    rng = np.random.RandomState(random_seed)
    idxs = rng.choice(len(X_all), size=subset_size, replace=False)
    X_sub, Y_sub = X_all[idxs], Y_all[idxs]

    # 2) Train/test split. `idxs` is split alongside the data so each training
    #    row can be traced back to its row in the full dataset.
    X_train, X_test, y_train, y_test, idx_train, _ = train_test_split(
        X_sub, Y_sub, idxs, test_size=test_size, random_state=random_seed
    )

    # 3) Scale features & targets with statistics of the training split only,
    #    so nothing about the test set leaks into preprocessing.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)
    Y_mean, Y_std = y_train.mean(), y_train.std()
    y_train_s = (y_train - Y_mean) / Y_std

    # 4) Fit SVR
    svr = SVR(kernel='rbf', C=1e4, gamma=1e-3, epsilon=0.01)
    svr.fit(X_train_s, y_train_s)

    # 5) Inspect support vectors
    sv_indices = svr.support_                    # row positions in the train split
    alphas = np.abs(svr.dual_coef_).ravel()      # one magnitude per support vector
    order = np.argsort(-alphas)                  # largest magnitude first
    top_idx = sv_indices[order[:top_k]]
    top_alphas = alphas[order[:top_k]]

    print(f"\n=== {feature_name} features ===")
    print(f"Top {top_k} support-vector indices (in the TRAIN set):", top_idx)
    print("Their dual-coeff magnitudes:           ", top_alphas)
    # Train-split positions are only meaningful for this exact subsample and
    # split; the full-dataset row ids identify the same molecules anywhere.
    print("Same support vectors as rows of the full dataset:", idx_train[top_idx])

    # 6) Final evaluation (y_test is unscaled, so only predictions are inverted)
    y_pred = svr.predict(X_test_s) * Y_std + Y_mean
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"MAE = {mae:.3f}   R² = {r2:.3f}")


if __name__ == "__main__":
    # Run the inspection once per representation: flat 529-D first,
    # then the 23-D eigenspectrum.
    for loader_fn, label in ((load_flat, '529D'), (load_eig, '23D')):
        inspect_svr_support(loader_fn, label)


In [None]:
# “best” support-vector indices for each representation

import numpy as np
import scipy.io
from scipy.linalg import eigh
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Ranked sample ids associated with the flat 529-D representation;
# main_classical_svr_scaled_demo takes the first `subset_size` entries of the
# selected list, in this order.
# Presumably pasted from the "Top 300 support-vector indices" output of
# inspect_svr_support — TODO confirm.
# NOTE(review): that output numbers rows of the TRAIN split, while
# main_classical_svr_scaled_demo uses these ids to index the full dataset
# arrays; verify that the two numbering schemes are meant to coincide.
BEST_IDS_529 = [
    193, 19, 94, 158, 370, 0, 148, 281, 218, 214,
    52, 87, 186, 35, 337, 244, 267, 1, 342, 79,
    232, 183, 364, 348, 139, 369, 64, 188, 338, 304,
    128, 11, 172, 92, 393, 97, 149, 386, 350, 222,
    398, 181, 213, 164, 41, 143, 67, 271, 210, 202,
    275, 372, 141, 258, 247, 105, 211, 23, 130, 166,
    86, 124, 171, 132, 47, 208, 104, 243, 324, 399,
    336, 159, 51, 365, 46, 305, 390, 257, 196, 119,
    233, 12, 242, 274, 144, 106, 394, 215, 341, 366,
    382, 251, 250, 288, 272, 317, 8, 209, 63, 137,
    295, 111, 110, 53, 238, 54, 69, 253, 45, 109,
    167, 206, 195, 173, 17, 343, 371, 216, 322, 40,
    121, 308, 355, 312, 20, 198, 353, 294, 36, 321,
    160, 204, 284, 282, 248, 161, 225, 234, 347, 15,
    201, 279, 185, 140, 7, 96, 200, 177, 65, 133,
    169, 254, 349, 120, 344, 117, 155, 95, 127, 175,
    346, 71, 178, 33, 162, 77, 352, 392, 179, 88,
    240, 184, 118, 345, 311, 81, 262, 91, 359, 306,
    190, 291, 18, 276, 231, 326, 269, 24, 199, 31,
    189, 146, 174, 135, 157, 145, 287, 325, 70, 389,
    59, 300, 101, 285, 122, 277, 339, 21, 273, 358,
    333, 100, 226, 180, 299, 237, 142, 297, 309, 80,
    14, 10, 116, 5, 363, 203, 114, 72, 197, 212,
    377, 256, 89, 223, 307, 289, 4, 50, 316, 43,
    235, 27, 268, 42, 56, 129, 375, 255, 266, 246,
    303, 22, 85, 385, 354, 82, 278, 368, 29, 138,
    25, 361, 383, 207, 310, 236, 13, 150, 192, 290,
    388, 75, 245, 315, 125, 263, 313, 397, 49, 351,
    153, 239, 228, 296, 194, 379, 205, 230, 2, 298,
    131, 113, 261, 314, 32, 384, 252, 37, 98, 84
]

# Ranked sample ids associated with the 23-D eigenvalue representation;
# main_classical_svr_scaled_demo takes the first `subset_size` entries of the
# selected list, in this order.
# Presumably pasted from the "Top 300 support-vector indices" output of
# inspect_svr_support for the '23D' run — TODO confirm.
# NOTE(review): that output numbers rows of the TRAIN split, while
# main_classical_svr_scaled_demo uses these ids to index the full dataset
# arrays; verify that the two numbering schemes are meant to coincide.
BEST_IDS_23 = [
    305, 286, 145, 285, 284, 148, 281, 280, 279, 152,
    143, 277, 276, 157, 158, 275, 161, 274, 273, 270,
    166, 154, 269, 287, 139, 112, 309, 306, 115, 117,
    121, 122, 301, 125, 140, 299, 297, 129, 130, 131,
    292, 290, 135, 137, 138, 298, 268, 169, 263, 249,
    204, 246, 206, 207, 245, 244, 243, 213, 202, 214,
    241, 217, 239, 235, 221, 232, 231, 224, 230, 242,
    251, 252, 198, 262, 173, 175, 176, 177, 178, 179,
    180, 261, 260, 259, 185, 186, 187, 188, 189, 257,
    193, 194, 196, 256, 111, 311, 227, 107, 38, 39,
    40, 41, 361, 45, 48, 49, 366, 50, 52, 109,
    54, 56, 354, 61, 352, 351, 51, 64, 368, 373,
    397, 5, 6, 390, 9, 384, 383, 15, 29, 382,
    379, 21, 377, 375, 374, 25, 26, 27, 18, 350,
    53, 349, 85, 87, 88, 66, 91, 92, 93, 95,
    96, 325, 321, 101, 103, 105, 106, 331, 83, 89,
    334, 348, 345, 82, 70, 74, 344, 341, 71, 79,
    336, 303, 102, 34, 132, 46, 191, 160, 389, 392,
    86, 199, 391, 313, 182, 174, 155, 381, 209, 362,
    304, 14, 386, 317, 31, 226, 367, 72, 358, 378,
    219, 73, 0, 393, 104, 201, 267, 7, 237, 2,
    387, 134, 114, 324, 37, 222, 81, 288, 123, 310,
    372, 212, 1, 65, 332, 225, 156, 228, 17, 172,
    283, 167, 124, 62, 80, 380, 144, 162, 234, 320,
    216, 371, 388, 147, 338, 218, 238, 12, 20, 67,
    370, 150, 343, 210, 159, 319, 340, 247, 183, 43,
    363, 240, 289, 385, 127, 314, 168, 295, 24, 335,
    369, 220, 116, 236, 328, 339, 133, 398, 399, 357,
    100, 4, 322, 315, 360, 163, 23, 356, 265, 266,
    192, 396, 13, 151, 33, 16, 253, 84, 141, 171
]



def main_classical_svr_scaled_demo(
    matfile='qm7.mat',
    subset_size=300,
    test_size=0.2,
    random_seed=42,
    use_eigenvalues=True
):
    """
    Train and evaluate an RBF-kernel SVR on samples picked via BEST_IDS.

    Args:
      matfile:         path to the QM7 .mat file ('X' matrices, 'T' targets).
      subset_size:     number of leading BEST_IDS entries to use.
      test_size:       forwarded to `train_test_split`.
      random_seed:     random_state of the train/test split.
      use_eigenvalues: True  -> 23-D features (eigenvalues of each symmetrized
                                matrix, descending) selected via BEST_IDS_23;
                       False -> flat 529-D features selected via BEST_IDS_529.
                       The default is True because the eigenvalue section was
                       always executed before this switch existed, even though
                       its comment described it as disabled.

    Returns:
      None. Results are printed.

    Raises:
      ValueError: if `subset_size` exceeds the length of the selected list.
    """
    # 1) Load QM7
    data = scipy.io.loadmat(matfile)
    C_all = data['X']             # expected shape (N,23,23)
    Y_all = data['T'].ravel()     # shape (N,)

    if use_eigenvalues:
        # 1B) EIGENVALUE 23-D MODE
        eigs = []
        for M in C_all:
            M_sym = 0.5*(M + M.T)
            vals = eigh(M_sym, eigvals_only=True)   # ascending order
            eigs.append(vals[::-1])                 # descending by signed value
        X_all = np.array(eigs)                      # shape (N,23)
        BEST_IDS = BEST_IDS_23
    else:
        # 1A) FLAT 529-D MODE
        X_all = np.stack([M.flatten() for M in C_all], axis=0)
        BEST_IDS = BEST_IDS_529

    N = len(X_all)
    print(f"Loaded {N} molecules; feature-dim = {X_all.shape[1]}.")

    # 2) SUBSAMPLE via BEST_IDS
    if subset_size > len(BEST_IDS):
        raise ValueError(f"subset_size={subset_size} exceeds BEST_IDS length={len(BEST_IDS)}")
    # NOTE(review): the ids are applied to the full dataset arrays here; if
    # they were recorded as positions inside a training split, they do not
    # refer to the same molecules — confirm where BEST_IDS came from.
    idxs = BEST_IDS[:subset_size]
    X_sub, Y_sub = X_all[idxs], Y_all[idxs]
    print(f"Using top {subset_size} samples from BEST_IDS.")

    # 3) SPLIT (before scaling, so that no test-set statistics leak into the
    #    preprocessing)
    X_train, X_test, y_train, y_test = train_test_split(
        X_sub, Y_sub, test_size=test_size, random_state=random_seed
    )
    print(f"Train size = {len(X_train)}, Test size = {len(X_test)}")

    # 4) SCALE FEATURES & TARGET using training statistics only
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    Y_mean, Y_std = y_train.mean(), y_train.std()
    y_train_s = (y_train - Y_mean) / Y_std

    # 5) FIT & PREDICT
    svr = SVR(kernel='rbf', C=1e4, gamma=1e-3, epsilon=0.01)
    svr.fit(X_train_s, y_train_s)

    # Predictions come back in scaled units; y_test was never scaled.
    y_pred = svr.predict(X_test_s) * Y_std + Y_mean
    y_true = y_test

    # 6) EVALUATE
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print("\nClassical SVR (RBF) results:")
    print(f"  Subset size: {subset_size}")
    print(f"  MAE = {mae:.3f}")
    print(f"  R²  = {r2:.3f}")


if __name__ == "__main__":
    import os

    # The original absolute path only exists on one machine. Keep it as the
    # first choice, but fall back to the relative 'qm7.mat' (the default used
    # elsewhere in this file) when it is not present.
    qm7_path = '/Users/franogurlic/Desktop/qm7.mat'
    if not os.path.exists(qm7_path):
        qm7_path = 'qm7.mat'

    main_classical_svr_scaled_demo(
        matfile=qm7_path,
        subset_size=300, #works up to 300
        test_size=0.2,
        random_seed=42
    )
