# In [6]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import numpy as np
import scipy.io
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler


# 1) Load QM7, Flatten 23x23 Coulomb Matrices => 529 features


def load_qm7_coulomb_flat(matfile='qm7.mat'):
    """
    Load the QM7 dataset from `matfile` and flatten its Coulomb matrices.

    Reads the 'X' entry (one matrix per molecule) and the 'T' entry
    (target energies) of the MATLAB file. Every matrix is flattened in
    row-major order, so a 23x23 matrix becomes 529 features.

    Args:
      matfile: path of the .mat file, handed to scipy.io.loadmat.

    Returns:
      X_flat: (N, 529) array of flattened Coulomb matrices
      Y:      (N,) array of target energies

    Raises:
      KeyError: if the file has no 'X' or 'T' entry.
    """
    data = scipy.io.loadmat(matfile)
    C_matrices = np.asarray(data['X'])        # expected shape: (N, 23, 23)
    energies = np.asarray(data['T']).ravel()  # shape: (N,)

    # Flatten all matrices with one reshape instead of a Python-level loop.
    # The feature count is spelled out (rather than -1) so that an empty
    # dataset still yields a well-formed (0, 529) array.
    N = C_matrices.shape[0]
    n_features = int(np.prod(C_matrices.shape[1:]))
    X_flat = C_matrices.reshape(N, n_features)
    return X_flat, energies


# 2) Classical SVR with Feature + Target Scaling


def _coulomb_eigenvalues(X_flat):
    """Return the 23 ascending eigenvalues of each flattened 23x23 matrix."""
    M = X_flat.reshape(-1, 23, 23)               # back to 23x23 per molecule
    M = 0.5 * (M + np.transpose(M, (0, 2, 1)))   # symmetrize each matrix
    return np.linalg.eigvalsh(M)                 # batched; shape (N, 23)


def main_classical_svr_scaled_coulomb_matrix_demo(
    matfile='qm7.mat',
    subset_size=500,
    test_size=0.2,
    random_seed=42,
    use_eigenvalues=False
):
    """
    Train and evaluate a classical RBF-kernel SVR on Coulomb-matrix features.

    1) Loads dataset, flattens each 23x23 => 529 features
       (optionally replaced by the 23 eigenvalues of each matrix)
    2) Subsamples
    3) Train/test split
    4) Scales features with StandardScaler (fit on the training part only)
    5) Scales targets (subtract mean, divide std of the training targets)
    6) Fits a classical SVR (RBF)
    7) Inverts target transform, measures final MAE, R^2

    The split happens BEFORE any scaling so that no statistic of the test
    samples leaks into the model.

    Args:
      matfile:         path of the QM7 .mat file.
      subset_size:     number of molecules to sample without replacement;
                       clamped to the number of molecules available.
      test_size:       forwarded to sklearn's train_test_split.
      random_seed:     seed for both the subsampling and the split.
      use_eigenvalues: if True, use the 23 eigenvalues of each symmetrized
                       23x23 matrix as features instead of the 529 raw entries.

    Returns:
      (mae, r2): test-set mean absolute error and R^2, in original target units.
    """
    # 1) Load data
    X_all, Y_all = load_qm7_coulomb_flat(matfile)
    Ntotal = len(X_all)
    print(f"Loaded {Ntotal} molecules with flattened 23x23 => 529 features.")

    # 1b) (Optional) switch to 23-dimensional eigenvalue features
    if use_eigenvalues:
        print("Computing 23 eigenvalues for each matrix...")
        X_all = _coulomb_eigenvalues(X_all)      # shape (Ntotal, 23)
        print("Converted to 23-dimensional eigenvalue features.")

    # 2) Subsample
    # Sampling without replacement cannot draw more rows than exist.
    subset_size = min(subset_size, Ntotal)
    # A private RandomState draws from the same legacy generator as
    # np.random.seed + np.random.choice, without touching global RNG state.
    rng = np.random.RandomState(random_seed)
    idxs = rng.choice(Ntotal, size=subset_size, replace=False)
    X_sub = X_all[idxs]
    Y_sub = Y_all[idxs]

    # 3) Train/test split (on raw data, before fitting any scaler)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_sub, Y_sub,
        test_size=test_size,
        random_state=random_seed
    )
    print(f"Train size={X_train.shape[0]}, Test size={X_test.shape[0]}")

    # 4) Scale features: statistics come from the training rows only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 5) Scale targets with training statistics
    Y_mean = np.mean(Y_train)
    Y_std = np.std(Y_train)
    if Y_std == 0:
        # Constant targets: skip the division instead of producing NaN/inf.
        Y_std = 1.0
    Y_train_scaled = (Y_train - Y_mean) / Y_std

    # 6) Fit classical SVR with RBF kernel
    # NOTE(review): epsilon is expressed in scaled-target units, so 1.0 is a
    # tube of one training standard deviation -- confirm this is intended.
    svr = SVR(kernel='rbf', C=100.0, gamma='scale', epsilon=1.0)
    svr.fit(X_train_scaled, Y_train_scaled)

    # Predict (in scaled space)
    Y_pred_scaled = svr.predict(X_test_scaled)

    # 7) Invert target transform; Y_test was never scaled
    Y_pred = Y_pred_scaled * Y_std + Y_mean

    # Evaluate
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    if use_eigenvalues:
        print("\nClassical SVR (RBF) on 23D Eigenvalues + Scaled Features/Targets:")
    else:
        print("\nClassical SVR (RBF) on 529D Coulomb + Scaled Features/Targets:")
    print(f"  Subset size: {subset_size}")
    print(f"  MAE = {mae:.3f}")
    print(f"  R^2 = {r2:.3f}")

    return mae, r2

if __name__ == "__main__":
    # Script entry point: run the SVR demo once with fixed settings.
    # subset_size=7165 is presumably the whole dataset -- TODO confirm.
    demo_settings = {
        'matfile': 'qm7.mat',
        'subset_size': 7165,
        'test_size': 0.2,
        'random_seed': 42,
    }
    main_classical_svr_scaled_coulomb_matrix_demo(**demo_settings)
