In [None]:
"""
This script performs binary classification using a Quantum Support Vector Machine (QSVM)
implemented with Qiskit. Unlike traditional Neural Networks, QSVM uses quantum kernels
to find optimal decision boundaries in a quantum feature space.

Key concepts for beginners:
- QSVM: Uses quantum computing to enhance classical Support Vector Machines
- Quantum Kernel: Maps classical data to quantum states and measures similarity
- Feature Map: Circuit that encodes classical data into quantum states
- No iterative training loop: once the kernel matrix is computed, QSVM is fitted in a single step (unlike neural networks, which are trained over many iterations)

The QSVM workflow:
1. Load and preprocess data (same as neural networks)
2. Create a quantum feature map (circuit that encodes data)
3. Create a quantum kernel (measures similarity between quantum states)
4. Train QSVM classifier (one-step process)
5. Evaluate and visualize results
"""

In [1]:
import os

from google.colab import drive

# Attach Google Drive so that its contents are reachable under /content/drive.
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive, so that the
# relative paths used later in the notebook resolve against it.
os.chdir('/content/drive/My Drive/2025_summer_QSVM/MuonSeedClassifier_QNN')

# Absolute path of the barrel pickle file (adjacent literals are concatenated
# by Python into a single string).
path = (
    '/content/drive/My Drive/2025_summer_QSVM/MuonSeedClassifier_QNN/'
    'DYToLL_PU200_Spring23_NThltIter2FromL1/'
    'DYToLL_PU200_Spring23_NThltIter2FromL1_Barrel.pkl'
)

Mounted at /content/drive


In [2]:
########################################################################################################
# Data IO part - Loading and preprocessing the muon seed classification data
########################################################################################################
import sys
import os
import pickle
import pandas as pd
import numpy as np

# Register the current working directory on sys.path (only once) so that
# project-local packages can be imported from this notebook.
project_root = os.path.abspath(os.getcwd())
if project_root not in sys.path:
    sys.path.append(project_root)
print(f"Project root: {project_root}")

# The shared preprocessing helpers live in BDT_model.HLTIO.preprocess.
# The import is deferred until just before setClassLabel is called below:
# from BDT_model.HLTIO import preprocess

# addDistHitL1Tk is redefined locally, with the fix applied, as a workaround:
# the version inside the imported module cannot be modified directly from here.
def addDistHitL1Tk(df, addAbsDist=False):
    """
    Add squared hit-to-L1-track distance columns to ``df`` (modified in place).

    For each hit index ``i`` in 1..3 a column ``d2hitl1tk{i}`` is written as
    ``(l1x{i}-hitx{i})**2 + (l1y{i}-hity{i})**2 + (l1z{i}-hitz{i})**2``.
    The problematic multiplication by ``(hitx{i}+99999.)/(hitx{i}+99999.)``
    has been removed from this local version.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``l1x{i}``, ``l1y{i}``, ``l1z{i}``,
        ``hitx{i}``, ``hity{i}`` and ``hitz{i}`` for i = 1, 2, 3.
    addAbsDist : bool, default False
        When True, additionally write ``absexpd2hitl1tk{i}`` as the square
        root of the *existing* ``expd2hitl1tk{i}`` column. Those input columns
        are not created by this function and must already be present.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns added.

    Raises
    ------
    KeyError
        If ``addAbsDist`` is True and any ``expd2hitl1tk{i}`` column is absent.
        The check runs before any column is written, so ``df`` is left
        untouched in that case.
    """
    hit_ids = range(1, 4)  # hit1, hit2, hit3

    if addAbsDist:
        # Validate up front so a missing input is reported with a clear message
        # instead of failing half-way through with a partially updated frame.
        abs_inputs = [f'expd2hitl1tk{i}' for i in hit_ids]
        absent = [col for col in abs_inputs if col not in df.columns]
        if absent:
            raise KeyError(f"addAbsDist=True requires columns missing from df: {absent}")

    for i in hit_ids:
        exprd2 = f'd2hitl1tk{i} = (l1x{i}-hitx{i})**2 + (l1y{i}-hity{i})**2 + (l1z{i}-hitz{i})**2'
        # No explicit engine: pandas picks numexpr when it is installed and
        # falls back to the pure-python engine otherwise, so the helper no
        # longer hard-fails in environments without numexpr.
        df.eval(exprd2, inplace=True)
        # NaN results are left as-is; the fillna(-99999.) step stays disabled:
        # df[f'd2hitl1tk{i}'] = df[f'd2hitl1tk{i}'].fillna(-99999.)

    if addAbsDist:
        for i in hit_ids:
            df[f'absexpd2hitl1tk{i}'] = np.sqrt(df[f'expd2hitl1tk{i}'])

    return df


# Banner announcing the start of the pipeline.
print("\n".join([
    "=" * 80,
    "QUANTUM SUPPORT VECTOR MACHINE (QSVM) FOR MUON SEED CLASSIFICATION",
    "=" * 80,
    "Loading and preprocessing data...",
]))

# Location of the pickled muon data, relative to the current working directory.
pkl_path = "./DYToLL_PU200_Spring23_NThltIter2FromL1/DYToLL_PU200_Spring23_NThltIter2FromL1_Barrel.pkl"

# Deserialize the file contents.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
with open(pkl_path, mode="rb") as pkl_file:
    data = pickle.load(pkl_file)

# The DataFrame is stored as the first element of the unpickled object.
df = data[0]

# ---- Data quality cuts ----
print("Applying data quality cuts...")
# Keep only rows whose gen_pt lies strictly between 0 and 1e9; this drops
# zero/negative values as well as extremely large ones in a single selection.
df = df[(df['gen_pt'] > 0) & (df['gen_pt'] < 1e9)]

# ---- Class labels ----
# setClassLabel comes from the project's preprocessing module and is used here
# to attach the binary class label (0 = background, 1 = signal). It is assumed
# to work as-is; otherwise it would have to be redefined locally as well.
from BDT_model.HLTIO import preprocess
df = preprocess.setClassLabel(df)

# ---- Hit / L1-track distance features ----
# Calls the addDistHitL1Tk defined in this notebook, not the module's version.
df = addDistHitL1Tk(df, addAbsDist=False)

# ---- Barrel selection ----
# Restrict to |tsos_eta| < 1.2 and take an independent copy of the result.
df = df[df['tsos_eta'].abs() < 1.2].copy()

# Columns requested for the QSVM study: eleven candidate inputs followed by
# the label column. Descriptions follow the notebook's original annotations.
required_columns = [
    # "Expected distance to hit" quantities for L1 tracks 1-3
    *(f"expd2hitl1tk{k}" for k in (1, 2, 3)),
    # Separation between the L1 track and the muon seed
    "dR_L1TkMuSeedP",    # Delta R
    "dPhi_L1TkMuSeedP",  # Delta phi
    # Track state parameters
    "tsos_qbp",          # q/p (charge/momentum)
    "tsos_dydz",         # dy/dz slope
    "tsos_dxdz",         # dx/dz slope
    # Track state error parameters 0, 2 and 5
    *(f"tsos_err{k}" for k in (0, 2, 5)),
    # Target: binary class label (0=background, 1=signal)
    "y_label",
]

# Report any requested column that the DataFrame does not provide.
missing = [name for name in required_columns if name not in df.columns]
if len(missing) > 0:
    print("Warning: The following required columns are missing:", missing)

# The final selection also includes the d2hitl1tk1..3 distance columns
# computed earlier in this cell.
distance_columns = ["d2hitl1tk" + str(i) for i in (1, 2, 3)]
all_required_columns = [*required_columns, *distance_columns]

# Keep only the names actually present in df; warn about whatever is left out
# so the selection below does not fail on an absent column.
existing_required_columns = [name for name in all_required_columns if name in df.columns]
if len(existing_required_columns) < len(all_required_columns):
    missing_after_creation = [name for name in all_required_columns if name not in df.columns]
    print(f"Warning: The following columns are still missing after attempting to create them: {missing_after_creation}")


# Independent copy of the selected columns, with every missing value replaced by -1.
df_final = df[existing_required_columns].copy().fillna(-1.)


# Summarize the processed table: its dimensions followed by a short preview.
print(f"Dataset shape: {df_final.shape}")
print("First few rows of processed data:", df_final.head(), sep="\n")

# Number of rows per class label in the full (unsampled) dataset.
print("\nClass distribution in full dataset:", df_final["y_label"].value_counts(), sep="\n")

Project root: /content/drive/My Drive/2025_summer_QSVM/MuonSeedClassifier_QNN
QUANTUM SUPPORT VECTOR MACHINE (QSVM) FOR MUON SEED CLASSIFICATION
Loading and preprocessing data...
Applying data quality cuts...
Dataset shape: (92114, 12)
First few rows of processed data:
   dR_L1TkMuSeedP  dPhi_L1TkMuSeedP  tsos_qbp  tsos_dydz  tsos_dxdz  \
0        0.004175         -0.003510 -0.023912  -0.335638  -0.038410   
1        0.005500          0.004065  0.025677  -0.411500   0.036356   
2        0.027850         -0.027783 -0.109400   1.075664   0.080918   
3        0.008985         -0.008914 -0.055924   0.417587   0.080094   
4        0.001913          0.001653  0.009154   0.334038  -0.009316   

   tsos_err0     tsos_err2     tsos_err5  y_label  d2hitl1tk1  d2hitl1tk2  \
0   0.000043  6.029049e-08  1.060757e-07      1.0    0.009054    0.338021   
1   0.000068  6.829735e-08  7.030542e-08      1.0    0.372055    0.268714   
2   0.000016  1.170655e-07  2.279720e-07      1.0    0.600656    0.73243

In [None]:
########################################################################################################
# Data sampling and preprocessing for QSVM
########################################################################################################
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

print("\n" + "\n".join(["=" * 60, "DATA SAMPLING AND PREPROCESSING", "=" * 60]))

# Work on a small random subset: QSVM can be computationally intensive, so the
# tutorial starts from a manageable number of points.
sample_size = 100
print(f"Randomly sampling {sample_size} data points for QSVM training...")

# Draw distinct index labels with a fixed seed so the same subset is chosen on
# every run, then pull the matching rows.
np.random.seed(42)
random_indices = np.random.choice(df_final.index, size=sample_size, replace=False)
df_sampled = df_final.loc[random_indices]

# Show how the two classes are represented in the subset.
print("Class distribution in sampled data:", df_sampled["y_label"].value_counts(), sep="\n")

# Labels (y) as int32 - QSVM expects integer labels - and every remaining
# column as the float32 feature matrix (X).
y = df_sampled["y_label"].values.astype(np.int32)
X = df_sampled.drop(columns=["y_label"]).values.astype(np.float32)

print(f"Feature matrix shape: {X.shape}")
print(f"Label vector shape: {y.shape}")

# Hold out 20% of the sample for testing. Stratifying on y keeps the class
# proportions similar in both parts; the fixed random_state makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Training set size: {len(X_train)}", f"Test set size: {len(X_test)}", sep="\n")

# Standardize features (important for quantum algorithms).
# The scaler learns a per-feature mean and scale from the training data only,
# so that no information from the test set leaks into the preprocessing.
print("\nStandardizing features...")
scaler = StandardScaler()
scaler.fit(X_train)

# Persist the scaling parameters for future use.
# exist_ok=True creates the directory when needed and is a no-op when it is
# already there, avoiding the race between a separate existence check and the
# creation call.
scalefiles_dir = "scalefiles"
os.makedirs(scalefiles_dir, exist_ok=True)

# The file holds two lines: the list of feature means, then the list of
# feature scales, each written as a Python list literal.
scale_filepath = os.path.join(scalefiles_dir, "barrel_qsvm_scale.txt")
with open(scale_filepath, "w") as f_scale:
    f_scale.write("%s\n" % str(scaler.mean_.tolist()))
    f_scale.write("%s\n" % str(scaler.scale_.tolist()))
print(f"Scaling parameters saved to: {scale_filepath}")

# Transform the training and the test features with the already-fitted scaler
# (training set first, then test set).
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

# Per-class sample counts in each split, as a final sanity check.
print("\nFinal class distributions:")
for split_name, split_labels in (("Training set", y_train), ("Test set", y_test)):
    print(f"{split_name}: {np.bincount(split_labels)}")



DATA SAMPLING AND PREPROCESSING
Randomly sampling 100 data points for QSVM training...
Class distribution in sampled data:
y_label
0.0    55
1.0    45
Name: count, dtype: int64
Feature matrix shape: (100, 11)
Label vector shape: (100,)
Training set size: 80
Test set size: 20

Standardizing features...
Scaling parameters saved to: scalefiles/barrel_qsvm_scale.txt

Final class distributions:
Training set: [44 36]
Test set: [11  9]


In [None]:
########################################################################################################
# QSVM Model Definition using Qiskit
########################################################################################################
print("\n" + "="*60)
print("QUANTUM SVM MODEL SETUP")
print("="*60)

# Import required Qiskit and Qiskit Machine Learning modules
!pip install qiskit_machine_learning
from qiskit.circuit.library import ZZFeatureMap
from qiskit.primitives import StatevectorSampler as Sampler
from qiskit_machine_learning.state_fidelities import ComputeUncompute
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit_machine_learning.algorithms import QSVC

# Start implementing your QSVM model here




QUANTUM SVM MODEL SETUP
Collecting qiskit_machine_learning
  Downloading qiskit_machine_learning-0.8.3-py3-none-any.whl.metadata (13 kB)
Collecting qiskit<2.0,>=1.0 (from qiskit_machine_learning)
  Downloading qiskit-1.4.3-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting rustworkx>=0.15.0 (from qiskit<2.0,>=1.0->qiskit_machine_learning)
  Downloading rustworkx-0.16.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting stevedore>=3.0.0 (from qiskit<2.0,>=1.0->qiskit_machine_learning)
  Downloading stevedore-5.4.1-py3-none-any.whl.metadata (2.3 kB)
Collecting symengine<0.14,>=0.11 (from qiskit<2.0,>=1.0->qiskit_machine_learning)
  Downloading symengine-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting pbr>=2.0.0 (from stevedore>=3.0.0->qiskit<2.0,>=1.0->qiskit_machine_learning)
  Downloading pbr-6.1.1-py2.py3-none-any.whl.metadata (3.4 kB)
Downloading qiskit_machine_learning