# Classical SVM Exploration - BI2 Project

This notebook provides an interactive environment for exploring the classical SVM implementation.

**Objectives:**
1. Load and explore the German Credit Risk dataset
2. Understand the preprocessing pipeline
3. Train and evaluate classical SVM models
4. Compare different kernels and hyperparameters
5. Visualize results

In [None]:
# Setup
# Standard-library imports are grouped first so every dependency of this cell
# is visible at the top.
import sys
import warnings
from pathlib import Path

# Make the project's `src` directory importable. Note that this is
# <current working directory>/../src, not the parent directory itself; the
# later cells import data_loader, preprocessing and classical_svm by bare name.
sys.path.insert(0, str(Path.cwd().parent / 'src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Suppress warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, including any a library might raise about model fitting. Consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

print("✅ Setup complete")

## 1. Data Loading

Load the German Credit Risk dataset from OpenML.

In [None]:
# `data_loader` is a project module; it is expected to resolve through the
# `src` path entry added in the setup cell.
from data_loader import CreditDataLoader

# Load data
# load_from_openml() returns a pair that is unpacked into the feature table X
# and the target y. `loader` is kept because the next cell calls
# loader.get_data_summary().
loader = CreditDataLoader()
X, y = loader.load_from_openml()

# Report dimensions and class balance. The .shape / .value_counts() calls
# assume pandas objects — TODO confirm against CreditDataLoader.
print(f"\nDataset shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nTarget distribution:\n{y.value_counts()}")

In [None]:
# Explore the data
# Preview of the raw feature table.
print("First few rows:")
display(X.head())

# Column dtypes, printed as plain text.
print("\nData types:")
print(X.dtypes)

# Loader-provided summary; the 'feature_names' entry is skipped and every
# other entry is printed as "key: value".
print("\nData summary:")
summary = loader.get_data_summary()
for key, value in summary.items():
    if key == 'feature_names':
        continue
    print(f"{key}: {value}")

## 2. Data Preprocessing

Apply the complete preprocessing pipeline:
- Missing value handling
- Categorical encoding
- Scaling
- PCA dimensionality reduction

In [None]:
# `preprocessing` is a project module, expected to resolve through the `src`
# path entry added in the setup cell.
from preprocessing import CreditDataPreprocessor

# Initialize preprocessor with 4 components (for 4-qubit QSVM later)
# `preprocessor` is reused later: the PCA plots read preprocessor.pca and the
# final cell saves it.
preprocessor = CreditDataPreprocessor(n_components=4)

# Run preprocessing pipeline
# The four return values are unpacked as train/test features and labels.
# NOTE(review): the split ratio and any random seed live inside
# CreditDataPreprocessor and are not visible here — confirm the split is
# seeded so Restart & Run All reproduces the same numbers.
X_train, X_test, y_train, y_test = preprocessor.preprocess_data(X, y)

In [None]:
# Visualize PCA results
# Per-component and running-total explained variance from the fitted PCA.
explained_var = preprocessor.pca.explained_variance_ratio_
cumulative_var = np.cumsum(explained_var)

# 1-based component numbers, shared by both panels (both arrays have the
# same length, so one range serves as x-values and tick positions).
component_ids = range(1, len(explained_var) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: explained variance per component
ax1.bar(component_ids, explained_var, alpha=0.7, color='steelblue')
ax1.set(
    xlabel='Principal Component',
    ylabel='Explained Variance Ratio',
    title='Explained Variance by Component',
    xticks=component_ids,
)

# Right panel: cumulative explained variance with an 80% reference line
ax2.plot(component_ids, cumulative_var, marker='o', linewidth=2, color='steelblue')
ax2.axhline(y=0.8, color='r', linestyle='--', alpha=0.5, label='80% threshold')
ax2.set(
    xlabel='Number of Components',
    ylabel='Cumulative Explained Variance',
    title='Cumulative Explained Variance',
    xticks=component_ids,
)
ax2.legend()
ax2.grid(alpha=0.3)

fig.tight_layout()
plt.show()

print(f"Total explained variance with {len(explained_var)} components: {cumulative_var[-1]:.4f}")

In [None]:
# Visualize data in reduced space (first 2 PCs)
# Explicit figure/axes handles are used instead of the implicit pyplot
# "current figure"; the drawn result is the same.
fig, ax = plt.subplots(figsize=(10, 6))

# Points are coloured by their training label; axis labels carry each
# component's share of explained variance (from the previous cell).
scatter = ax.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap='RdYlGn',
    alpha=0.6,
    edgecolors='k',
    s=50,
)
ax.set(
    xlabel=f'PC1 ({explained_var[0]:.2%} variance)',
    ylabel=f'PC2 ({explained_var[1]:.2%} variance)',
    title='Training Data in PCA Space (First 2 Components)',
)
fig.colorbar(scatter, label='Credit Quality (0=Bad, 1=Good)')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Classical SVM Training

Train a classical SVM with an RBF kernel.

In [None]:
# `classical_svm` is a project module, expected to resolve through the `src`
# path entry added in the setup cell.
from classical_svm import ClassicalSVM

# Initialize and train
# Baseline model: RBF kernel with C=1.0. `svm` is reused by the report and
# plotting cells below and is the model written to disk in the final cell.
svm = ClassicalSVM(kernel='rbf', C=1.0)
svm.train(X_train, y_train)

# Evaluate
# Later cells unpack evaluate() results with ** and select 'accuracy',
# 'precision', 'recall' and 'f1_score', so the return value is expected to be
# a mapping containing at least those keys.
metrics = svm.evaluate(X_test, y_test)

In [None]:
# Generate detailed report
# The return value is stored in `report` but is not read by any other cell
# shown in this notebook.
# NOTE(review): presumably generate_classification_report also prints the
# report as a side effect — confirm against ClassicalSVM.
report = svm.generate_classification_report(X_test, y_test)

## 4. Visualizations

In [None]:
# Confusion Matrix
# Drawn for the baseline `svm` on the held-out test split.
# NOTE(review): save_path=None presumably means "show only, do not write a
# file" — confirm against ClassicalSVM.plot_confusion_matrix.
svm.plot_confusion_matrix(X_test, y_test, save_path=None)

In [None]:
# ROC Curve
# Drawn for the baseline `svm` on the held-out test split.
# NOTE(review): save_path=None presumably means "show only, do not write a
# file" — confirm against ClassicalSVM.plot_roc_curve.
svm.plot_roc_curve(X_test, y_test, save_path=None)

## 5. Kernel Comparison

Compare different SVM kernels to find the best-performing one.

In [None]:
# `compare_kernels` comes from the same project module as ClassicalSVM.
from classical_svm import compare_kernels

# Compare kernels
# Note the argument order: both feature splits first, then both label splits.
# The plotting cell that follows reads the columns 'kernel', 'accuracy',
# 'precision', 'recall' and 'f1_score' from the returned frame.
comparison_df = compare_kernels(X_train, X_test, y_train, y_test)

# Display results
display(comparison_df)

In [None]:
# Visualize kernel comparison
# 2x2 grid: one bar chart per metric, one bar per kernel.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

metrics_to_plot = ['accuracy', 'precision', 'recall', 'f1_score']
colors = ['steelblue', 'coral', 'lightgreen', 'plum']

for ax, metric, color in zip(axes.flat, metrics_to_plot, colors):
    bars = ax.bar(
        comparison_df['kernel'],
        comparison_df[metric],
        color=color,
        alpha=0.7,
        edgecolor='black',
    )
    # 'f1_score' -> 'F1 Score' etc.; all metrics share a fixed 0..1 y-range
    # so the panels are visually comparable.
    ax.set(
        xlabel='Kernel',
        ylabel=metric.replace('_', ' ').title(),
        ylim=[0, 1],
    )
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above its top edge, centred on the bar.
    for rect in bars:
        value = rect.get_height()
        x_center = rect.get_x() + rect.get_width()/2.
        ax.text(x_center, value, f'{value:.3f}', ha='center', va='bottom', fontsize=9)

fig.suptitle('Kernel Performance Comparison', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.show()

## 6. Hyperparameter Impact

Explore how different hyperparameters affect performance.

In [None]:
# Test different C values
# Trains one RBF-kernel model per C on the same train/test split and collects
# the evaluation results, one row per C, in df_C.
C_values = [0.1, 1.0, 10.0, 100.0]
results = []

for C in C_values:
    print(f"\nTesting C={C}...")
    svm_temp = ClassicalSVM(kernel='rbf', C=C)
    svm_temp.train(X_train, y_train)
    # Use a sweep-specific name: assigning to `metrics` here would overwrite
    # the baseline model's evaluation stored under that name in section 3.
    sweep_metrics = svm_temp.evaluate(X_test, y_test)
    # evaluate() is expected to return a mapping; its entries become columns.
    results.append({'C': C, **sweep_metrics})

df_C = pd.DataFrame(results)
display(df_C[['C', 'accuracy', 'precision', 'recall', 'f1_score']])

In [None]:
# Visualize C parameter impact
fig, ax = plt.subplots(figsize=(10, 6))

# (df_C column, marker, legend label) for each curve, in drawing order.
series_specs = [
    ('accuracy', 'o', 'Accuracy'),
    ('precision', 's', 'Precision'),
    ('recall', '^', 'Recall'),
    ('f1_score', 'D', 'F1-Score'),
]
for column, marker, label in series_specs:
    ax.plot(df_C['C'], df_C[column], marker=marker, label=label, linewidth=2)

# C values span several orders of magnitude, hence the logarithmic x-axis.
ax.set_xscale('log')
ax.set_xlabel('C (Regularization Parameter)', fontsize=12)
ax.set_ylabel('Score', fontsize=12)
ax.set_title('Impact of C Parameter on Model Performance', fontsize=14, fontweight='bold')
ax.legend(loc='best')
ax.grid(alpha=0.3)
fig.tight_layout()
plt.show()

## 7. Feature Space Analysis

Understand how PCA component count affects performance.

In [None]:
# Test different numbers of components
# For each component count the raw data is re-preprocessed from scratch, an
# RBF-kernel model is trained and evaluated, and one row is collected in df_pca.
component_counts = [2, 4, 6, 8, 10]
pca_results = []

for n_comp in component_counts:
    print(f"\nTesting {n_comp} components...")

    # Preprocess with different component count
    # Separate names (prep_temp, X_tr, ...) keep the 4-component
    # `preprocessor` and its X_train/X_test split untouched.
    prep_temp = CreditDataPreprocessor(n_components=n_comp)
    X_tr, X_te, y_tr, y_te = prep_temp.preprocess_data(X, y)

    # Train and evaluate
    # NOTE(review): no C is passed, so this relies on ClassicalSVM's default —
    # confirm it matches the C=1.0 baseline used in section 3.
    svm_temp = ClassicalSVM(kernel='rbf')
    svm_temp.train(X_tr, y_tr)
    # Use a sweep-specific name: assigning to `metrics` here would overwrite
    # the baseline model's evaluation stored under that name in section 3.
    pca_metrics = svm_temp.evaluate(X_te, y_te)

    pca_results.append({
        'n_components': n_comp,
        'explained_variance': prep_temp.pca.explained_variance_ratio_.sum(),
        **pca_metrics
    })

df_pca = pd.DataFrame(pca_results)
# NOTE(review): the 'training_time' column must come from evaluate()'s result;
# confirm ClassicalSVM.evaluate includes it, otherwise this selection raises
# KeyError.
display(df_pca[['n_components', 'explained_variance', 'accuracy', 'f1_score', 'training_time']])

In [None]:
# Visualize PCA component count impact
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
n_comps = df_pca['n_components']  # shared x-values for every curve

# Left panel: accuracy (left y-axis) against explained variance (right y-axis)
ax1_twin = ax1.twinx()
line1 = ax1.plot(n_comps, df_pca['accuracy'], marker='o', color='steelblue', linewidth=2, label='Accuracy')
line2 = ax1_twin.plot(n_comps, df_pca['explained_variance'], marker='s', color='coral', linewidth=2, label='Explained Variance')
ax1.set(xlabel='Number of PCA Components', title='Accuracy vs PCA Components')
# Axis label colours match their curves so the two scales can be told apart.
ax1.set_ylabel('Accuracy', color='steelblue')
ax1_twin.set_ylabel('Explained Variance', color='coral')
ax1.grid(alpha=0.3)
# The curves live on two different axes, so their handles are merged by hand
# to get a single combined legend.
lines = line1 + line2
labels = [line.get_label() for line in lines]
ax1.legend(lines, labels, loc='best')

# Right panel: training time vs components
ax2.plot(n_comps, df_pca['training_time'], marker='o', color='green', linewidth=2)
ax2.set(
    xlabel='Number of PCA Components',
    ylabel='Training Time (seconds)',
    title='Training Time vs PCA Components',
)
ax2.grid(alpha=0.3)

fig.tight_layout()
plt.show()

## 8. Summary and Next Steps

### Key Findings
- Best kernel: [To be determined from results]
- Optimal C value: [To be determined from results]
- PCA components trade-off: [To be determined from results]

### Next Steps for BI2 Project
1. Implement Quantum SVM (QSVM) using Qiskit
2. Compare QSVM performance with Classical SVM
3. Analyze computational cost differences
4. Test on different component counts (qubit counts)
5. Generate final comparison report

In [None]:
# Save the trained model and preprocessor for later use
# `svm` is the baseline RBF model from section 3 and `preprocessor` is the
# 4-component pipeline from section 2; the sweep cells use separate
# svm_temp / prep_temp objects, so these two are unchanged by them.

# Ensure the output directory exists so a fresh checkout does not fail on the
# very last cell. `Path` is imported in the setup cell.
Path("../models").mkdir(parents=True, exist_ok=True)

svm.save_model("../models/classical_svm_rbf.pkl")
preprocessor.save_preprocessor("../models/preprocessor_4comp.pkl")

print("✅ Models saved successfully!")
print("\nReady to proceed with Quantum SVM implementation.")
print("\nReady to proceed with Quantum SVM implementation.")