In [2]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# --- Constants for Local Files ---
TRAIN_X_FILE = "X_train_binary.csv"
TRAIN_Y_FILE = "y_train_binary.csv"
TEST_X_FILE = "X_test_binary.csv"
TEST_Y_FILE = "y_test_binary.csv"

# -----------------------------------------------------------------
# TASK 2.1: Data Understanding and Preprocessing
# -----------------------------------------------------------------
def load_and_preprocess_data():
    """Read the four CSV data files, print dataset statistics, standardize inputs.

    The feature matrices and label vectors are read with ``np.loadtxt`` from
    the paths stored in the module-level ``*_FILE`` constants. The function
    prints the sample counts and the relative class frequencies of the
    training labels, fits a ``StandardScaler`` on the training inputs only,
    and applies that same fitted scaler to the test inputs.

    Returns:
        Tuple ``(X_train_scaled, y_train, X_test_scaled, y_test)``.

    Raises:
        FileNotFoundError: Re-raised, after printing a message, when one of
            the data files cannot be found.
    """
    print("--- Task 2.1: Data Loading ---")
    csv_paths = (TRAIN_X_FILE, TRAIN_Y_FILE, TEST_X_FILE, TEST_Y_FILE)
    try:
        # Files are read in the order: train X, train y, test X, test y.
        X_train, y_train, X_test, y_test = [
            np.loadtxt(path, delimiter=',') for path in csv_paths
        ]
    except FileNotFoundError:
        print("Error: One or more data files not found in the directory.")
        raise

    # 1. Sample counts (rows of each feature matrix).
    print(f"Number of training samples: {X_train.shape[0]}")
    print(f"Number of test samples:     {X_test.shape[0]}")

    # 2. Relative frequency of each training label, ordered by label value.
    class_shares = pd.Series(y_train).value_counts(normalize=True).sort_index()
    print("\nTraining Class Frequencies:")
    for class_label, share in class_shares.items():
        print(f"  Class {int(class_label)}: {share:.2%}")

    # 3. Standardization: the scaler statistics come from the training inputs
    #    only, and the identical transformation is then reused for the test
    #    inputs so that no test information enters the fitted parameters.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Spot check on a single column (index 10) of the transformed test inputs.
    print("\nVerification of Transformed Test Data (Feature 10):")
    probe_column = X_test_scaled[:, 10]
    print(f"  Mean (should be near 0): {probe_column.mean():.4f}")
    print(f"  Variance (should be near 1): {probe_column.var():.4f}")

    return X_train_scaled, y_train, X_test_scaled, y_test

# -----------------------------------------------------------------
# TASK 2.2: Model Selection using Grid-Search
# -----------------------------------------------------------------
def run_grid_search(X_train, y_train):
    """Fit a ``GridSearchCV`` over ``C`` and ``gamma`` for an RBF-kernel ``SVC``.

    Each hyperparameter gets five logarithmically spaced candidates
    (``C`` from 1e-2 to 1e2, ``gamma`` from 1e-3 to 1e1), i.e. 25 combinations,
    scored by accuracy with ``cv=5``.

    Args:
        X_train: Training inputs passed to ``GridSearchCV.fit``.
        y_train: Training labels passed to ``GridSearchCV.fit``.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # Candidate values on a log scale for both hyperparameters.
    search_space = {
        'C': np.logspace(-2, 2, 5),       # 0.01 ... 100
        'gamma': np.logspace(-3, 1, 5),   # 0.001 ... 10
    }

    searcher = GridSearchCV(
        estimator=SVC(kernel='rbf', random_state=42),
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    print("\n--- Task 2.2: Starting Grid Search ---")
    searcher.fit(X_train, y_train)
    return searcher

# -----------------------------------------------------------------
# TASK 2.3: Inspecting the Kernel Expansion (Model Interpretation)
# -----------------------------------------------------------------
def analyze_support_vectors(X_train, y_train, C_val, gamma_val):
    """Print free and bounded support-vector counts for decreasing C values.

    An RBF-kernel ``SVC`` is trained three times with ``gamma_val`` fixed and
    ``C`` set to ``C_val``, ``C_val / 100`` and ``C_val / 1000``. For each fit
    the absolute dual coefficients are split into

    * bounded support vectors: ``|alpha_i|`` equal to ``C`` (within tolerance),
    * free support vectors: ``0 < |alpha_i| < C``,

    and one table row with both counts is printed.

    The comparison tolerance is relative to the current ``C``. A fixed
    absolute tolerance breaks down once ``C`` is of similar magnitude to the
    tolerance: every support vector is then reported as bounded.

    Args:
        X_train: Training inputs used to fit each model.
        y_train: Training labels. The flattening of ``dual_coef_`` below is
            written with a two-class problem in mind.
        C_val: Largest regularization value to analyse (must be positive).
        gamma_val: RBF kernel coefficient used for every fit.

    Returns:
        None. Results are printed.
    """
    print("\n--- Task 2.3: Support Vector Analysis ---")

    # The optimal C and two drastically lower values.
    C_analysis_values = [C_val, C_val / 100, C_val / 1000]

    # Fraction of C within which a coefficient is treated as sitting on the
    # upper bound.
    relative_tolerance = 1e-6

    print("Rigorous Argument: Free SVs DECREASE, Bounded SVs INCREASE when C is lowered.")
    # Column widths are shared between the header and the data rows so the
    # table lines up (the "Free SVs" header text is 22 characters wide).
    print(f"{'C Value':<10} | {'Free SVs (0 < |a| < C)':<22} | {'Bounded SVs (|a| = C)':<30}")
    print("-" * 70)

    for C_test in C_analysis_values:
        # Train a new SVM with the test C value and fixed gamma.
        svm_model = SVC(kernel='rbf', C=C_test, gamma=gamma_val, random_state=42)
        svm_model.fit(X_train, y_train)

        # dual_coef_ holds one signed coefficient per support vector; only the
        # magnitude matters for the free/bounded distinction.
        abs_alphas = np.abs(svm_model.dual_coef_.ravel())

        # Scale the tolerance with C so the classification stays meaningful
        # for very small C values as well.
        tolerance = relative_tolerance * C_test
        at_upper_bound = abs_alphas >= C_test - tolerance
        bounded_svs = int(np.count_nonzero(at_upper_bound))
        free_svs = int(np.count_nonzero((abs_alphas > tolerance) & ~at_upper_bound))

        print(f"{C_test:<10.4f} | {free_svs:<22} | {bounded_svs:<30}")

    print("\nDeliverable: Code Snippets for Free and Bounded SV computation are within this function.")
    print("These results should verify the theory that Bounded SVs increase as C decreases.")

# -----------------------------------------------------------------
# MAIN EXECUTION
# -----------------------------------------------------------------
if __name__ == "__main__":
    # Driver: load and standardize the data, select (C, gamma) by grid search,
    # report the test error of the selected model, then run the
    # support-vector analysis with the selected hyperparameters.
    X_train_s, y_train, X_test_s, y_test = load_and_preprocess_data()

    # --- Task 2.2: hyperparameter selection ---
    grid_search = run_grid_search(X_train_s, y_train)

    # Pull the selected hyperparameters and the corresponding estimator
    # from the finished search.
    best_params = grid_search.best_params_
    final_C, final_gamma = best_params['C'], best_params['gamma']
    final_svm = grid_search.best_estimator_

    # 0-1 loss on the test set is the complement of the accuracy.
    test_predictions = final_svm.predict(X_test_s)
    final_test_accuracy = accuracy_score(y_test, test_predictions)
    final_test_loss = 1 - final_test_accuracy

    # --- Task 2.2 deliverables ---
    report_lines = (
        "\n=============================================",
        "FINAL MODEL RESULTS (Task 2.2 Deliverables)",
        "=============================================",
        f"Optimal C:     {final_C}",
        f"Optimal Gamma: {final_gamma}",
        f"Test Loss (0-1 Error): {final_test_loss:.4f}",
        "=============================================",
    )
    for report_line in report_lines:
        print(report_line)

    # --- Task 2.3: support-vector analysis ---
    analyze_support_vectors(X_train_s, y_train, final_C, final_gamma)

--- Task 2.1: Data Loading ---
Number of training samples: 150
Number of test samples:     164

Training Class Frequencies:
  Class -1: 45.33%
  Class 1: 54.67%

Verification of Transformed Test Data (Feature 10):
  Mean (should be near 0): 0.0141
  Variance (should be near 1): 1.0939

--- Task 2.2: Starting Grid Search ---
Fitting 5 folds for each of 25 candidates, totalling 125 fits

FINAL MODEL RESULTS (Task 2.2 Deliverables)
Optimal C:     1.0
Optimal Gamma: 0.01
Test Loss (0-1 Error): 0.2073

--- Task 2.3: Support Vector Analysis ---
Rigorous Argument: Free SVs DECREASE, Bounded SVs INCREASE when C is lowered.
C Value    | Free SVs (0 < |a| < C) | Bounded SVs (|a| = C)         
----------------------------------------------------------------------
1.0000     | 30                   | 55                            
0.0100     | 4                    | 134                           
0.0010     | 0                    | 136                           

Deliverable: Code Snippets for Free