## SVM model code

Jia

### Import libraries and display dataset information

In [None]:
#import necessary libraries and packages
import quadprog
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

#load the dataset
df = pd.read_csv('processed_urls.csv')

#display dataset information
separator = "=" * 50  # horizontal rule reused between the report sections
print("Dataset Column Information:")
print(separator)
print(df.dtypes)
print("\n" + separator)
print("\nDataFrame Info:")
df.info()  # info() writes its report to stdout itself, so it is not wrapped in print
print("\n" + separator)
print("\nColumn Names:")
#numbered listing (starting at 1) of every column together with its dtype
for position, (column, dtype) in enumerate(df.dtypes.items(), start=1):
    print(f"{position}. {column} ({dtype})")


### Split data for training and testing

In [None]:
#separate training and testing data
# NOTE(review): X_train / Y_train (and X_test / Y_test used further down) are not
# defined anywhere in the visible part of this notebook; they are assumed to come
# from an earlier train/test split -- confirm that cell exists and runs first.

#use at most 5000 samples for training
# Sampling without replacement cannot draw more rows than exist, so the requested
# size is clamped to the number of available training rows.
subset_size = min(5000, X_train.shape[0])
np.random.seed(42)  # fixed seed so the same subset is drawn on every run
subset_indices = np.random.choice(X_train.shape[0], size=subset_size, replace=False)

# Positional row selection; presumably X_train / Y_train are NumPy arrays -- TODO confirm.
X_train_subset = X_train[subset_indices]
Y_train_subset = Y_train[subset_indices]

### Create Q Matrix and set up quadprog parameters

In [None]:
#create Q matrix
n_samples = X_train_subset.shape[0]
Q = np.zeros((n_samples, n_samples))

# Q[i, j] = y_i * y_j * <x_i, x_j>.  Scaling every sample x_i by its label y_i
# first turns Q into the Gram matrix of the scaled samples, so a whole band of
# rows is produced by one matrix product instead of one np.dot call per entry.
signed_samples = (
    np.asarray(Y_train_subset, dtype=float).reshape(-1, 1)
    * np.asarray(X_train_subset, dtype=float)
)

print(f"Creating Q matrix of shape ({n_samples}, {n_samples})...")
progress_step = 500  # rows per band; also the interval of the progress messages
for start in range(0, n_samples, progress_step):
    stop = min(start + progress_step, n_samples)
    Q[start:stop] = signed_samples[start:stop] @ signed_samples.T
    # Only report full multiples of 500 rows, as the row-by-row version did.
    if stop % progress_step == 0:
        print(f"  Completed row {stop}/{n_samples}")

#set up rest of quadprog parameters
# Dual problem:  min 0.5 * a^T P a + q^T a   s.t.  G a <= h,  A a = b
P = Q + np.eye(n_samples) * 1e-5  # small ridge: Q is only positive semi-definite
q = -np.ones(n_samples)
G = -np.eye(n_samples)  # together with h = 0 this encodes alpha_i >= 0
h = np.zeros(n_samples)
A = Y_train_subset.reshape((1, n_samples))  # equality row: sum_i alpha_i * y_i = 0
b = np.zeros(1)

print(f"Q matrix shape: {Q.shape}")

### Solve SVM Optimization

In [None]:
#helper function to solve QP
def quadprog_solve_qp(P, q, G=None, h=None, A=None, b=None):
    """Solve a quadratic program with quadprog.

    The problem is stated in the usual form

        minimize    0.5 * x^T P x + q^T x
        subject to  G x <= h   (optional)
                    A x  = b   (optional)

    and translated to quadprog's convention
    (minimize 0.5 * x^T G x - a^T x subject to C^T x >= b, where the first
    ``meq`` constraints are equalities).

    Args:
        P: (n, n) cost matrix; it is symmetrized before being passed on.
        q: (n,) linear cost vector.
        G, h: inequality constraint matrix and bound; pass both or neither.
        A, b: equality constraint matrix and right-hand side; pass both or neither.

    Returns:
        The minimizer x as a 1-D array (first element of quadprog's result tuple).
    """
    # quadprog operates on double-precision buffers, so coerce everything once.
    P = np.asarray(P, dtype=float)
    qp_G = .5 * (P + P.T)  # symmetrize the cost matrix
    qp_a = -np.asarray(q, dtype=float)

    # Equality rows must come first: quadprog treats the first `meq` rows as equalities.
    constraint_rows = []
    constraint_bounds = []
    meq = 0
    if A is not None:
        A = np.atleast_2d(np.asarray(A, dtype=float))
        constraint_rows.append(A)
        constraint_bounds.append(np.asarray(b, dtype=float).ravel())
        meq = A.shape[0]
    if G is not None:
        constraint_rows.append(np.atleast_2d(np.asarray(G, dtype=float)))
        constraint_bounds.append(np.asarray(h, dtype=float).ravel())

    if not constraint_rows:
        # Unconstrained problem: quadprog's constraint arguments are optional.
        return quadprog.solve_qp(qp_G, qp_a)[0]

    # Flip signs to turn "G x <= h" into quadprog's "C^T x >= b" form.
    qp_C = -np.vstack(constraint_rows).T
    qp_b = -np.hstack(constraint_bounds)
    return quadprog.solve_qp(qp_G, qp_a, qp_C, qp_b, meq)[0]

#solve SVM optimization
print("Solving SVM optimization problem...")
solution = quadprog_solve_qp(P, q, G, h, A, b)

# Multipliers above the 1e-5 tolerance are counted as support vectors.
is_support_vector = solution > 1e-5
support_vector_count = np.sum(is_support_vector)
support_vector_positions = np.where(is_support_vector)[0]

print(f"\nSolution shape: {solution.shape}")
print(f"Number of support vectors: {support_vector_count}")
print(f"Support vector indices: {support_vector_positions}")


### Compute bias term

In [None]:
#extract support vectors and compute bias (b) term
support_vector_indices = np.where(solution > 1e-5)[0]
support_vectors = X_train_subset[support_vector_indices]
support_vector_labels = Y_train_subset[support_vector_indices]
alphas = solution[support_vector_indices]

print(f"Number of support vectors: {len(support_vector_indices)}")

# Weight vector of the linear SVM: w = sum_i alpha_i * y_i * x_i over the support
# vectors. It does not depend on how the bias is estimated, so compute it once.
w = np.sum(alphas[:, np.newaxis] * support_vector_labels[:, np.newaxis] * support_vectors, axis=0)

# Compute bias term (b) using support vectors
# For a support vector on the margin: y_i * (w^T * x_i + b) = 1, and because
# y_i is +1 or -1 this rearranges to b = y_i - w^T * x_i.
bias_estimates = support_vector_labels - support_vectors @ w

# Prefer support vectors with a small multiplier (alpha < 0.99).
# NOTE(review): the QP set up in this file only constrains alpha >= 0 (there is no
# upper bound C), so the 0.99 cut-off is a heuristic kept from the original code --
# confirm the intended C. Every entry of `alphas` already exceeds 1e-5.
small_alpha = alphas < 0.99
margin_sv_indices = support_vector_indices[small_alpha]

if small_alpha.any():
    b_term = np.mean(bias_estimates[small_alpha])
elif len(alphas) > 0:
    # No multiplier is below the cut-off: average over all support vectors
    # instead of silently using b = 0.
    b_term = np.mean(bias_estimates)
else:
    # No support vectors at all: w is the zero vector and no bias can be estimated.
    b_term = 0

print(f"Weight vector shape: {w.shape}")
print(f"Bias term (b): {b_term}")

### Make Predictions

In [None]:
#prediction function
def predict_svm(X, w, b):
    """Classify samples with a linear SVM decision function.

    Args:
        X: samples, one per row (anything np.dot accepts against ``w``).
        w: weight vector of the separating hyperplane.
        b: bias term of the separating hyperplane.

    Returns:
        The sign of ``X . w + b`` for each sample: +1 or -1. A sample lying
        exactly on the hyperplane (score 0) is assigned to the +1 class so that
        the result is always a valid class label.
    """
    predictions = np.dot(X, w) + b
    labels = np.sign(predictions)
    # np.sign yields 0 for a score of exactly 0; adding the boolean mask bumps
    # only those entries to +1 and leaves dtype, shape and NaNs untouched.
    return labels + (labels == 0)

#score the training subset and the test set with the fitted hyperplane
y_pred_train = predict_svm(X_train_subset, w, b_term)
y_pred_test = predict_svm(X_test, w, b_term)

#accuracy = fraction of predictions that equal the true label
train_accuracy = np.mean(y_pred_train == Y_train_subset)
test_accuracy = np.mean(y_pred_test == Y_test)

#summary report
print(f"\n{'='*50}")
print("SVM MODEL EVALUATION")
print(f"{'='*50}")
print(f"Training Accuracy: {train_accuracy:.4f} ({train_accuracy*100:.2f}%)")
print(f"Test Accuracy:     {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

### Evaluate Model (TO DO)