In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [3]:

# Training and test feature files (whitespace-separated, no header row)
train_url = 'http://www.amlbook.com/data/zip/features.train'
test_url = 'http://www.amlbook.com/data/zip/features.test'

# Load the datasets.
# The separator is a regular expression, so it is written as a raw string:
# in a normal string literal '\s' is an invalid escape sequence, which newer
# Python versions report as a SyntaxWarning.
train_data = pd.read_csv(train_url, sep=r'\s+', header=None)
test_data = pd.read_csv(test_url, sep=r'\s+', header=None)


In [4]:
# Sanity check on parsing: print (rows, columns) for the training frame,
# then for the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

(7291, 3)
(2007, 3)


In [5]:
# Assign column names
train_data.columns = ['label', 'intensity', 'symmetry']
test_data.columns = ['label', 'intensity', 'symmetry']

# Keep only the rows whose label is 1 or 5.
# The explicit .copy() makes each filtered frame an independent object, so the
# relabelling below never writes into something pandas still tracks as a slice
# of the original frame (the source of SettingWithCopyWarning).
train_data_filtered = train_data.loc[train_data['label'].isin([1, 5])].copy()
test_data_filtered = test_data.loc[test_data['label'].isin([1, 5])].copy()

# Convert labels: map 1 to +1 and 5 to -1
train_data_filtered['label'] = train_data_filtered['label'].map({1: 1, 5: -1})
test_data_filtered['label'] = test_data_filtered['label'].map({1: 1, 5: -1})

# Split the features and labels into NumPy arrays for scikit-learn
X_train = train_data_filtered[['intensity', 'symmetry']].values
y_train = train_data_filtered['label'].values

X_test = test_data_filtered[['intensity', 'symmetry']].values
y_test = test_data_filtered['label'].values



**4A**

In [6]:
# Fit a soft-margin linear SVM on the full 1-vs-5 training set
# (C is the regularization parameter)
svm_clf = SVC(kernel='linear', C=1.0).fit(X_train, y_train)

# Classify the test set and measure the fraction predicted correctly
y_pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# One entry in support_ per support vector, so its length is the count
n_support_vectors = svm_clf.support_.shape[0]

# Report the results
print(f"Accuracy on test set: {accuracy * 100:.2f}%")
print(f"Number of support vectors: {n_support_vectors}")

Accuracy on test set: 97.88%
Number of support vectors: 28


**4B:**

In [16]:
subset_sizes = [50, 100, 200, 800]
feature_columns = ['intensity', 'symmetry']

# Repeat the linear-SVM experiment on the leading rows of the training data
for size in subset_sizes:
    print(f"\nTraining with first {size} points:")

    # Take the first `size` rows of the filtered training frame
    X_train_sample = train_data_filtered[feature_columns].iloc[:size].values
    y_train_sample = train_data_filtered['label'].iloc[:size].values

    # Show the shapes of the sampled features and labels
    print(X_train_sample.shape)
    print(y_train_sample.shape)

    # Soft-margin linear SVM (C is the regularization parameter)
    svm_clf = SVC(kernel='linear', C=1.0).fit(X_train_sample, y_train_sample)

    # Evaluate on the full test set
    y_pred = svm_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # One entry in support_ per support vector
    support_vectors = svm_clf.support_.shape[0]

    # Report the results for this subset size
    print(f"Accuracy on test set: {accuracy * 100:.2f}%")
    print(f"Number of support vectors: {support_vectors}")


Training with first 50 points:
(50, 2)
(50,)
Accuracy on test set: 98.11%
Number of support vectors: 2

Training with first 100 points:
(100, 2)
(100,)
Accuracy on test set: 98.11%
Number of support vectors: 4

Training with first 200 points:
(200, 2)
(200,)
Accuracy on test set: 98.11%
Number of support vectors: 8

Training with first 800 points:
(800, 2)
(800,)
Accuracy on test set: 98.11%
Number of support vectors: 14


In [15]:
def test_polynomial_kernel(Q, C, gamma=None):
    """Fit a polynomial-kernel SVM on the 1-vs-5 data and print its errors.

    Trains ``SVC(kernel='poly', degree=Q, C=C, coef0=1)`` on the notebook-level
    ``X_train``/``y_train``, evaluates it on ``X_train`` and ``X_test``, and
    prints the training error, test error and number of support vectors.

    Parameters
    ----------
    Q : int
        Degree of the polynomial kernel.
    C : float
        Soft-margin regularization parameter passed to ``SVC``.
    gamma : float or str, optional
        Kernel coefficient forwarded to ``SVC``. scikit-learn's polynomial
        kernel is ``(gamma * <x, x'> + coef0) ** degree``. When omitted,
        ``SVC``'s own default is used (the data-dependent ``'scale'`` in
        current scikit-learn releases), which reproduces the original
        behavior of this function. Pass ``gamma=1`` to get exactly
        ``(1 + <x, x'>) ** Q``.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print(f"Testing for Q={Q}, C={C}")

    # Only forward gamma when the caller supplied one, so the default call
    # behaves exactly as SVC does without the argument.
    extra_kwargs = {} if gamma is None else {'gamma': gamma}

    # Create SVM with polynomial kernel
    model = SVC(kernel='poly', degree=Q, C=C, coef0=1, **extra_kwargs)

    # Train the model
    model.fit(X_train, y_train)

    # Predict on training and test data
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Compute training and test errors (error = 1 - accuracy)
    train_error = 1 - accuracy_score(y_train, y_train_pred)
    test_error = 1 - accuracy_score(y_test, y_test_pred)

    # Number of support vectors
    num_support_vectors = len(model.support_)

    # Print results
    print(f"Training Error: {train_error}")
    print(f"Test Error: {test_error}")
    print(f"Number of Support Vectors: {num_support_vectors}")
    print()

# Run every (C, Q) combination required by the question: for each C,
# first the degree-2 kernel and then the degree-5 kernel.
C_values = [0.0001, 0.001, 0.01, 1]

for C in C_values:
    for Q in (2, 5):
        test_polynomial_kernel(Q=Q, C=C)

Testing for Q=2, C=0.0001
Training Error: 0.25368353619474693
Test Error: 0.2570754716981132
Number of Support Vectors: 1112

Testing for Q=5, C=0.0001
Training Error: 0.018577834721332454
Test Error: 0.028301886792452824
Number of Support Vectors: 188

Testing for Q=2, C=0.001
Training Error: 0.014093529788597015
Test Error: 0.02594339622641506
Number of Support Vectors: 456

Testing for Q=5, C=0.001
Training Error: 0.006406149903907754
Test Error: 0.018867924528301883
Number of Support Vectors: 72

Testing for Q=2, C=0.01
Training Error: 0.005124919923126248
Test Error: 0.018867924528301883
Number of Support Vectors: 132

Testing for Q=5, C=0.01
Training Error: 0.004484304932735439
Test Error: 0.01650943396226412
Number of Support Vectors: 34

Testing for Q=2, C=1
Training Error: 0.004484304932735439
Test Error: 0.021226415094339646
Number of Support Vectors: 28

Testing for Q=5, C=1
Training Error: 0.0038436899423446302
Test Error: 0.02358490566037741
Number of Support Vectors: 25



According to the results above, the four statements evaluate as follows:

1.   When C = 0.0001, training error is higher at Q = 5: **False** (Q = 5 gives 0.0186 versus 0.2537 at Q = 2)
2.   When C = 0.001, the number of support vectors is lower at Q = 5: **True** (72 versus 456)
3.   When C = 0.01, training error is higher at Q = 5: **False** (0.0045 versus 0.0051)
4.   When C = 1, test error is lower at Q = 5: **False** (0.0236 versus 0.0212)







**4 D**

In [10]:
# Regularization strengths to sweep for the RBF-kernel SVM
C_values = [0.01, 1, 100, 10**4, 10**6]

# Error rates collected in the same order as C_values
training_errors = []
test_errors = []

for C in C_values:
    # Fit an RBF-kernel SVM for this C (gamma is left at SVC's default)
    model = SVC(kernel='rbf', C=C).fit(X_train, y_train)

    # Error = 1 - accuracy, on the training set and on the test set
    train_error = 1 - accuracy_score(y_train, model.predict(X_train))
    test_error = 1 - accuracy_score(y_test, model.predict(X_test))

    training_errors.append(train_error)
    test_errors.append(test_error)

# Summary of errors: one line per C, reported as percentages
for C, train_error, test_error in zip(C_values, training_errors, test_errors):
    print(f"C = {C}: Training Error = {train_error*100:.4f}%, Test Error = {test_error*100:.4f}%")

C = 0.01: Training Error = 0.5125%, Test Error = 1.6509%
C = 1: Training Error = 0.4484%, Test Error = 2.1226%
C = 100: Training Error = 0.3203%, Test Error = 1.8868%
C = 10000: Training Error = 0.2562%, Test Error = 1.8868%
C = 1000000: Training Error = 0.2562%, Test Error = 2.3585%


**5**
We use `SVC` from scikit-learn to train SVMs with different kernels.

In [11]:
from sklearn.preprocessing import StandardScaler
# Standardize the features: fit the scaler on the training features only,
# then apply that same fitted transform to both the training and test features.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Helper that fits one SVM and summarizes how it performs
def train_and_evaluate_svm(X_train, y_train, X_test, y_test, kernel, **kwargs):
    """Fit an ``SVC`` with the given kernel and report its errors.

    Parameters
    ----------
    X_train, y_train
        Features and labels the model is fitted on.
    X_test, y_test
        Features and labels used for the held-out evaluation.
    kernel
        Kernel name passed to ``SVC``.
    **kwargs
        Any further keyword arguments, forwarded unchanged to ``SVC``.

    Returns
    -------
    tuple
        ``(train_error, test_error, num_support_vectors)`` where each error is
        ``1 - accuracy`` on the corresponding split.
    """
    classifier = SVC(kernel=kernel, **kwargs).fit(X_train, y_train)

    def error_rate(features, targets):
        # Misclassification rate of the fitted classifier on one split
        return 1 - accuracy_score(targets, classifier.predict(features))

    return (
        error_rate(X_train, y_train),
        error_rate(X_test, y_test),
        len(classifier.support_),
    )

# All three experiments share the same standardized train/test split
scaled_split = (X_train_scaled, y_train, X_test_scaled, y_test)

# (a) Linear kernel
print("5 A Question's Answer: ")
linear_result = train_and_evaluate_svm(*scaled_split, kernel='linear')
train_error_linear, test_error_linear, num_sv_linear = linear_result
print(f"Linear Kernel: Train Error = {train_error_linear:.4f}, Test Error = {test_error_linear:.4f}, Support Vectors = {num_sv_linear}")

# (b) RBF kernel (γ = 0.001)
print("5 B Question's Answer: ")
rbf_result = train_and_evaluate_svm(*scaled_split, kernel='rbf', gamma=0.001)
train_error_rbf, test_error_rbf, num_sv_rbf = rbf_result
print(f"RBF Kernel (γ=0.001): Train Error = {train_error_rbf:.4f}, Test Error = {test_error_rbf:.4f}, Support Vectors = {num_sv_rbf}")

# (c) Polynomial kernel (degree=2, coef0=1)
poly_result = train_and_evaluate_svm(*scaled_split, kernel='poly', degree=2, coef0=1)
train_error_poly, test_error_poly, num_sv_poly = poly_result
print(f"Polynomial Kernel (degree=2): Train Error = {train_error_poly:.4f}, Test Error = {test_error_poly:.4f}, Support Vectors = {num_sv_poly}")

5 A Question's Answer: 
Linear Kernel: Train Error = 0.0038, Test Error = 0.0189, Support Vectors = 36
5 B Question's Answer: 
RBF Kernel (γ=0.001): Train Error = 0.0160, Test Error = 0.0259, Support Vectors = 556
Polynomial Kernel (degree=2): Train Error = 0.0032, Test Error = 0.0212, Support Vectors = 36
