# Linear soft margin svm

In [78]:
import numpy as np
from cvxopt import matrix, solvers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import time


def linear_soft_margin_svm(X, y, C):
    """
    Solves the soft-margin SVM optimization problem using quadratic programming.

    Args:
        X: Input data (N x d), where N is the number of samples and d is the feature dimension.
        y: Labels (N x 1), where each label is either +1 or -1.
        C: Regularization parameter.

    Returns:
        w: Weight vector (d x 1).
        b: Bias term.
    """
    N, d = X.shape

    # Construct the matrices for the QP solver

    # P: Block diagonal matrix (d + 1 + N) x (d + 1 + N)
    P = np.zeros((d + 1 + N, d + 1 + N))
    P[:d, :d] = np.eye(d)  # Penalize w but not b or xi
    P = matrix(P)

    # q: Linear term (d + 1 + N)x1
    q = np.zeros(d + 1 + N)
    q[d + 1:] = C  # Penalize xi with factor C
    q = matrix(q)

    # G: Inequality constraint coefficients
    G = np.zeros((2 * N, d + 1 + N))
    
    # y_i(w^T x_i + b) + xi >= 1
    for i in range(N):
        G[i, :d] = -y[i] * X[i]
        G[i, d] = -y[i]  # Bias term
        G[i, d + 1 + i] = -1  # Slack variable xi

    # xi >= 0
    for i in range(N):
        G[N + i, d + 1 + i] = -1

    G = matrix(G)

    # h: Right-hand side for inequality constraints
    h = np.zeros(2 * N)
    h[:N] = -1  # y_i(w^T x_i + b) + xi >= 1
    h = matrix(h)

    # A and b: Equality constraints (none for soft-margin SVM)
    A = None
    b = None

    # Solve the QP problem
    sol = solvers.qp(P, q, G, h, A, b)
    solution = np.array(sol['x']).flatten()

    # Extract w and b from the solution
    w = solution[:d]
    b = solution[d]

    return w, b

def predict(X, w, b):
    """
    Predicts the labels for the input data using the learned SVM model.

    Args:
        X: Input data (N x d), where N is the number of samples and d is the feature dimension.
        w: Weight vector (d x 1).
        b: Bias term.

    Returns:
        Predicted labels (N x 1).
    """
    return np.sign(np.dot(X, w) + b)

def calculate_accuracy(X_test, y_test, w, b):
    """
    Calculate the accuracy of the SVM model on the test set.

    Args:
        X_test: Test data (N x d).
        y_test: True labels for the test data (N x 1).
        w: Weight vector.
        b: Bias term.

    Returns:
        Accuracy of the model.
    """
    predictions = predict(X_test, w, b)
    accuracy = np.mean(predictions == y_test)
    return accuracy

def calculate_support_vectors(X, y, w, b, C):
    """
    Calculate the number of support vectors by checking the decision function.

    Args:
        X: Data points (N x d).
        y: Labels (N x 1).
        w: Weight vector.
        b: Bias term.
        C: Regularization parameter.

    Returns:
        Number of support vectors.
    """
    decision_values = y * (np.dot(X, w) + b)
    support_vectors = np.sum(np.abs(decision_values - 1) <= 1e-5)

    return support_vectors

# Example usage
if __name__ == "__main__":
    # Load dataset
    X = pd.read_excel("../coffeeDataSynthesized.xlsx", "dataset")
    y = np.where(X["type"] == "robusta", 0, 1)  # Robusta -> 0, Arabica -> 1
    y = pd.Series(y)

    # Select relevant features
    X = X[['width', 'height', 'depth', 'weight']]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Convert y_train and y_test to -1 and +1
    y_train = np.where(y_train == 0, -1, 1)
    y_test = np.where(y_test == 0, -1, 1)

    # Measure training time
    start_time = time.time()
    w, b = linear_soft_margin_svm(X_train_scaled, y_train, C=0.1)

    end_time = time.time()
    training_time = end_time - start_time

    # Predict and evaluate custom SVM
    y_pred_custom = predict(X_test_scaled, w, b)
    y_train_pred_custom = predict(X_train_scaled, w, b)
    custom_accuracy = np.mean(y_pred_custom == y_test)
    custom_accuracy_train = np.mean(y_train_pred_custom == y_train)

    # Output results
    print("Custom Linear SVM Results:")
    print(f"Test Accuracy: {custom_accuracy:.4f}")
    print(f"Train Accuracy: {custom_accuracy_train:.4f}")
    print(f"Training Time: {training_time:.4f} seconds")

    # Calculate number of support vectors
    num_support_vectors = calculate_support_vectors(X_train_scaled, y_train, w, b, C=0.1)
    print(f"Number of Support Vectors: {num_support_vectors}")

    print("Weight vector (w):", w)
    print("Bias term (b):", b)
    


     pcost       dcost       gap    pres   dres
 0:  2.2697e+01  5.1817e+02  8e+03  3e+00  5e+02
 1:  2.0900e+02 -2.2996e+02  5e+02  1e-01  2e+01
 2:  1.1640e+02  1.9175e+01  1e+02  1e-02  2e+00
 3:  5.8976e+01  4.1479e+01  2e+01  2e-03  4e-01
 4:  5.0881e+01  4.7086e+01  4e+00  4e-04  7e-02
 5:  4.9575e+01  4.8181e+01  1e+00  1e-04  2e-02
 6:  4.9191e+01  4.8499e+01  7e-01  5e-05  8e-03
 7:  4.8992e+01  4.8667e+01  3e-01  2e-05  3e-03
 8:  4.8875e+01  4.8767e+01  1e-01  5e-06  8e-04
 9:  4.8830e+01  4.8807e+01  2e-02  7e-07  1e-04
10:  4.8820e+01  4.8816e+01  4e-03  1e-07  2e-05
11:  4.8818e+01  4.8818e+01  4e-04  1e-08  2e-06
12:  4.8818e+01  4.8818e+01  4e-06  1e-10  2e-08
Optimal solution found.
Custom Linear SVM Results:
Test Accuracy: 0.8678
Train Accuracy: 0.7944
Training Time: 0.6436 seconds
Number of Support Vectors: 5
Weight vector (w): [ 1.13674498  1.05522057 -0.16442498  0.15522254]
Bias term (b): 0.013285141925986578


In [79]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_custom)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[100  20]
 [ 12 110]]


# Cross validation

In [80]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time

def cross_validate_svm(X, y, C_values, n_splits=5):
    """
    Perform 5-fold cross-validation to tune the hyperparameter C.

    Args:
        X: Input data (N x d)
        y: Labels (N x 1)
        C_values: List of regularization parameters C to test
        n_splits: Number of folds for cross-validation (default is 5)

    Returns:
        best_C: The best regularization parameter C based on cross-validation
        best_accuracy: The corresponding average accuracy
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=44)
    best_C = None
    best_accuracy = 0


    # Perform cross-validation for each C value
    for C in C_values:
        accuracies = []
        train_accuracies = []

        for train_idx, val_idx in kf.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]

            # Standardize the features using the training data
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_val_scaled = scaler.transform(X_val)  # Apply the same scaling to the validation set

            # Train the SVM with the current C (replace this with your linear soft margin SVM)
            w, b = linear_soft_margin_svm(X_train_scaled, y_train, C)

            # Predict and evaluate on validation set
            y_pred_val = predict(X_val_scaled, w, b)
            y_pred_train = predict(X_train_scaled, w, b)
            accuracy = np.mean(y_pred_val == y_val)
            train_accuracy = np.mean(y_pred_train == y_train)
            accuracies.append(accuracy)
            train_accuracies.append(train_accuracy)

        # Calculate the average accuracy for the current C
        avg_accuracy = np.mean(accuracies)
        avg_train_accuracy = np.mean(train_accuracies)

        # Update the best C if necessary
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_train_accuracy = avg_train_accuracy
            best_C = C


    return best_C, best_accuracy, best_train_accuracy

# Example usage
if __name__ == "__main__":
    # Load dataset
    X = pd.read_excel("../coffeeDataSynthesized.xlsx", "dataset")
    y = np.where(X["type"] == "robusta", 0, 1)  # Robust -> 0, Arabica -> 1
    y = pd.Series(y)

    # Select relevant features
    X = X[['width', 'height', 'depth', 'weight']]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=44)

    # Scale the features (using StandardScaler for consistency across all steps)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)  # Apply the same scaling to the test data

    # Convert y_train and y_test to -1 and +1 (SVM convention)
    y_train = np.where(y_train == 0, -1, 1)
    y_test = np.where(y_test == 0, -1, 1)

    # Set a range of C values to test
    C_values = [0.1, 1.0, 10.0, 100.0, 1000.0]

    # Measure training time
    start_time = time.time()
    # Tune hyperparameter C using cross-validation
    best_C, best_accuracy, best_train_accuracy = cross_validate_svm(X_train_scaled, y_train, C_values)
    w, b = linear_soft_margin_svm(X_train_scaled, y_train, best_C)
    end_time = time.time()
    y_pred_custom_cv = predict(X_test_scaled, w, b)
    y_train_pred_custom_cv = predict(X_train_scaled, w, b)
    custom_accuracy_cv = np.mean(y_pred_custom_cv == y_test)
    custom_accuracy_train_cv = np.mean(y_train_pred_custom_cv == y_train)
    training_time = end_time - start_time

    print(f"Best C: {best_C}")
    print(f"Best Cross-Validation Test Accuracy: {custom_accuracy_cv:.4f}")
    print(f"Best Cross-Validation Train Accuracy: {custom_accuracy_train_cv:.4f}")
    print(f"Cross-Validation Run Time: {training_time:.4f}")


     pcost       dcost       gap    pres   dres
 0:  1.7482e+01  4.0268e+02  6e+03  3e+00  4e+02
 1:  1.6161e+02 -1.8768e+02  4e+02  1e-01  2e+01
 2:  9.1942e+01  1.2225e+01  8e+01  1e-02  2e+00
 3:  4.6940e+01  3.0891e+01  2e+01  3e-03  4e-01
 4:  3.9104e+01  3.5712e+01  3e+00  5e-04  8e-02
 5:  3.7916e+01  3.6687e+01  1e+00  1e-04  2e-02
 6:  3.7544e+01  3.6985e+01  6e-01  5e-05  7e-03
 7:  3.7348e+01  3.7143e+01  2e-01  1e-05  2e-03
 8:  3.7265e+01  3.7214e+01  5e-02  2e-06  4e-04
 9:  3.7244e+01  3.7232e+01  1e-02  5e-07  7e-05
10:  3.7238e+01  3.7238e+01  4e-04  1e-08  2e-06
11:  3.7238e+01  3.7238e+01  5e-06  1e-10  2e-08
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0:  1.7857e+01  4.1021e+02  6e+03  3e+00  4e+02
 1:  1.6589e+02 -1.8498e+02  4e+02  1e-01  2e+01
 2:  9.2558e+01  1.5317e+01  8e+01  1e-02  2e+00
 3:  4.5537e+01  3.3485e+01  1e+01  2e-03  3e-01
 4:  4.0495e+01  3.7535e+01  3e+00  4e-04  6e-02
 5:  3.9348e+01  3.8596e+01  8e-01  6e-05  9e-0

# Different scikit learn libraries

In [81]:
# Import necessary libraries
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import time  # Import time module

# Train and evaluate Linear SVM
start_time = time.time()  # Start time
linear_svm = SVC(kernel='linear')
linear_svm.fit(X_train_scaled, y_train)
y_pred_linear = linear_svm.predict(X_test_scaled)
y_pred_train_linear = linear_svm.predict(X_train_scaled)
linear_accuracy = accuracy_score(y_test, y_pred_linear)
linear_train_accuracy = accuracy_score(y_train, y_pred_train_linear)
linear_time = time.time() - start_time  # End time and calculate duration
print(f"Linear SVM Accuracy: {linear_accuracy:.4f}, Train Accuracy: {linear_train_accuracy:.4f}, Time: {linear_time:.4f} seconds")

# Train and evaluate RBF SVM
start_time = time.time()  # Start time
rbf_svm = SVC(kernel='rbf')
rbf_svm.fit(X_train_scaled, y_train)
y_pred_rbf = rbf_svm.predict(X_test_scaled)
y_pred_train_rbf = rbf_svm.predict(X_train_scaled)
rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
rbf_train_accuracy = accuracy_score(y_train, y_pred_train_rbf)
rbf_time = time.time() - start_time  # End time and calculate duration
print(f"RBF SVM Accuracy: {rbf_accuracy:.4f}, Train Accuracy: {rbf_train_accuracy:.4f}, Time: {rbf_time:.4f} seconds")

# Train and evaluate Polynomial SVM (degree=3)
start_time = time.time()  # Start time
poly_svm = SVC(kernel='poly', degree=3)
poly_svm.fit(X_train_scaled, y_train)
y_pred_poly = poly_svm.predict(X_test_scaled)
y_pred_train_poly = poly_svm.predict(X_train_scaled)
poly_accuracy = accuracy_score(y_test, y_pred_poly)
poly_train_accuracy = accuracy_score(y_train, y_pred_train_poly)
poly_time = time.time() - start_time  # End time and calculate duration
print(f"Polynomial SVM (degree=3) Test Accuracy: {poly_accuracy:.4f}, Train Accuracy: {poly_train_accuracy:.4f} , Time: {poly_time:.4f} seconds")


Linear SVM Accuracy: 0.8678, Train Accuracy: 0.7955, Time: 0.0453 seconds
RBF SVM Accuracy: 0.8760, Train Accuracy: 0.8771, Time: 0.0469 seconds
Polynomial SVM (degree=3) Test Accuracy: 0.7603, Train Accuracy: 0.7727 , Time: 0.0232 seconds


In [82]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_linear)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[102  18]
 [ 14 108]]


In [83]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_rbf)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[101  19]
 [ 11 111]]


In [84]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred_poly)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[ 64  56]
 [  2 120]]


In [85]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, recall_score, precision_score, f1_score

# Example: Recall, Precision, and F1-score

accuracy = accuracy_score(y_test, y_pred_custom)
recall = recall_score(y_test, y_pred_custom)
precision = precision_score(y_test, y_pred_custom)
f1 = f1_score(y_test, y_pred_custom)
auroc = roc_auc_score(y_test, y_pred_custom)

cv_accuracy = accuracy_score(y_test, y_pred_custom_cv)
cv_recall = recall_score(y_test, y_pred_custom_cv)
cv_precision = precision_score(y_test, y_pred_custom_cv)
cv_f1 = f1_score(y_test, y_pred_custom_cv)
cv_auroc = roc_auc_score(y_test, y_pred_custom_cv)

linear_accuracy = accuracy_score(y_test, y_pred_linear)
linear_recall = recall_score(y_test, y_pred_linear)
linear_precision = precision_score(y_test, y_pred_linear)
linear_f1 = f1_score(y_test, y_pred_linear)
linear_auroc = roc_auc_score(y_test, y_pred_linear)

rbf_accuracy = accuracy_score(y_test, y_pred_rbf)
rbf_recall = recall_score(y_test, y_pred_rbf)
rbf_precision = precision_score(y_test, y_pred_rbf)
rbf_f1 = f1_score(y_test, y_pred_rbf)
rbf_auroc = roc_auc_score(y_test, y_pred_rbf)

poly_accuracy = accuracy_score(y_test, y_pred_poly)
poly_recall = recall_score(y_test, y_pred_poly)
poly_precision = precision_score(y_test, y_pred_poly)
poly_f1 = f1_score(y_test, y_pred_poly)
poly_auroc = roc_auc_score(y_test, y_pred_poly)






# Print all metrics
print(f"Our Accuracy: {accuracy:4f}")
print(f"Our Recall: {recall:4f}")
print(f"Our Precision: {precision:4f}")
print(f"Our F1-score: {f1:4f}"),
print(f"Our AUROC: {auroc:4f}")
print()
print(f"CV Accuracy: {cv_accuracy:4f}")
print(f"CV Recall: {cv_recall:4f}")
print(f"CV Precision: {cv_precision:4f}")
print(f"CV F1-score: {cv_f1:4f}"),
print(f"CV AUROC: {cv_auroc:4f}")
print()
print(f"Linear Accuracy: {linear_accuracy:4f}")
print(f"Linear Recall: {linear_recall:4f}")
print(f"Linear Precision: {linear_precision:4f}")
print(f"Linear F1-score: {linear_f1:4f}"),
print(f"Linear AUROC: {linear_auroc:4f}")
print()
print(f"RBF Accuracy: {rbf_accuracy:4f}")
print(f"RBF Recall: {rbf_recall:4f}")
print(f"RBF Precision: {rbf_precision:4f}")
print(f"RBF F1-score: {rbf_f1:4f}")
print(f"RBF AUROC: {rbf_auroc:4f}")
print()
print(f"Polynomial Accuracy: {poly_accuracy:4f}")
print(f"Polynomial Recall: {poly_recall:4f}")
print(f"Polynomial Precision: {poly_precision:4f}")
print(f"Polynomial F1-score: {poly_f1:4f}")
print(f"Polynomial AUROC: {poly_auroc:4f}")




Our Accuracy: 0.867769
Our Recall: 0.901639
Our Precision: 0.846154
Our F1-score: 0.873016
Our AUROC: 0.867486

CV Accuracy: 0.867769
CV Recall: 0.885246
CV Precision: 0.857143
CV F1-score: 0.870968
CV AUROC: 0.867623

Linear Accuracy: 0.867769
Linear Recall: 0.885246
Linear Precision: 0.857143
Linear F1-score: 0.870968
Linear AUROC: 0.867623

RBF Accuracy: 0.876033
RBF Recall: 0.909836
RBF Precision: 0.853846
RBF F1-score: 0.880952
RBF AUROC: 0.875751

Polynomial Accuracy: 0.760331
Polynomial Recall: 0.983607
Polynomial Precision: 0.681818
Polynomial F1-score: 0.805369
Polynomial AUROC: 0.758470
