In [1]:
import numpy as np
#import pandas as pd
#import bz2file as bz2
import os
from typing import Tuple, Optional


# Introduction to the Gisette Dataset

The Gisette dataset is a well-known benchmark dataset in the field of machine learning, particularly used for feature selection and binary classification tasks. It was originally part of the NIPS 2003 feature selection challenge. The dataset consists of handwritten digit images, where the task is to distinguish between the digits '4' and '9'.

## Dataset Characteristics

- **Features:** The dataset contains 5000 features, many of which are redundant or irrelevant, making it a good test for feature selection algorithms.
- **Instances:** The LIBSVM version used here (`gisette_scale`) has 6000 instances in the training set and 1000 instances in the test set (7000 in total).
- **Classes:** The labels are binary, with two classes representing the digits '4' and '9'.

## Usage

The Gisette dataset is often used to evaluate the performance of various machine learning algorithms, especially those designed for high-dimensional data. It provides a challenging testbed for algorithms due to its high dimensionality and the presence of irrelevant features.

## References

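- I. Guyon, S. Gunn, A. Ben-Hur, G. Dror. *Result Analysis of the NIPS 2003 Feature Selection Challenge*. Advances in Neural Information Processing Systems 17 (NIPS 2004).
- The scaled files `gisette_scale.bz2` / `gisette_scale.t.bz2` come from the LIBSVM binary-classification datasets collection.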

All features are scaled.

The Gisette dataset is often used to evaluate the performance of various machine learning algorithms, especially those designed for high-dimensional data. It provides a challenging testbed for algorithms due to its high dimensionality and the presence of irrelevant features.
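Each line of the data files looks like this:

`-1 1:-1 2:-1 3:0.913914 4:-1 5:-1 6:0.4530 ...`

The first number is the label $y$, which is either `1` or `-1`. It is followed by 5000 pairs of the form `index:value`, where `index` is the integer feature index and `value` is the (float) feature value, i.e. the corresponding component of $\mathbf{x}$.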


In [2]:
from src.utils import read_gisette_data

In [3]:
# Passed as `max_lines` to the reader for both files.
MAX_LINES = 7000

# Both compressed data files sit in ../data relative to this notebook.
data_dir = os.path.join("..", "data")
file_path_train = os.path.join(data_dir, "gisette_scale.bz2")
file_path_test = os.path.join(data_dir, "gisette_scale.t.bz2")

# The reader returns the labels first and the feature matrix second.
y_train, X_train = read_gisette_data(file_path_train, max_lines=MAX_LINES)
y_test, X_test = read_gisette_data(file_path_test, max_lines=MAX_LINES)

## SVM problem definition

* The optimization problem we need to solve is the following (soft-margin primal, with $m$ training examples):
$$
\begin{equation}
\begin{aligned}
& \min \quad \frac{1}{2} \|\mathbf{w}\|^2 + C \sum_{i=1}^m \xi_i \\
& \text{subject to} \quad y_i (\mathbf{w} \cdot \mathbf{x}_i + b) \geq 1 - \xi_i, \quad \xi_i \geq 0, \quad i = 1, \dots, m
\end{aligned}
\end{equation}
$$

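Following Platt's SMO algorithm (J. Platt, *Sequential Minimal Optimization: A Fast Algorithm for Training Support Vector Machines*, Microsoft Research Technical Report MSR-TR-98-14, 1998), it is preferable to solve the dual problem, which is the following: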

$$
\begin{equation}
\begin{aligned}
& \max_{\boldsymbol{\alpha}} \quad \sum_{i=1}^m \alpha_i - \frac{1}{2} \sum_{i=1}^m \sum_{j=1}^m \alpha_i \alpha_j y_i y_j \, \mathbf{x}_i \cdot \mathbf{x}_j \\
& \text{subject to} \quad \sum_{i=1}^m \alpha_i y_i = 0 \\
& \quad \quad \quad \quad \quad 0 \leq \alpha_i \leq C, \quad i = 1, \dots, m
\end{aligned}
\end{equation}
$$
     

In [11]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score

# Baseline model: scikit-learn's SVC with a linear kernel and C=100.
svm = SVC(C=100, kernel='linear')

# Fit on the training split; as the last expression, the fitted estimator is displayed.
svm.fit(X_train, y_train)


In [12]:
# Label predictions of the fitted baseline on the held-out split
y_pred = svm.predict(X_test)

# Evaluate the three reported metrics against the true test labels in one pass
accuracy, f1, precision = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, f1_score, precision_score)
)

print(f'Accuracy of SVM on test set: {accuracy:.2f}')
print(f'F1 of SVM on test set: {f1:.2f}')
print(f'Precision of SVM on test set: {precision:.2f}')



Accuracy of SVM on test set: 0.97
F1 of SVM on test set: 0.97
Precision of SVM on test set: 0.98


# Custom SMO algorithm

* Create the SMO algorithm.
* Follow the scikit-learn class style, so that the classifier has `fit` and `predict` methods for training and inference.
* The core of the classifier class are the two functions described in Platt's paper: `take_step` and `examine_example`.
* The main routine of Platt's pseudo-code is replaced by `fit`, which is still a stub in the class below.

In [23]:
class SVM_classifier:
    """Soft-margin SVM classifier based on Platt's SMO algorithm (work in progress).

    The decision function follows Platt's convention ``u(x) = sum_j alpha_j * y_j * K(x_j, x) - b``
    (for the linear kernel this is ``w . x - b``). Labels are expected to be +1 / -1.

    Only ``take_step`` (the joint optimisation of two multipliers) is implemented;
    ``examine_example`` and ``fit`` are still placeholders that do nothing.

    Attributes:
        X: training samples, indexed as ``X[i, :]`` (shape ``(m, n)``).
        y: training labels, one per row of ``X``.
        kernel: kernel name, ``'linear'`` or ``'rbf'``.
        kernel_func: bound method implementing the selected kernel.
        C: upper bound of the box constraint ``0 <= alpha_i <= C``.
        epsilon: numerical threshold used to decide whether a step made progress.
        tol: KKT tolerance (stored, not used by ``take_step``).
        max_iter: iteration limit (stored, not used by ``take_step``).
        m, n: number of samples and number of features.
        alphas: Lagrange multipliers, initialised to zero.
        Error_cache: cached errors ``u(x_i) - y_i``; only meaningful for non-bound
            multipliers (``0 < alpha_i < C``).
        w: weight vector, maintained only for the linear kernel.
        b: threshold of the decision function.
    """

    def __init__(self, X, y, kernel:str ='linear', C:float =1, epsilon:float = 1e-8, tol:float = 0.001, max_iter:int= 500):
        self.X = X
        self.y = y
        self.kernel = kernel
        self.kernel_func = self.select_kernel(self.kernel)
        self.C = C
        self.epsilon = epsilon # error margin
        self.tol = tol # tolerance for KKT
        self.max_iter = max_iter
        self.m, self.n = np.shape(self.X) # m is number of samples, n number of features

        self.alphas = np.zeros(self.m)
        # Valid only for non-bound multipliers; errors of bound examples are
        # recomputed on demand in _error().
        self.Error_cache = np.zeros(self.m)

        # For the linear kernel a single weight vector is enough to evaluate the model.
        self.w = np.zeros(self.n)
        self.b = 0 # threshold

    def select_kernel(self, kernel:str):
        """Return the kernel method matching ``kernel``.

        Only the linear and the Gaussian (``'rbf'``) kernels are available.

        Raises:
            ValueError: if ``kernel`` is neither ``'linear'`` nor ``'rbf'``.
        """
        if kernel == 'linear':
            return self.linear_kernel
        elif kernel == 'rbf':
            return self.rbf_kernel
        else:
            raise ValueError(f"Unsupported kernel type: {kernel}")

    def linear_kernel(self, x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
        """Linear kernel: the dot product ``x1 . x2^T``."""
        return np.dot(x1, x2.T)

    def rbf_kernel(self, x1: np.ndarray, x2: np.ndarray) -> np.ndarray:
        """Gaussian kernel ``exp(-gamma * ||a - b||^2)`` for every pair of rows.

        1-D inputs are treated as a single row. ``gamma`` is ``1 / n_features`` so the
        value for a given pair of points does not depend on how many rows are passed.

        Returns:
            Array of shape ``(rows of x1, rows of x2)``.
        """
        if x1.ndim == 1:
            x1 = x1.reshape(1, -1)
        if x2.ndim == 1:
            x2 = x2.reshape(1, -1)

        # Squared Euclidean distance between each pair of rows
        sq_dists = np.sum(x1**2, axis=1).reshape(-1, 1) + np.sum(x2**2, axis=1) - 2 * np.dot(x1, x2.T)
        # Rounding can make the distance of (almost) identical points slightly negative
        sq_dists = np.maximum(sq_dists, 0.0)

        # Scale by the number of features (columns), not by the total element count
        gamma = 1.0 / x1.shape[1]
        return np.exp(-gamma * sq_dists)

    def _kernel_value(self, xa: np.ndarray, xb: np.ndarray) -> float:
        """Kernel between two single samples as a plain float (rbf returns a 1x1 array)."""
        return np.asarray(self.kernel_func(xa, xb)).item()

    def _output(self, i: int) -> float:
        """Current decision value ``u(x_i)`` of training sample ``i``."""
        x = self.X[i, :]
        if self.kernel == 'linear':
            return float(np.dot(self.w, x)) - self.b
        support = np.flatnonzero(self.alphas > 0)
        if support.size == 0:
            return -self.b
        k = np.ravel(self.kernel_func(self.X[support, :], x))
        coef = self.alphas[support] * np.asarray(self.y)[support]
        return float(np.dot(coef, k)) - self.b

    def _error(self, i: int) -> float:
        """Error ``u(x_i) - y_i``: cached for non-bound multipliers, recomputed otherwise."""
        if 0 < self.alphas[i] < self.C:
            return self.Error_cache[i]
        return self._output(i) - self.y[i]

    def take_step(self, i1:int=None, i2:int=None) -> int:
        """Jointly optimise the multipliers of samples ``i1`` and ``i2`` (Platt's takeStep).

        On success the method updates ``alphas``, ``b``, ``Error_cache`` and, for the
        linear kernel, ``w``.

        Returns:
            1 if the multipliers were changed, 0 if no progress was possible.
        """
        if i1==i2:
            return 0

        a1 = self.alphas[i1]
        a2 = self.alphas[i2]

        x1 = self.X[i1,:]
        x2 = self.X[i2,:]

        y1 = self.y[i1]
        y2 = self.y[i2]

        E1 = self._error(i1)
        E2 = self._error(i2)

        s = y1 * y2

        # Ends of the feasible segment for a2, equations (13) and (14) from Platt.
        # Builtin max/min are required: np.max(a, b) would interpret b as an axis.
        if y1!=y2:
            L = max(0.0, a2-a1)
            H = min(self.C, self.C+a2-a1)
        else:
            L = max(0.0, a2+a1-self.C)
            H = min(self.C, a2+a1)

        if L==H:
            return 0

        k11 = self._kernel_value(x1, x1)
        k22 = self._kernel_value(x2, x2)
        k12 = self._kernel_value(x1, x2)

        # Second derivative of the objective function along the constraint line
        eta = k11 + k22 - 2.0*k12

        if eta > 0:
            # Normal case: unconstrained optimum, then clip to [L, H] (equations (16)-(17))
            a2_new = a2 + y2*(E1-E2)/eta
            a2_new = min(max(a2_new, L), H)
        else:
            # Degenerate case: evaluate the objective at both ends of the segment (equation (19))
            f1 = y1*(E1 + self.b) - a1*k11 - s*a2*k12
            f2 = y2*(E2 + self.b) - s*a1*k12 - a2*k22
            L1 = a1 + s*(a2 - L)
            H1 = a1 + s*(a2 - H)
            psi_L = L1*f1 + L*f2 + 0.5*L1*L1*k11 + 0.5*L*L*k22 + s*L*L1*k12
            psi_H = H1*f1 + H*f2 + 0.5*H1*H1*k11 + 0.5*H*H*k22 + s*H*H1*k12

            if psi_L < (psi_H - self.epsilon):
                a2_new = L
            elif psi_L > (psi_H + self.epsilon):
                a2_new = H
            else:
                a2_new = a2

        # Give up when the change is numerically insignificant
        if abs(a2_new - a2) < (self.epsilon * (a2_new + a2 + self.epsilon)):
            return 0

        # Equation (18): a1 moves so that the equality constraint stays satisfied
        a1_new = a1 + s*(a2 - a2_new)

        # Signed changes of the two multipliers, reused in every update below
        delta1 = y1*(a1_new - a1)
        delta2 = y2*(a2_new - a2)

        # Threshold candidates, equations (20) and (21)
        b1 = self.b + E1 + delta1*k11 + delta2*k12
        b2 = self.b + E2 + delta1*k12 + delta2*k22

        if 0 < a1_new < self.C:
            b_new = b1
        elif 0 < a2_new < self.C:
            b_new = b2
        else:
            b_new = 0.5*(b1 + b2)

        # Weight vector update for the linear kernel, equation (22)
        if self.kernel == 'linear':
            self.w = self.w + delta1*x1 + delta2*x2

        # Store the new multipliers first so the non-bound set reflects this step
        self.alphas[i1] = a1_new
        self.alphas[i2] = a2_new

        # Incrementally update the cached errors of the other non-bound examples.
        # Because u = ... - b, a change of the threshold enters as (b_old - b_new).
        non_bound = np.flatnonzero((self.alphas > 0) & (self.alphas < self.C))
        for i in non_bound:
            if i == i1 or i == i2:
                continue
            xi = self.X[i,:]
            self.Error_cache[i] += (delta1*self._kernel_value(x1, xi)
                                    + delta2*self._kernel_value(x2, xi)
                                    + (self.b - b_new))

        # A jointly optimised example that ends up non-bound has zero error
        self.Error_cache[i1] = 0.0
        self.Error_cache[i2] = 0.0

        self.b = b_new

        return 1

    def examine_example(self, i2:int=None):
        """Placeholder for Platt's examineExample heuristic; currently does nothing."""
        pass

    def fit(self):
        """Placeholder for the SMO main loop; currently does nothing and returns None."""
        pass

In [18]:
# Sanity check: number of features the fitted scikit-learn baseline reports
svm.n_features_in_

5000

In [24]:
# Build the custom classifier with its default hyper-parameters.
# NOTE(review): SVM_classifier.fit is still a placeholder, so this call trains nothing yet.
model = SVM_classifier(X_train, y_train)
model.fit()

In [25]:
# Scratch check: product of two labels, i.e. the quantity `s = y1 * y2` used in take_step
y_train[90] * y_train[60]

np.int64(-1)

In [26]:
# Scratch check: inspect a single training row, indexed the same way take_step reads samples
X_train[50,:]

array([-1., -1., -1., ..., -1., -1., -1.])