In [5]:
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn import metrics

In [6]:
# Constants

cancer_file = "hw4_data/breast-cancer_scale"
cancer_train_indices_file = "hw4_data/breast-cancer_train_indices.txt"
cancer_test_indices_file = "hw4_data/breast-cancer_test_indices.txt"


sonar_file = "hw4_data/sonar_scale"
sonar_train_indices_file = "hw4_data/sonar_train_indices.txt"
sonar_test_indices_file = "hw4_data/sonar_test_indices.txt"


covtype_file = "hw4_data/covtype.data"
covtype_train_indices_file = "hw4_data/covtype_train_indices.txt"
covtype_test_indices_file = "hw4_data/covtype_test_indices.txt"


k = 5
Cs = [0.1, 1, 10, 100, 1000]
loss = "hinge"
penalty = "l2"
max_iter=2000

### Load data

In [7]:
def load_data(data_file, train_idx_file, test_idx_file):
    """
    Load train and test data from data_file
    Returns:
        X_train, Y_train, X_test, Y_test
    """

    # Load data
    def conv(item):
        return item.split(":")[1] if ":" in item else item

    data = np.loadtxt(data_file, converters=conv)
    print(f"\n Data shape: {data.shape}")
    
    # Load index
    train_idx = np.loadtxt(train_idx_file, dtype="int", delimiter=",")
    test_idx = np.loadtxt(test_idx_file, dtype="int", delimiter=",")
    
    X_train, Y_train = data[train_idx, 1:], data[train_idx, 0]
    X_test, Y_test = data[test_idx, 1:], data[test_idx, 0]
    
    print(f"\nX_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
    print(f"X_test shape: {X_test.shape}, Y_train shape: {Y_test.shape}\n")
    return X_train, Y_train, X_test, Y_test

In [8]:
# Load sonar file

def load_sonar_file():
    data_file = 'hw4_data/sonar_scale'
    print(f"\nLoading: {data_file}")

    X_list = []
    y_list = []
    max_feature = 0

    with open(data_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if not parts:
                continue
            
            y_list.append(int(float(parts[0])))
            
            features = {}
            for item in parts[1:]:
                idx, val = item.split(':')
                idx = int(idx) - 1
                features[idx] = float(val)
                max_feature = max(max_feature, idx)
            X_list.append(features)

    # Convert to arrays
    n_samples = len(y_list)
    n_features = max_feature + 1
    X = np.zeros((n_samples, n_features))
    for i, features in enumerate(X_list):
        for idx, val in features.items():
            X[i, idx] = val
    y = np.array(y_list)

    train_idx = np.loadtxt('hw4_data/sonar_train_indices.txt', dtype=int, delimiter=',')
    test_idx = np.loadtxt('hw4_data/sonar_test_indices.txt', dtype=int, delimiter=',')

    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]

    return X_train, y_train, X_test, y_test



In [9]:
def load_covtype(filename, file_indices_train, file_indices_test):
    data = np.loadtxt(filename, delimiter=",")
    X = data[:, :-1]
    Y =data[:, -1]

    train_indices = np.loadtxt(file_indices_train, dtype="int", delimiter=",")
    test_indices = np.loadtxt(file_indices_test, dtype="int", delimiter=",")

    X_train, Y_train = X[train_indices, :], Y[train_indices]
    X_test, Y_test = X[test_indices, :], Y[test_indices]

    print("Load files:")
    print(f"\t{filename}\n\t{file_indices_train}\n\t{file_indices_test}")
    print(f"X shape: {X.shape}")
    print(f"Y shape: {Y.shape}")

    return X_train, Y_train, X_test, Y_test

In [10]:
class SupportVectorClassifier:
    def __init__(self, loss: str= 'hinge',
                 penalty: str = 'l2',
                 kernel: str = 'linear'):
        
        self.loss = loss
        self.penalty = penalty
        self.kernel = kernel

    
    def create_model(self, C):
        model = None
        if self.kernel == "linear":
            model = LinearSVC(penalty=self.penalty, loss=self.loss, C=C)
        else:
            model = SVC(kernel=self.kernel, C=C)
        
        return model
    

    def kfold_cross_validation(self, X_train, Y_train, k, Cs):
        kfold = KFold(n_splits=k, shuffle=True, random_state=42)
        val_error = []
        train_error = []

        for C in Cs:
            v_errors = []
            t_errors = []

            for train_idx, val_idx in kfold.split(X_train):
                X_train_split = X_train[train_idx, :]
                X_val_split = X_train[val_idx, :]
                Y_train_split = Y_train[train_idx]
                Y_val_split = Y_train[val_idx]

                classifier = self.create_model(C)

                classifier.fit(X_train_split, Y_train_split)

                t_errors.append(1 - classifier.score(X_train_split, Y_train_split))
                v_errors.append(1 - classifier.score(X_val_split, Y_val_split))

            train_error.append(np.mean(t_errors))
            val_error.append(np.mean(v_errors))
        
        best_C = Cs[np.argmin(val_error)]
        return train_error, val_error, best_C
    

    def train(self, X_train, Y_train, model):
        model.fit(X_train, Y_train)
        error_rate = 1 - model.score(X_train, Y_train)
        return model, error_rate
    

    def test(self, X_test, Y_test, model):
        error_rate = 1 - model.score(X_test, Y_test)
        return error_rate

# SVM

#### Breast Cancer Data

In [11]:
kernels = ["linear", "poly", "rbf"]

In [12]:
X_train, Y_train, X_test, Y_test = load_data(cancer_file, cancer_train_indices_file, cancer_test_indices_file)

for kernel in kernels:
    model = SupportVectorClassifier(loss=loss, penalty=penalty, kernel=kernel)
    train_error, val_error, best_C = model.kfold_cross_validation(X_train, Y_train, k, Cs)

    df = pd.DataFrame({"C": Cs,
                    "Train error": train_error,
                    "Val error": val_error})

    print(f"Breast Cancer Data with SVC kernel = {kernel}")
    print(df)
    print(f"Best C is {best_C}\n")


 Data shape: (683, 11)

X_train shape: (546, 10), Y_train shape: (546,)
X_test shape: (137, 10), Y_train shape: (137,)





Breast Cancer Data with SVC kernel = linear
        C  Train error  Val error
0     0.1     0.027926   0.034746
1     1.0     0.026096   0.032927
2    10.0     0.025640   0.040234
3   100.0     0.028386   0.043903
4  1000.0     0.064105   0.065838
Best C is 1

Breast Cancer Data with SVC kernel = poly
        C  Train error  Val error
0     0.1     0.027472   0.029291
1     1.0     0.021062   0.036614
2    10.0     0.007783   0.058532
3   100.0     0.000000   0.076864
4  1000.0     0.000000   0.076864
Best C is 0.1

Breast Cancer Data with SVC kernel = rbf
        C  Train error  Val error
0     0.1     0.029303   0.029291
1     1.0     0.021976   0.034762
2    10.0     0.010073   0.043920
3   100.0     0.000000   0.067740
4  1000.0     0.000000   0.067740
Best C is 0.1



##### Eval with best C

In [13]:
for kernel in kernels:
    svc = SupportVectorClassifier(loss=loss, penalty=penalty, kernel=kernel)
    model = svc.create_model(best_C)
    model, train_err = svc.train(X_train, Y_train, model)

    error_test_rate = svc.test(X_test, Y_test, model)
    print(f"Data Breast Cancer with kernel SVC = {kernel}")
    print(f"Best C: {best_C}")
    print(f"Train error: {train_err}")
    print(f"Test error rate: {error_test_rate}\n")

Data Breast Cancer with kernel SVC = linear
Best C: 0.1
Train error: 0.02930402930402931
Test error rate: 0.058394160583941646

Data Breast Cancer with kernel SVC = poly
Best C: 0.1
Train error: 0.027472527472527486
Test error rate: 0.021897810218978075

Data Breast Cancer with kernel SVC = rbf
Best C: 0.1
Train error: 0.02930402930402931
Test error rate: 0.03649635036496346



### Sonar data

In [14]:
X_train, Y_train, X_test, Y_test = load_sonar_file()

for kernel in kernels:
    model = SupportVectorClassifier(loss=loss, penalty=penalty, kernel=kernel)
    train_error, val_error, best_C = model.kfold_cross_validation(X_train, Y_train, k, Cs)

    df = pd.DataFrame({"C": Cs,
                    "Train error": train_error,
                    "Val error": val_error})

    print(f"Sonar Data with SVC kernel {kernel}")
    print(df)
    print(f"Best C is {best_C}\n")


Loading: hw4_data/sonar_scale




Sonar Data with SVC kernel linear
        C  Train error  Val error
0     0.1     0.144555   0.301070
1     1.0     0.076828   0.300891
2    10.0     0.018079   0.270945
3   100.0     0.004511   0.247237
4  1000.0     0.003008   0.241176
Best C is 1000

Sonar Data with SVC kernel poly
        C  Train error  Val error
0     0.1     0.234917   0.301070
1     1.0     0.012030   0.174510
2    10.0     0.000000   0.168449
3   100.0     0.000000   0.168449
4  1000.0     0.000000   0.168449
Best C is 10

Sonar Data with SVC kernel rbf
        C  Train error  Val error
0     0.1     0.390180   0.469340
1     1.0     0.051185   0.150446
2    10.0     0.000000   0.162210
3   100.0     0.000000   0.162210
4  1000.0     0.000000   0.162210
Best C is 1



##### Eval with best C

In [15]:
for kernel in kernels:
    svc = SupportVectorClassifier(loss=loss, penalty=penalty, kernel="linear")
    model = svc.create_model(best_C)
    model, train_err = svc.train(X_train, Y_train, model)

    error_test_rate = svc.test(X_test, Y_test, model)
    print("Data Sonar Cancer")
    print(f"Best C: {best_C}")
    print(f"Train error: {train_err}")
    print(f"Test error rate: {error_test_rate}")

Data Sonar Cancer
Best C: 1
Train error: 0.10240963855421692
Test error rate: 0.2142857142857143
Data Sonar Cancer
Best C: 1
Train error: 0.10240963855421692
Test error rate: 0.2142857142857143
Data Sonar Cancer
Best C: 1
Train error: 0.10240963855421692
Test error rate: 0.2142857142857143




# Data Preprocessing

In [16]:
methods = ["rescale", "standardization", "normalization"]

In [17]:
def data_preprocessing(X, method:str):
    if method == "normalization":
        X_normalized = preprocessing.normalize(X, norm="l2", axis=1)
        return X_normalized
    
    if method == "rescale":
        minmax_scaler = preprocessing.MinMaxScaler(X) # default range (0, 1)
        X_scaled = minmax_scaler.fit_transform(X)
        return X_scaled

    if method == "standardization":
        standard_scaler = preprocessing.StandardScaler()
        X_standardized = standard_scaler.fit_transform(X=X)
        return X_standardized
    
    print("No available data preprocessing method!")

### Raw data

In [18]:
X_train, Y_train, X_test, Y_test = load_covtype(sonar_file, sonar_train_indices_file, sonar_test_indices_file)
Y_train[Y_train != 2] = -1
Y_test[Y_test != 2] = -1

linearsvc = SupportVectorClassifier(loss=loss, penalty=penalty, kernel="linear")
train_err_raw, val_err_raw, best_C_raw = linearsvc.kfold_cross_validation(X_train, Y_train, k, Cs)
df = pd.DataFrame({"C": Cs,
                    "Train error": train_err_raw,
                    "Val error": val_err_raw})
print(f"Cover Data with SVC kernel {kernel}")
print(df)
print(f"Best C is {best_C}\n")

linearsvc = SupportVectorClassifier(loss=loss, penalty=penalty, kernel="linear")
model = linearsvc.create_model(best_C)
model, train_err = linearsvc.train(X_train, Y_train, model)

error_test_rate = linearsvc.test(X_test, Y_test, model)
print("Data Cover Evaluation")
print(f"Best C: {best_C}")
print(f"Train error: {train_err}")
print(f"Test error rate: {error_test_rate}")

Y_pred = model.predict(X_test)
Y_prob = model.decisioin_function(X_test)
print(f"F1-score {metrics.f1_score(Y_test, Y_pred)}")
print(f"ROC AUC score {metrics.roc_auc_score(Y_test, Y_prob)}")


ValueError: could not convert string '1 1:-0.727139 2:-0.687098 3:-0.728647 4:-0.929149 5:-0.550089 6:-0.524859 7:-0.185065 8:-0.318192 9 to float64 at row 0, column 1.

In [None]:



X_raw = {"raw": X_test}

for method in methods:
    X_train_preprocessed = data_preprocessing(X_train, method)
    X_test_preprocessed = data_preprocessing(X_test, method)

    X["preprocessed"] = X_train_preprocessed
    linearsvc = SupportVectorClassifier(loss=loss, penalty=penalty, kernel="linear")
    _, _, best_C = linearsvc.kfold_cross_validation(X_train_preprocessed, Y_train, k, Cs)

