In [None]:
import numpy as np
def load_data(data_file, train_idx_file, test_idx_file):
    """
    Load a LIBSVM-style text file and split it into train and test sets.

    Every row of ``data_file`` must contain the same number of
    whitespace-separated tokens: a label followed by feature tokens that are
    either plain numbers or ``index:value`` pairs. Only the part after the
    colon is kept, so features are taken positionally (the ``index`` prefix
    is ignored).

    Args:
        data_file: path to the data file.
        train_idx_file: path to a comma-separated file of integer row
            indices selecting the training rows.
        test_idx_file: same, for the test rows.

    Returns:
        X_train, Y_train, X_test, Y_test
        Column 0 of the parsed table is the label (returned as stored, no
        remapping); the remaining columns are the features.
    """

    def conv(item):
        # Depending on the NumPy version / encoding setting, converters may be
        # handed bytes instead of str.
        if isinstance(item, bytes):
            item = item.decode("latin1")
        _, sep, value = item.partition(":")
        return float(value if sep else item)

    # Load data. A single callable converter is applied to every column;
    # ndmin=2 keeps a one-row file two-dimensional so the slicing below works.
    data = np.loadtxt(data_file, converters=conv, ndmin=2)
    print(f"\n Data shape: {data.shape}")

    # Load index (used directly as row positions into `data`)
    train_idx = np.loadtxt(train_idx_file, dtype="int", delimiter=",")
    test_idx = np.loadtxt(test_idx_file, dtype="int", delimiter=",")

    X_train, Y_train = data[train_idx, 1:], data[train_idx, 0]
    X_test, Y_test = data[test_idx, 1:], data[test_idx, 0]

    print(f"\nX_train shape: {X_train.shape}, Y_train shape: {Y_train.shape}")
    print(f"X_test shape: {X_test.shape}, Y_test shape: {Y_test.shape}\n")
    return X_train, Y_train, X_test, Y_test


In [62]:
# Breast-cancer dataset: the data file plus its train/test row-index files.
cancer_file = "hw4_data/breast-cancer_scale"
cancer_train_indices_file = "hw4_data/breast-cancer_train_indices.txt"
cancer_test_indices_file = "hw4_data/breast-cancer_test_indices.txt"

# Binds the notebook-level X_train / Y_train / X_test / Y_test names.
X_train, Y_train, X_test, Y_test = load_data(
    data_file=cancer_file,
    train_idx_file=cancer_train_indices_file,
    test_idx_file=cancer_test_indices_file,
)


 Data shape: (683, 11)

X_train shape: (546, 10), Y_train shape: (546,)
X_test shape: (137, 10), Y_train shape: (137,)



In [57]:
# Parse the sonar file by hand: each non-empty line is a label followed by
# "index:value" feature tokens.
data_file = 'hw4_data/sonar_scale'
print(f"\nLoading: {data_file}")

X_list = []      # one {column: value} dict per sample
y_list = []      # integer label per sample
max_feature = 0  # largest column position seen so far

with open(data_file, 'r') as f:
    for raw_line in f:
        tokens = raw_line.strip().split()
        if len(tokens) == 0:
            # Skip blank lines.
            continue

        label_token, *feature_tokens = tokens
        y_list.append(int(float(label_token)))

        row = {}
        for token in feature_tokens:
            key, value = token.split(':')
            # Feature indices in the file are shifted down by one to become
            # column positions.
            col = int(key) - 1
            row[col] = float(value)
            if col > max_feature:
                max_feature = col
        X_list.append(row)

# Convert to arrays; features absent from a line stay at 0.0.
n_samples = len(y_list)
n_features = max_feature + 1
X = np.zeros((n_samples, n_features))
for row_no, row in enumerate(X_list):
    for col, value in row.items():
        X[row_no, col] = value
y = np.array(y_list)


Loading: hw4_data/sonar_scale


In [58]:
# Load train/test indices (used directly as row positions into X and y)
train_idx = np.loadtxt('hw4_data/sonar_train_indices.txt', dtype=int, delimiter=',')
test_idx = np.loadtxt('hw4_data/sonar_test_indices.txt', dtype=int, delimiter=',')

X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
y_test = y[test_idx]

# This cell rebinds X_train / X_test, which other cells in the notebook pair
# with the upper-case Y_train / Y_test names. Keep those in sync so features
# and labels always come from the same dataset, whatever order cells are run in.
Y_train, Y_test = y_train, y_test


In [53]:
# Paths for the sonar dataset and its train/test row-index files.
# NOTE(review): the sonar-loading cells visible in this notebook spell the same
# paths out as string literals instead of using these names — confirm whether
# these variables are still needed.
sonar_file = "hw4_data/sonar_scale"
sonar_train_indices_file = "hw4_data/sonar_train_indices.txt"
sonar_test_indices_file = "hw4_data/sonar_test_indices.txt"

In [63]:
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import KFold
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# Grid of candidate C values handed to the k-fold cross-validation.
Cs = [0.1, 1, 10, 100, 1000]

# Kernel names: "poly": polynomial, "rbf": RBF
class SupportVectorClassifier:
    """
    Thin wrapper around scikit-learn SVM classifiers with k-fold selection of C.

    ``kernel="linear"`` builds a ``LinearSVC`` configured with ``loss`` and
    ``penalty``; any other kernel name is passed to ``SVC`` unchanged, in
    which case ``loss`` and ``penalty`` are not used.
    """

    def __init__(self, loss: str = "hinge",
                 penalty: str = "l2",
                 kernel: str = "linear"):

        """
        Will use LinearSVC if kernel = "linear"
        """
        self.loss = loss
        self.penalty = penalty
        self.kernel = kernel


    def create_model(self, C):
        """Return a new, unfitted classifier that uses the given value of C."""
        if self.kernel == "linear":
            return LinearSVC(penalty=self.penalty, loss=self.loss, C=C, random_state=42)
        return SVC(kernel=self.kernel, C=C)


    def kfold_cross_validation(self, X_train, Y_train, k, Cs):
        """
        Estimate training and validation error for every candidate C.

        Args:
            X_train: 2-D feature array, one row per sample.
            Y_train: 1-D label array with one entry per row of X_train.
            k: number of folds.
            Cs: non-empty sequence of candidate C values.

        Returns:
            training_err, validation_err, best_C
            The two lists hold the mean error over the k folds, one entry per
            candidate in the order given. best_C is the candidate with the
            lowest mean validation error (the first one on ties).

        Raises:
            ValueError: if Cs is empty or X_train and Y_train differ in length.
        """
        candidates = list(Cs)
        if len(candidates) == 0:
            raise ValueError("Cs must contain at least one candidate value")
        # The folds index X_train and Y_train with the same positions, so a
        # length mismatch would silently train on misaligned labels.
        if len(X_train) != len(Y_train):
            raise ValueError(
                f"X_train has {len(X_train)} samples but Y_train has {len(Y_train)}"
            )

        kfold = KFold(n_splits=k, shuffle=True, random_state=42)
        validation_err = []
        training_err = []

        for C in candidates:
            fold_train_errors = []
            fold_val_errors = []

            for train_idx, val_idx in kfold.split(X_train):
                X_fit, Y_fit = X_train[train_idx, :], Y_train[train_idx]
                X_val, Y_val = X_train[val_idx, :], Y_train[val_idx]

                model = self.train(X_fit, Y_fit, C)

                # Both errors go through eval() so they are computed the same way.
                fold_train_errors.append(self.eval(model, X_fit, Y_fit))
                fold_val_errors.append(self.eval(model, X_val, Y_val))

            validation_err.append(np.mean(fold_val_errors))
            training_err.append(np.mean(fold_train_errors))

        best_C = candidates[int(np.argmin(validation_err))]

        return training_err, validation_err, best_C


    def train(self, X_train, Y_train, C):
        """Create a model for the given C, fit it on the data and return it."""
        classifier = self.create_model(C)
        classifier.fit(X_train, Y_train)
        return classifier


    def eval(self, model, X_test, Y_test):
        """Return the error rate of a fitted model, i.e. 1 - model.score(X_test, Y_test)."""
        error_rate = 1 - model.score(X_test, Y_test)
        return error_rate


In [17]:
import pandas as pd

In [66]:
# 5-fold cross-validation of a linear SVM over the C grid, using whatever
# X_train / Y_train the most recently executed loading cell bound.
svc = SupportVectorClassifier(kernel="linear")
train_err, val_err, best_C = svc.kfold_cross_validation(X_train, Y_train, 5, Cs)
print(train_err)

# One row per candidate C with its mean training / validation error.
df = pd.DataFrame({"C": Cs,
                   "Train error": train_err,
                   "Val error": val_err})

print("5-fold cross-validation:")
print(df)

[np.float64(0.0279260176768207), np.float64(0.026096403753700193), np.float64(0.026097453446140294), np.float64(0.025182121638359978), np.float64(0.03479520500493356)]
5-fold cross-validation:
        C  Train error  Val error
0     0.1     0.027926   0.034746
1     1.0     0.026096   0.032927
2    10.0     0.026097   0.038399
3   100.0     0.025182   0.040250
4  1000.0     0.034795   0.045738




In [None]:
# Paths for the covtype dataset and its train/test row-index files.
covtype_file = "hw4_data/covtype.data"
covtype_train_indices_file = "hw4_data/covtype_train_indices.txt"
covtype_test_indices_file = "hw4_data/covtype_test_indices.txt"

def load_covtype(filename, file_indices_train, file_indices_test):
    """
    Read a comma-separated numeric table and split it into train/test sets.

    The last column of the table is the label; every other column is a
    feature. The two index files hold comma-separated integers that are used
    directly as row positions into the table.

    Returns:
        X_train, Y_train, X_test, Y_test
    """
    table = np.loadtxt(filename, delimiter=",")
    X, Y = table[:, :-1], table[:, -1]

    idx_train = np.loadtxt(file_indices_train, dtype="int", delimiter=",")
    idx_test = np.loadtxt(file_indices_test, dtype="int", delimiter=",")

    train_split = (X[idx_train, :], Y[idx_train])
    test_split = (X[idx_test, :], Y[idx_test])

    # Report what was read; the shapes are those of the full table, not the splits.
    print("Load files:")
    print(f"\t{filename}\n\t{file_indices_train}\n\t{file_indices_test}")
    print(f"X shape: {X.shape}")
    print(f"Y shape: {Y.shape}")

    return train_split[0], train_split[1], test_split[0], test_split[1]

# Rebinds the notebook-level X_train / Y_train / X_test / Y_test to the covtype split.
X_train, Y_train, X_test, Y_test = load_covtype(
    filename=covtype_file,
    file_indices_train=covtype_train_indices_file,
    file_indices_test=covtype_test_indices_file,
)


Load file:
	hw4_data/covtype.data
	hw4_data/covtype_train_indices.txt
	hw4_data/covtype_test_indices.txt
X shape: (581012, 54)
Y shape: (581012,)


In [None]:
from sklearn import preprocessing

def data_preprocessing(X, method:str):
    """
    Return a transformed copy of X using the named scikit-learn preprocessing.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        method: one of
            "normalization"   - scale each row to unit L2 norm,
            "rescale"         - MinMaxScaler with its default feature range,
            "standardization" - StandardScaler.

    Returns:
        The transformed array, or None (after printing a message) when
        ``method`` is not one of the names above.

    Note:
        The scalers are fitted on the X passed in, so calling this separately
        on train and test data scales each with its own statistics.
    """
    if method == "normalization":
        return preprocessing.normalize(X, norm="l2", axis=1)

    if method == "rescale":
        # MinMaxScaler's first constructor argument is feature_range, not the
        # data; the data only goes to fit_transform.
        minmax_scaler = preprocessing.MinMaxScaler()  # default range (0, 1)
        return minmax_scaler.fit_transform(X)

    if method == "standardization":
        standard_scaler = preprocessing.StandardScaler()
        return standard_scaler.fit_transform(X=X)

    print("No available data preprocessing method!")
    return None

In [46]:
# Sanity check of preprocessing.normalize on a tiny matrix.
a = np.array([[1, 2], [2, 3], [3, 4]])
a = preprocessing.normalize(a)

# Inner product of the first row with itself (its squared length).
a[0] @ a[0]

np.float64(0.9999999999999999)