<a href="https://colab.research.google.com/github/eftychiav/pattern_recognition/blob/main/wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import zipfile

path_to_zip_file = 'wine.zip'
directory_to_extract_to = 'wine'
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

data_array = np.genfromtxt("wine/wine.data", delimiter=",", dtype=float)
data_c1 = data_array[data_array[:, 0] == 1]
data_c2 = data_array[data_array[:, 0] == 2]
data_c3 = data_array[data_array[:, 0] == 3]


In [10]:
!pip install libsvm-official
from libsvm.svmutil import svm_train, svm_predict, svm_problem, svm_parameter

data_c2_c3 = np.vstack((data_c2, data_c3))

X = data_c2_c3[:, 1:6]  # Features
y = data_c2_c3[:, 0]    # Labels (class 2 and class 3)

# Separate the data by class
class2_data = data_c2_c3[y == 2]
class3_data = data_c2_c3[y == 3]

# Shuffle each class's data
np.random.seed(9)
np.random.shuffle(class2_data)
np.random.shuffle(class3_data)

# Calculate the split indices for each class
train_size = 0.5
val_size = 0.25
test_size = 0.25

def split_indices(data, train_size, val_size):
    train_end = int(train_size * len(data))
    val_end = train_end + int(val_size * len(data))
    train, val, test = data[:train_end], data[train_end:val_end], data[val_end:]
    return train, val, test

# Split each class separately
class2_train, class2_val, class2_test = split_indices(class2_data, train_size, val_size)
class3_train, class3_val, class3_test = split_indices(class3_data, train_size, val_size)

# Combine the splits from both classes
train_data = np.vstack((class2_train, class3_train))
val_data = np.vstack((class2_val, class3_val))
test_data = np.vstack((class2_test, class3_test))

# Shuffle each combined set to mix class 2 and class 3
np.random.shuffle(train_data)
np.random.shuffle(val_data)
np.random.shuffle(test_data)

# Separate features and labels for each set
X_train, y_train = train_data[:, 1:6], train_data[:, 0]
X_val, y_val = val_data[:, 1:6], val_data[:, 0]
X_test, y_test = test_data[:, 1:6], test_data[:, 0]



In [11]:
# Define a range of C values to test
C_values = [0.01, 0.1, 1, 10, 100]
best_C = None
best_val_accuracy = 0

# Function to find the best C value
def find_best_C(C_values, X, y):
    best_C = None
    best_val_accuracy = 0

    for C in C_values:
        # Create and train the SVM model with the current C
        param = svm_parameter(f'-t 0 -c {C}')  # Linear kernel (-t 0), set C
        problem = svm_problem(y_train.tolist(), X_train.tolist())
        model = svm_train(problem, param)

        # Evaluate on the validation set
        _, val_accuracy, _ = svm_predict(y_val.tolist(), X_val.tolist(), model)

        # Update the best C if the current model performs better
        if val_accuracy[0] > best_val_accuracy:
            best_val_accuracy = val_accuracy[0]
            best_C = C

    return best_C

# Find the best C value using the defined function
best_C = find_best_C(C_values, X, y)

# Train final model on the entire dataset with best C
final_param = svm_parameter(f'-t 0 -c {best_C}')
final_model = svm_train(svm_problem(y.tolist(), X.tolist()), final_param)

# Evaluate on the same data for demonstration
_, test_accuracy, _ = svm_predict(y_test.tolist(), X_test.tolist(), final_model)

# Calculate the classification error on the entire dataset
test_error = 1 - (test_accuracy[0] / 100)

# Since we're not splitting into different test sets, we keep the error as a single value
print(f"Best C: {best_C}, Classification Error on the entire dataset: {test_error:.2f}")

# test_accuracy contains a list where test_accuracy[0]
# is the percentage of correctly classified samples, calculated as (correctly classified samples / total samples) * 100.

Accuracy = 75.8621% (22/29) (classification)
Accuracy = 75.8621% (22/29) (classification)
Accuracy = 72.4138% (21/29) (classification)
Accuracy = 68.9655% (20/29) (classification)
Accuracy = 68.9655% (20/29) (classification)
Accuracy = 80.6452% (25/31) (classification)
Best C: 0.01, Classification Error on the entire dataset: 0.19


In [14]:
splits = 5
test_errors = []

for i in range(splits):
  np.random.seed(i+4)
  np.random.shuffle(class2_data)
  np.random.shuffle(class3_data)

  class2_train, class2_val, class2_test = split_indices(class2_data, train_size, val_size)
  class3_train, class3_val, class3_test = split_indices(class3_data, train_size, val_size)
  train_data = np.vstack((class2_train, class3_train))

  val_data = np.vstack((class2_val, class3_val))
  test_data = np.vstack((class2_test, class3_test))

  np.random.shuffle(train_data)
  np.random.shuffle(val_data)
  np.random.shuffle(test_data)

  X_train, y_train = train_data[:, 1:6], train_data[:, 0]
  X_val, y_val = val_data[:, 1:6], val_data[:, 0]
  X_test, y_test = test_data[:, 1:6], test_data[:, 0]

  best_C = find_best_C(C_values, X, y)

  final_param = svm_parameter(f'-t 0 -c {best_C}')
  final_model = svm_train(svm_problem(y.tolist(), X.tolist()), final_param)

  _, test_accuracy, _ = svm_predict(y_test.tolist(), X_test.tolist(), final_model)

  test_error = 1 - (test_accuracy[0] / 100)
  test_errors.append(test_error)

  print(f"Split {i + 1}: Best C: {best_C}, Test Classification Error: {test_error:.2f}")

mean_test_error = np.mean(test_errors)
std_test_error = np.std(test_errors)
print(f"Mean Test Classification Error: {mean_test_error:.2f}")
print(f"Standard Deviation of Test Classification Error: {std_test_error:.2f}")

Accuracy = 68.9655% (20/29) (classification)
Accuracy = 93.1034% (27/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 83.871% (26/31) (classification)
Split 1: Best C: 0.1, Test Classification Error: 0.16
Accuracy = 65.5172% (19/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 83.871% (26/31) (classification)
Split 2: Best C: 0.1, Test Classification Error: 0.16
Accuracy = 65.5172% (19/29) (classification)
Accuracy = 75.8621% (22/29) (classification)
Accuracy = 79.3103% (23/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 86.2069% (25/29) (classification)
Accuracy = 87.0968% (27/31) (classification)
Split 3: Best C: 10, Test Classification Error: 0.13
Accuracy = 65.5172% (19/29) (cl

In [15]:
def kernel_test_splits(splits, kernel_type, C_values):
  for i in range(splits):
    test_errors = []
    np.random.seed(i+4)
    np.random.shuffle(class2_data)
    np.random.shuffle(class3_data)

    class2_train, class2_val, class2_test = split_indices(class2_data, train_size, val_size)
    class3_train, class3_val, class3_test = split_indices(class3_data, train_size, val_size)
    train_data = np.vstack((class2_train, class3_train))

    val_data = np.vstack((class2_val, class3_val))
    test_data = np.vstack((class2_test, class3_test))

    np.random.shuffle(train_data)
    np.random.shuffle(val_data)
    np.random.shuffle(test_data)

    X_train, y_train = train_data[:, 1:6], train_data[:, 0]
    X_val, y_val = val_data[:, 1:6], val_data[:, 0]
    X_test, y_test = test_data[:, 1:6], test_data[:, 0]

    best_C = find_best_C(C_values, X, y)

    if kernel_type == 'linear':
      final_param = svm_parameter(f'-t 0 -c {best_C}')
    elif kernel_type == 'polynomial':
      final_param = svm_parameter(f'-t 1 -c {best_C}')
    elif kernel_type == 'rbf':
      final_param = svm_parameter(f'-t 2 -c {best_C}')
    elif kernel_type == 'sigmoid':
      final_param = svm_parameter(f'-t 3 -c {best_C}')

    final_model = svm_train(svm_problem(y.tolist(), X.tolist()), final_param)

    _, test_accuracy, _ = svm_predict(y_test.tolist(), X_test.tolist(), final_model)

    test_error = 1 - (test_accuracy[0] / 100)
    test_errors.append(test_error)

    print(f"Split {i + 1}: Best C: {best_C}, Test Classification Error: {test_error:.2f}")

  mean_test_error = np.mean(test_errors)
  std_test_error = np.std(test_errors)
  print(f"Mean Test Classification Error: {mean_test_error:.2f}")
  print(f"Standard Deviation of Test Classification Error: {std_test_error:.2f}")
  return mean_test_error, std_test_error, best_C



In [16]:
splits = 5

mte_0, ste_0, best_C_0 = kernel_test_splits(splits, 'linear', C_values)
mte_1, ste_1, best_C_1 = kernel_test_splits(splits, 'polynomial', C_values)
mte_2, ste_2, best_C_2 = kernel_test_splits(splits, 'rbf', C_values)
mte_3, ste_3, best_C_3 = kernel_test_splits(splits, 'sigmoid', C_values)


Accuracy = 79.3103% (23/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 83.871% (26/31) (classification)
Split 1: Best C: 0.1, Test Classification Error: 0.16
Accuracy = 79.3103% (23/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 87.0968% (27/31) (classification)
Split 2: Best C: 0.1, Test Classification Error: 0.13
Accuracy = 79.3103% (23/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 89.6552% (26/29) (classification)
Accuracy = 80.6452% (25/31) (classification)
Split 3: Best C: 0.1, Test Classification Error: 0.19
Accuracy = 79.3103% (23/29) (