<a href="https://colab.research.google.com/github/eftychiav/pattern_recognition/blob/main/hw2_pr_3_wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import zipfile

# Unpack the wine archive into a local folder so the raw data file can be read.
path_to_zip_file = 'wine.zip'
directory_to_extract_to = 'wine'
with zipfile.ZipFile(path_to_zip_file, mode='r') as zip_ref:
    zip_ref.extractall(path=directory_to_extract_to)

# Parse the comma-separated file as floats; column 0 is the value the rows are
# grouped by below (1, 2 or 3).
data_array = np.genfromtxt("wine/wine.data", delimiter=",", dtype=float)
class_column = data_array[:, 0]
data_c1, data_c2, data_c3 = (data_array[class_column == k] for k in (1, 2, 3))

# Show the (rows, columns) shape of the first two groups.
for class_rows in (data_c1, data_c2):
    print(class_rows.shape)

(59, 14)
(71, 14)


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Two-class problem: stack the class-2 and class-3 rows, then use column 0 as
# the label and columns 1 through 5 as the features.
data_c2_c3 = np.vstack([data_c2, data_c3])
labels = data_c2_c3[:, 0]
data_5 = data_c2_c3[:, 1:6]
# print(data_5.shape)

# Split fractions (test_size is informational: the test set is whatever remains
# after the train and validation slices are taken).
train_size, val_size, test_size = 0.5, 0.25, 0.25

# Shuffle the row indices with a fixed seed so the split is reproducible.
indices = np.arange(data_5.shape[0])
np.random.seed(16)
np.random.shuffle(indices)

# Cut the shuffled indices at the train/validation and validation/test borders.
n_rows = len(indices)
train_end = int(train_size * n_rows)
val_end = train_end + int(val_size * n_rows)
train_indices, val_indices, test_indices = np.split(indices, [train_end, val_end])

X_train, X_val, X_test = data_5[train_indices], data_5[val_indices], data_5[test_indices]
y_train, y_val, y_test = labels[train_indices], labels[val_indices], labels[test_indices]

# Candidate regularisation strengths for the linear SVM.
C_values = [0.01, 0.1, 1, 10, 100]
best_C, best_val_accuracy = None, 0

# Fit one model per candidate C and score it on the validation set.
for C in C_values:
    model = svm.SVC(kernel='linear', C=C).fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"C: {C}, Validation Accuracy: {val_accuracy:.2f}")

    # Strict '>' keeps the earliest C when several candidates tie.
    if val_accuracy > best_val_accuracy:
        best_val_accuracy, best_C = val_accuracy, C

print(f"Best C: {best_C} with Validation Accuracy: {best_val_accuracy:.2f}")

# Refit on the training set with the selected C and score the held-out test set.
final_model = svm.SVC(kernel='linear', C=best_C).fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)

# Classification error is the complement of accuracy.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_error = 1 - test_accuracy
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Classification Error on Test Set: {test_error:.2f}")

C: 0.01, Validation Accuracy: 0.55
C: 0.1, Validation Accuracy: 0.79
C: 1, Validation Accuracy: 0.76
C: 10, Validation Accuracy: 0.76
C: 100, Validation Accuracy: 0.76
Best C: 0.1 with Validation Accuracy: 0.79
Test Accuracy: 0.97
Classification Error on Test Set: 0.03


In [None]:
# Repeat the train/validation/test experiment over several random splits and
# collect the test error of each one.
num_splits = 5
test_errors = []

for split in range(num_splits):
    # Seed with the split number so every repetition gets its own, reproducible shuffle.
    indices = np.arange(data_5.shape[0])
    np.random.seed(split)
    np.random.shuffle(indices)

    # 50% train, 25% validation, remainder test.
    n_rows = len(indices)
    train_end = int(0.5 * n_rows)
    val_end = train_end + int(0.25 * n_rows)
    train_indices, val_indices, test_indices = np.split(indices, [train_end, val_end])

    X_train, X_val, X_test = data_5[train_indices], data_5[val_indices], data_5[test_indices]
    y_train, y_val, y_test = labels[train_indices], labels[val_indices], labels[test_indices]

    # Select C on the validation set; strict '>' keeps the earliest C on ties.
    best_C, best_val_accuracy = None, 0
    for C in C_values:
        model = svm.SVC(kernel='linear', C=C).fit(X_train, y_train)
        y_val_pred = model.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_val_pred)

        if val_accuracy > best_val_accuracy:
            best_val_accuracy, best_C = val_accuracy, C

    # Refit on the training set with the selected C and score the test set.
    final_model = svm.SVC(kernel='linear', C=best_C).fit(X_train, y_train)
    y_test_pred = final_model.predict(X_test)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_error = 1 - test_accuracy
    test_errors.append(test_error)

    print(f"Split {split + 1}: Best C: {best_C}, Test Classification Error: {test_error:.2f}")

# Summarise the per-split test errors (np.std with its default settings).
mean_test_error, std_test_error = np.mean(test_errors), np.std(test_errors)

print(f"Mean Test Classification Error: {mean_test_error:.2f}")
print(f"Standard Deviation of Test Classification Error: {std_test_error:.2f}")

Split 1: Best C: 0.1, Test Classification Error: 0.23
Split 2: Best C: 10, Test Classification Error: 0.26
Split 3: Best C: 10, Test Classification Error: 0.19
Split 4: Best C: 1, Test Classification Error: 0.19
Split 5: Best C: 0.1, Test Classification Error: 0.13
Mean Test Classification Error: 0.20
Standard Deviation of Test Classification Error: 0.04


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# RBF-kernel SVM on the class-2 vs class-3 problem: tune C on a validation
# split, then report the error of the selected model on a held-out test split.

# Kernel under study. Defined once so the tuning loop and the final model
# cannot drift apart.
kernel = 'rbf'

# Stack the class-2 and class-3 rows; column 0 is the label, columns 1-5 the features.
data_c2_c3 = np.vstack((data_c2, data_c3))
data_5 = data_c2_c3[:, 1:6]
labels = data_c2_c3[:, 0]
# print(data_5.shape)

# Split fractions (test_size is informational: the test set is the remainder).
train_size = 0.5
val_size = 0.25
test_size = 0.25

np.random.seed(16)  # For reproducibility
indices = np.arange(data_5.shape[0])
np.random.shuffle(indices)

train_end = int(train_size * len(indices))
val_end = train_end + int(val_size * len(indices))

train_indices = indices[:train_end]
val_indices = indices[train_end:val_end]
test_indices = indices[val_end:]

X_train, y_train = data_5[train_indices], labels[train_indices]
X_val, y_val = data_5[val_indices], labels[val_indices]
X_test, y_test = data_5[test_indices], labels[test_indices]

# Define a range of C values to test
C_values = [0.01, 0.1, 1, 10, 100]
best_C = None
best_val_accuracy = 0

# Hyperparameter tuning for C using the validation set
for C in C_values:
    model = svm.SVC(kernel=kernel, C=C)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"C: {C}, Validation Accuracy: {val_accuracy:.2f}")

    # Update the best C if the current model performs better
    # (strict '>' keeps the earliest C when several candidates tie)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_C = C

print(f"Best C: {best_C} with Validation Accuracy: {best_val_accuracy:.2f}")

# Train final model on the training set with best C and evaluate on test set.
# The final model must use the same kernel that C was tuned for; refitting with
# kernel='linear' here would report the error of a different model than the one
# selected above.
final_model = svm.SVC(kernel=kernel, C=best_C)
final_model.fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)

# Calculate and print the classification error on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_error = 1 - test_accuracy
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Classification Error on Test Set: {test_error:.2f}")

C: 0.01, Validation Accuracy: 0.52
C: 0.1, Validation Accuracy: 0.52
C: 1, Validation Accuracy: 0.52
C: 10, Validation Accuracy: 0.52
C: 100, Validation Accuracy: 0.79
Best C: 100 with Validation Accuracy: 0.79
Test Accuracy: 0.90
Classification Error on Test Set: 0.10


In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Polynomial-kernel SVM on the class-2 vs class-3 problem: tune C on a
# validation split, then report the error of the selected model on a held-out
# test split.

# Kernel under study. Defined once so the tuning loop and the final model
# cannot drift apart.
kernel = 'poly'

# Stack the class-2 and class-3 rows; column 0 is the label, columns 1-5 the features.
data_c2_c3 = np.vstack((data_c2, data_c3))
data_5 = data_c2_c3[:, 1:6]
labels = data_c2_c3[:, 0]
# print(data_5.shape)

# Split fractions (test_size is informational: the test set is the remainder).
train_size = 0.5
val_size = 0.25
test_size = 0.25

np.random.seed(16)  # For reproducibility
indices = np.arange(data_5.shape[0])
np.random.shuffle(indices)

train_end = int(train_size * len(indices))
val_end = train_end + int(val_size * len(indices))

train_indices = indices[:train_end]
val_indices = indices[train_end:val_end]
test_indices = indices[val_end:]

X_train, y_train = data_5[train_indices], labels[train_indices]
X_val, y_val = data_5[val_indices], labels[val_indices]
X_test, y_test = data_5[test_indices], labels[test_indices]

# Define a range of C values to test
C_values = [0.01, 0.1, 1, 10, 100]
best_C = None
best_val_accuracy = 0

# Hyperparameter tuning for C using the validation set
for C in C_values:
    model = svm.SVC(kernel=kernel, C=C)
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    print(f"C: {C}, Validation Accuracy: {val_accuracy:.2f}")

    # Update the best C if the current model performs better
    # (strict '>' keeps the earliest C when several candidates tie)
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_C = C

print(f"Best C: {best_C} with Validation Accuracy: {best_val_accuracy:.2f}")

# Train final model on the training set with best C and evaluate on test set.
# The final model must use the same kernel that C was tuned for; refitting with
# kernel='linear' here would report the error of a different model than the one
# selected above.
final_model = svm.SVC(kernel=kernel, C=best_C)
final_model.fit(X_train, y_train)
y_test_pred = final_model.predict(X_test)

# Calculate and print the classification error on the test set
test_accuracy = accuracy_score(y_test, y_test_pred)
test_error = 1 - test_accuracy
print(f"Test Accuracy: {test_accuracy:.2f}")
print(f"Classification Error on Test Set: {test_error:.2f}")

C: 0.01, Validation Accuracy: 0.52
C: 0.1, Validation Accuracy: 0.52
C: 1, Validation Accuracy: 0.52
C: 10, Validation Accuracy: 0.76
C: 100, Validation Accuracy: 0.86
Best C: 100 with Validation Accuracy: 0.86
Test Accuracy: 0.90
Classification Error on Test Set: 0.10
