In [1]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
# Species codes whose per-species feature / label files make up the dataset.
# The order is kept identical for X and y so that rows stay aligned.
_species_codes = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']

# Feature arrays: one 'X_<code>.npy' file per species, stacked row-wise
X = np.concatenate([np.load(f'X_{code}.npy') for code in _species_codes])

# Label arrays: one '<code>_combined_labels.npy' file per species, same order
y = np.concatenate([np.load(f'{code}_combined_labels.npy') for code in _species_codes])

# Append the labels as one extra trailing column so that a single row index
# selects both the features and the label of an instance
label_column = y.reshape(-1, 1)
dataset = np.hstack((X, label_column))

In [3]:
# Holds one array of selected row indices per class; filled by the sampling loop
stratified_samples = list()

# How many distinct label values occur in y
num_labels = np.unique(y).size

In [4]:
# Build a 50% subsample of the dataset that keeps the class proportions by
# drawing half of the rows of every class separately.

# Read the labels from dataset's last column (dataset was built by stacking y
# onto X) instead of from the global `y`: a later cell rebinds `y` to the
# subset labels, which would make a re-run of this cell index the full
# dataset with the wrong label vector.
labels = dataset[:, -1]

# Start from an empty list in this cell so that re-running it never appends a
# second copy of the indices to a list left over from a previous run.
stratified_samples = []

# Iterate over the label values actually present; range(num_labels) is only
# correct when the labels happen to be exactly 0, 1, ..., K-1.
for label in np.unique(labels):
    # Row positions of all instances carrying the current label
    indices = np.where(labels == label)[0]

    # Keep a random half of this class. Every row here has the same label, so
    # no `stratify` argument is needed; the per-class loop is what preserves
    # the class balance. A fixed random_state makes the subsample repeatable.
    train_indices, val_indices = train_test_split(
        indices, test_size=0.5, random_state=42
    )

    # Only the kept half goes into the subsample; val_indices is discarded
    stratified_samples.append(train_indices)

# Merge the per-class selections into one index array
stratified_indices = np.concatenate(stratified_samples)

# Select the chosen rows (features + trailing label column) from the dataset
subset_dataset = dataset[stratified_indices, :]

In [5]:
# Sanity check: (rows, columns) of the subsample, label column included
print(np.shape(subset_dataset))

(59999, 255)


In [6]:
# Split dataset into training and testing sets
X = subset_dataset[:, :-1]  # Features
y = subset_dataset[:, -1]   # Labels

In [13]:
# 5-fold cross-validation; shuffling uses a fixed seed so the folds are repeatable
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# support-vector classifier with every hyperparameter left at its default
clf = SVC()

# per-fold metrics: held-out scores first, then training scores
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

In [14]:
# Cross-validate `clf` over the folds produced by `kf`, recording accuracy and
# macro-averaged F1 on both the held-out part and the training part of each fold.

# Reset the score lists in this cell so that re-running it on its own starts
# from empty lists instead of appending to a previous run's results.
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for evaluation in this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, to compare against the held-out scores
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))

    # progress marker: one line per completed fold
    print(1)

# Average over the folds that were actually scored. Dividing by len(...) rather
# than n_folds keeps the mean correct even if n_folds and kf get out of sync.
print("Average accuracy:", sum(acc_scores)/len(acc_scores))
print("Average F1 score:", sum(f1_scores)/len(f1_scores))

# Same averages for the training part of each fold
print("Average training accuracy:", sum(train_acc_scores)/len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores)/len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.9102485443231381
Average F1 score: 0.8169193068243954
Average training accuracy: 0.9520283689937985
Average training F1 score: 0.9069535775539965


In [17]:
# cross-validation scheme: 5 shuffled folds, seeded for repeatable fold membership
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# support-vector classifier with the regularization parameter C set to 0.1
clf = SVC(C=0.1)

# empty accumulators for the per-fold held-out and training metrics
acc_scores, f1_scores, train_acc_scores, train_f1_scores = [], [], [], []

In [18]:
# Cross-validate `clf` over the folds produced by `kf`, recording accuracy and
# macro-averaged F1 on both the held-out part and the training part of each fold.

# Reset the score lists in this cell so that re-running it on its own starts
# from empty lists instead of appending to a previous run's results.
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for evaluation in this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, to compare against the held-out scores
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))

    # progress marker: one line per completed fold
    print(1)

# Average over the folds that were actually scored. Dividing by len(...) rather
# than n_folds keeps the mean correct even if n_folds and kf get out of sync.
print("Average accuracy:", sum(acc_scores)/len(acc_scores))
print("Average F1 score:", sum(f1_scores)/len(f1_scores))

# Same averages for the training part of each fold
print("Average training accuracy:", sum(train_acc_scores)/len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores)/len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8594143261938495
Average F1 score: 0.6569836250455194
Average training accuracy: 0.8685436461349889
Average training F1 score: 0.6828261941251613


In [7]:
# fold count and the seeded, shuffled K-fold splitter built from it
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# support-vector classifier with the regularization parameter C set to 5
clf = SVC(C=5)

# result lists, one entry appended per fold
acc_scores, f1_scores = list(), list()              # held-out part of each fold
train_acc_scores, train_f1_scores = list(), list()  # training part of each fold

In [8]:
# Cross-validate `clf` over the folds produced by `kf`, recording accuracy and
# macro-averaged F1 on both the held-out part and the training part of each fold.

# Reset the score lists in this cell so that re-running it on its own starts
# from empty lists instead of appending to a previous run's results.
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for evaluation in this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, to compare against the held-out scores
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))

    # progress marker: one line per completed fold
    print(1)

# Average over the folds that were actually scored. Dividing by len(...) rather
# than n_folds keeps the mean correct even if n_folds and kf get out of sync.
print("Average accuracy:", sum(acc_scores)/len(acc_scores))
print("Average F1 score:", sum(f1_scores)/len(f1_scores))

# Same averages for the training part of each fold
print("Average training accuracy:", sum(train_acc_scores)/len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores)/len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.919198667944551
Average F1 score: 0.8436770480927127
Average training accuracy: 0.987770629336028
Average training F1 score: 0.9777224255887262


In [10]:
# how many folds to use, and the shuffled splitter (fixed seed) that creates them
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

# support-vector classifier using a polynomial kernel
clf = SVC(kernel='poly')

# metric accumulators filled by the cross-validation loop
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

In [11]:
# Cross-validate `clf` over the folds produced by `kf`, recording accuracy and
# macro-averaged F1 on both the held-out part and the training part of each fold.

# Reset the score lists in this cell so that re-running it on its own starts
# from empty lists instead of appending to a previous run's results.
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for evaluation in this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # fit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, to compare against the held-out scores
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))

    # progress marker: one line per completed fold
    print(1)

# Average over the folds that were actually scored. Dividing by len(...) rather
# than n_folds keeps the mean correct even if n_folds and kf get out of sync.
print("Average accuracy:", sum(acc_scores)/len(acc_scores))
print("Average F1 score:", sum(f1_scores)/len(f1_scores))

# Same averages for the training part of each fold
print("Average training accuracy:", sum(train_acc_scores)/len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores)/len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8853481248437369
Average F1 score: 0.7555651529804173
Average training accuracy: 0.9306530117814955
Average training F1 score: 0.8590170320232481
