In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score

In [2]:
# Per-source feature files and their matching label files, listed in the same order
# so that row i of the stacked features lines up with entry i of the stacked labels
feature_files = ['X_comcuc.npy', 'X_cowpig1.npy',
                 'X_eucdov.npy', 'X_eueowl1.npy',
                 'X_grswoo.npy', 'X_tawowl1.npy']
label_files = ['comcuc_combined_labels.npy', 'cowpig1_combined_labels.npy',
               'eucdov_combined_labels.npy', 'eueowl1_combined_labels.npy',
               'grswoo_combined_labels.npy', 'tawowl1_combined_labels.npy']

# Stack every source along the first axis
X = np.concatenate([np.load(path) for path in feature_files])
y = np.concatenate([np.load(path) for path in label_files])

# Append the labels as one extra trailing column: each row = features + label
dataset = np.hstack((X, y.reshape(-1, 1)))

In [3]:
# Count the distinct label values present in y
num_labels = np.unique(y).size

# Accumulator for the per-label index arrays gathered by the sampling loop
stratified_samples = list()

In [4]:
# Build a 50% subset of `dataset` by sampling half of the rows of every label,
# so each label keeps (up to rounding) its share of the data.
#
# The labels are read back from the last column of `dataset` (where they were
# stacked) instead of from `y`: `y` is reassigned to the subset's labels further
# down the notebook, and using it here would mis-index `dataset` on a re-run.
all_labels = dataset[:, -1]

# Start from an empty list on every execution so re-running this cell cannot
# append the same indices twice
stratified_samples = []

# Iterate over the label values that actually occur rather than assuming they
# are exactly 0..num_labels-1
for label in np.unique(all_labels):
    # Row positions of the instances carrying the current label
    indices = np.where(all_labels == label)[0]

    # Keep one half of this label's rows. The split is seeded so the subset is
    # reproducible; no `stratify` argument is needed because `indices` only
    # ever contains a single label. `val_indices` (the other half) is not used.
    train_indices, val_indices = train_test_split(indices, test_size=0.5, random_state=42)

    # Collect the kept rows for this label
    stratified_samples.append(train_indices)

# Flatten the per-label index arrays into one index array
stratified_indices = np.concatenate(stratified_samples)

# Select the sampled rows (features + label column) from the full dataset
subset_dataset = dataset[stratified_indices, :]

In [5]:
# Sanity check: (number of sampled rows, number of feature columns + 1 label column)
print(np.shape(subset_dataset))

(59999, 255)


In [6]:
# Separate the sampled rows into a feature matrix and a label vector:
# every column except the last is a feature, the last column is the label
X, y = subset_dataset[:, :-1], subset_dataset[:, -1]

In [10]:
# Random forest with default hyperparameters. The seed fixes the forest's
# internal randomness so repeated runs of the notebook give the same scores
# (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [8]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out scores only.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds), so the result
# stays correct even if n_folds and the splitter ever disagree
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

1
1
1
1
1
Average accuracy: 0.849430905353224
Average F1 score: 0.6512648422715933


In [11]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here because the previous evaluation cell appends to
# the same `acc_scores` / `f1_scores`; without the reset a top-to-bottom run would
# sum the scores of both cells and report averages that are about twice too large.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8498642845237103
Average F1 score: 0.6507215979718446
Average training accuracy: 0.9999166653645564
Average training F1 score: 0.9998938545119331


In [12]:
# Random forest with default hyperparameters, evaluated with 10 folds this time.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(random_state=42)

# number of cross-validation folds
n_folds = 10

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [13]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
1
1
1
1
1
Average accuracy: 0.8518476246041006
Average F1 score: 0.6562012460890234
Average training accuracy: 0.999914813408753
Average training F1 score: 0.9998930069970788


In [15]:
# Random forest with trees limited to depth 5.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_depth=5, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [16]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.7206620746173293
Average F1 score: 0.168417748023534
Average training accuracy: 0.7228578852337202
Average training F1 score: 0.17938183795553145


In [17]:
# Random forest with trees limited to depth 20.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_depth=20, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [18]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.844597505347668
Average F1 score: 0.6377781061822454
Average training accuracy: 0.9678911319333041
Average training F1 score: 0.9408241651481708


In [20]:
# Random forest with trees limited to depth 50.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_depth=50, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [21]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8485809192432703
Average F1 score: 0.6497511557476886
Average training accuracy: 0.9977791294263074
Average training F1 score: 0.9961438634760311


In [23]:
# Random forest with trees limited to depth 35.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_depth=35, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [24]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8487809234102842
Average F1 score: 0.6488135696172306
Average training accuracy: 0.9907706789898401
Average training F1 score: 0.9835923894012069


In [25]:
# Random forest that considers 5 features per split (max_features=5).
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_features=5, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [26]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.7960466288857405
Average F1 score: 0.49479969780623934
Average training accuracy: 0.9999291654513636
Average training F1 score: 0.9999206118021174


In [28]:
# Random forest that considers 10 features per split (max_features=10).
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(max_features=10, random_state=42)

# number of cross-validation folds
n_folds = 5

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [29]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
Average accuracy: 0.8350807108925743
Average F1 score: 0.6127577677303335
Average training accuracy: 0.999920832031223
Average training F1 score: 0.9999114460683203


In [35]:
# Random forest with several hyperparameters set at once: 200 trees, depth capped
# at 20, at least 5 samples to split a node, at least 2 samples per leaf, and 50
# features considered per split.
# The seed fixes the forest's internal randomness so repeated runs give the
# same scores (the fold splitter below was already seeded; the classifier was not).
clf = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=5,
                             min_samples_leaf=2, max_features=50, random_state=42)

# number of cross-validation folds
n_folds = 10

# per-fold scores on the held-out fold and on the training folds
acc_scores = []
f1_scores = []
train_acc_scores = []
train_f1_scores = []

# shuffled K-fold splitter, seeded so the fold partitions are reproducible
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

In [36]:
# Cross-validate `clf` on (X, y) with the splitter `kf`, recording held-out and
# training scores for every fold.
# The result lists are reset here so the cell is idempotent: re-running it cannot
# leave scores from an earlier run in the averages.
acc_scores, f1_scores = [], []
train_acc_scores, train_f1_scores = [], []

for train_idx, test_idx in kf.split(X):

    # rows used for fitting and rows held out for this fold
    X_train, y_train = X[train_idx], y[train_idx]
    X_test, y_test = X[test_idx], y[test_idx]

    # refit the classifier on this fold's training rows
    clf.fit(X_train, y_train)

    # held-out performance
    y_pred = clf.predict(X_test)
    acc_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

    # training performance, for comparison with the held-out scores
    # (a large gap between the two indicates overfitting)
    y_train_pred = clf.predict(X_train)
    train_acc_scores.append(accuracy_score(y_train, y_train_pred))
    train_f1_scores.append(f1_score(y_train, y_train_pred, average='macro'))
    print(1)  # progress marker: one line per completed fold

# Average over the scores actually recorded (not over n_folds)
print("Average accuracy:", sum(acc_scores) / len(acc_scores))
print("Average F1 score:", sum(f1_scores) / len(f1_scores))

# Same averages for the training folds
print("Average training accuracy:", sum(train_acc_scores) / len(train_acc_scores))
print("Average training F1 score:", sum(train_f1_scores) / len(train_f1_scores))

1
1
1
1
1
1
1
1
1
1
Average accuracy: 0.8696145579818859
Average F1 score: 0.7003238103122598
Average training accuracy: 0.967030932534416
Average training F1 score: 0.9391370917123293
