In [1]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
species_names = ['comcuc', 'cowpig1', 'eucdov', 'eueowl1', 'grswoo', 'tawowl1']
training_data = []
training_labels = []

for species_name in species_names:
    labels = np.load(f'{species_name}_combined_labels.npy')
    training_labels.append(labels)

In [3]:
training_data = np.load('100reduced_combined_features.npy')
training_labels = np.concatenate(training_labels)

In [6]:
# Define the number of samples to keep from each class
num_samples_per_class = 10000

reduced_training_data = []
reduced_training_labels = []

# Iterate over each class
for i in range(6):
    class_data = training_data[i * 20000: (i + 1) * 20000]
    class_labels = training_labels[i * 20000: (i + 1) * 20000]

    # Separate the rows where label value is 0 and where it isn't
    zero_indices = np.where(class_labels == 0)[0]
    nonzero_indices = np.where(class_labels != 0)[0]

    num_zero_indices = min(num_samples_per_class // 2, len(zero_indices))
    num_nonzero_indices = min(num_samples_per_class // 2, len(nonzero_indices))

    # Randomly select equal number of samples from each group
    random_zero_indices = np.random.choice(zero_indices, num_zero_indices, replace=False)
    random_nonzero_indices = np.random.choice(nonzero_indices, num_nonzero_indices, replace=False)

    # Combine the selected samples
    selected_indices = np.concatenate([random_zero_indices, random_nonzero_indices])
    reduced_class_data = class_data[selected_indices]
    reduced_class_labels = class_labels[selected_indices]

    reduced_training_data.append(reduced_class_data)
    reduced_training_labels.append(reduced_class_labels)

In [7]:
reduced_training_data = np.concatenate(reduced_training_data)
reduced_training_labels = np.concatenate(reduced_training_labels)

In [10]:
np.random.seed(42)  # Set a seed for reproducibility
shuffle_indices = np.random.permutation(len(reduced_training_data))
reduced_training_data = reduced_training_data[shuffle_indices]
reduced_training_labels = reduced_training_labels[shuffle_indices]

In [11]:
gbc = GradientBoostingClassifier()

In [12]:
gbc.fit(reduced_training_data, reduced_training_labels)

GradientBoostingClassifier()

In [18]:
# Load and preprocess the test data (if applicable)
test_data = np.load('100reduced_test_features.npy')

In [20]:
test_predictions = gbc.predict(test_data)

In [21]:
import csv

In [22]:
# Number of values per row and total number of rows
values_per_row = 3000
num_rows = len(test_predictions) // values_per_row

# Create a list of rows with file names and values
rows = []
for i in range(num_rows):
    start_index = i * values_per_row
    end_index = (i + 1) * values_per_row
    file_name = f"test{i:02d}"
    row_values = [str(value) for value in test_predictions[start_index:end_index]]
    row = [file_name] + row_values
    rows.append(row)

# Write the rows to a CSV file
csv_file = "predictions_gbt_bigger.csv"
with open(csv_file, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

print("Predictions saved to predictions.csv")

Predictions saved to predictions.csv


In [23]:
def count_six(lst):
    count = 0
    for num in lst:
        if num == 6:
            count += 1
    return count
print(count_six(test_predictions))

2178
