In [12]:
import os
import librosa
import numpy as np
import time
import joblib
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Paths to the data directories.
# NOTE: these are absolute, machine-specific paths; adjust them before running
# on another machine. The extraction code in this cell looks for 'bee',
# 'nobee' and 'noqueen' sub-folders inside each split directory.
train_path = 'E:/Queenless/archive/nuhive_processed/train'
val_path = 'E:/Queenless/archive/nuhive_processed/val'
test_path = 'E:/Queenless/archive/nuhive_processed/test'

# Directory used to cache extracted features as .pkl files; created on demand.
output_dir = 'E:/Queenless/kaggle_merged_features'
os.makedirs(output_dir, exist_ok=True)

def compute_mfcc(file_path, n_mfcc=13):
    """Load one audio file at 16 kHz and return its MFCC matrix.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        n_mfcc: Number of cepstral coefficients to request.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with a type-2 DCT,
        orthonormal scaling and liftering disabled.
    """
    signal, rate = librosa.load(file_path, sr=16000)
    return librosa.feature.mfcc(
        y=signal,
        sr=rate,
        n_mfcc=n_mfcc,
        dct_type=2,
        norm='ortho',
        lifter=0,
    )

def extract_mfcc_features(directory, sample_rate=22050, output_dir=None, dataset_type='train', reduce_dimension=True):
    """Extract (or load cached) MFCC features for every file of one dataset split.

    Files are read from the ``bee``, ``nobee`` and ``noqueen`` sub-folders of
    ``directory``; the sub-folder name is used as the label.

    Args:
        directory: Split directory containing the three label sub-folders.
        sample_rate: Currently unused (kept for backward compatibility);
            ``compute_mfcc`` loads audio at its own fixed rate.
        output_dir: Directory holding the ``mfcc2_*_{dataset_type}.pkl`` cache
            files. When falsy (the default ``None``), nothing is read from or
            written to disk.
        dataset_type: Tag used in the cache file names (e.g. ``'train'``).
        reduce_dimension: When True, each file's MFCC matrix is averaged over
            its last axis, giving one fixed-length vector per file. When
            False, the raw matrices are stacked, which requires every file to
            produce a matrix of the same shape.

    Returns:
        Tuple ``(features, labels, data)`` of numpy arrays. ``data`` is always
        an empty array; it is kept only so the return shape (and the cache
        layout) stays unchanged.

    Note:
        The cache is keyed by ``dataset_type`` only. Changing
        ``reduce_dimension`` or the MFCC settings does NOT invalidate existing
        cache files; delete them manually in that case.
    """
    # Build cache paths only when a cache directory was supplied; joining with
    # None would raise TypeError.
    cache_files = None
    if output_dir:
        cache_files = (
            os.path.join(output_dir, f'mfcc2_features_{dataset_type}.pkl'),
            os.path.join(output_dir, f'mfcc2_labels_{dataset_type}.pkl'),
            os.path.join(output_dir, f'mfcc2_data_{dataset_type}.pkl'),
        )

    if cache_files and all(os.path.exists(path) for path in cache_files):
        print(f"Loading {dataset_type} data from .pkl files...")
        features, labels, data = (joblib.load(path) for path in cache_files)
        return features, labels, data

    print(f"Extracting {dataset_type} data...")
    start_time = time.time()
    labels = []
    features = []

    for label in ['bee', 'nobee', 'noqueen']:
        class_dir = os.path.join(directory, label)
        # os.listdir returns entries in arbitrary order; sort so the sample
        # order is reproducible between runs and machines.
        for file_name in sorted(os.listdir(class_dir)):
            mfcc = compute_mfcc(file_path=os.path.join(class_dir, file_name), n_mfcc=40)
            # Averaging per file (instead of after stacking) keeps memory low
            # and lets clips with different numbers of frames coexist.
            features.append(mfcc.mean(axis=1) if reduce_dimension else mfcc)
            labels.append(label)

    features = np.array(features)
    labels = np.array(labels)
    data = np.array([])  # placeholder: no raw input data is collected

    if cache_files:
        os.makedirs(output_dir, exist_ok=True)
        # Save each object separately for train/val/test
        for obj, path in zip((features, labels, data), cache_files):
            joblib.dump(obj, path)

    end_time = time.time()
    print(f"MFCC extraction time: {end_time - start_time:.2f} seconds")

    return features, labels, data

# Extract (or load from cache) the MFCC features of each split.
# reduce_dimension is left at its default (True), so the features are averaged
# before being returned.
train_features_mfcc, train_labels_mfcc, train_data_mfcc = extract_mfcc_features(train_path, output_dir=output_dir, dataset_type='train')
val_features_mfcc, val_labels_mfcc, val_data_mfcc = extract_mfcc_features(val_path, output_dir=output_dir, dataset_type='val')
test_features_mfcc, test_labels_mfcc, test_data_mfcc = extract_mfcc_features(test_path, output_dir=output_dir, dataset_type='test')

# Standardise features. The scaler is fitted on the training split only and
# then applied unchanged to validation and test, so no statistics from those
# splits leak into training.
scaler = StandardScaler()
train_features_mfcc_scaled = scaler.fit_transform(train_features_mfcc)
val_features_mfcc_scaled = scaler.transform(val_features_mfcc)
test_features_mfcc_scaled = scaler.transform(test_features_mfcc)

# Create hierarchical labels for first level classification (noqueen vs. others)
def create_hierarchical_labels(labels):
    """Derive the two-level label sets used by the hierarchical classifier.

    Args:
        labels: Sequence (list, tuple or numpy array) of class names. Entries
            equal to ``'bee'`` or ``'nobee'`` form the ``'bee_nobee'`` group;
            every other value is mapped to ``'noqueen'``.

    Returns:
        Tuple ``(level1_labels, level2_labels, bee_nobee_mask)``:
            * ``level1_labels`` - array with ``'bee_nobee'`` or ``'noqueen'``
              for every input sample.
            * ``level2_labels`` - the original labels of the samples in the
              ``'bee_nobee'`` group only.
            * ``bee_nobee_mask`` - boolean array, True where the sample
              belongs to the ``'bee_nobee'`` group.
    """
    # Accept plain Python sequences too: boolean-mask indexing below only
    # works on numpy arrays.
    labels = np.asarray(labels)

    # First level: 'noqueen' vs. others ('bee' and 'nobee')
    bee_nobee_mask = np.isin(labels, ['bee', 'nobee'])
    level1_labels = np.where(bee_nobee_mask, 'bee_nobee', 'noqueen')

    # Second level: 'bee' vs. 'nobee' (only for samples that are not 'noqueen')
    level2_labels = labels[bee_nobee_mask]

    return level1_labels, level2_labels, bee_nobee_mask

# Create hierarchical labels for every split.
# For each split this yields: the level-1 labels for all samples, the original
# labels of the bee/nobee samples only, and the boolean mask selecting them.
train_level1_labels, train_level2_labels, train_bee_nobee_mask = create_hierarchical_labels(train_labels_mfcc)
val_level1_labels, val_level2_labels, val_bee_nobee_mask = create_hierarchical_labels(val_labels_mfcc)
test_level1_labels, test_level2_labels, test_bee_nobee_mask = create_hierarchical_labels(test_labels_mfcc)

Loading train data from .pkl files...
Loading val data from .pkl files...
Loading test data from .pkl files...


In [13]:
# Level 1 classification: 'noqueen' vs. 'bee_nobee'
# Trained on all (scaled) training samples with the binary level-1 labels.
print("Level 1 Classification: 'noqueen' vs. 'bee_nobee'")
start_time = time.time()

level1_classifier = SVC(C=8.31, kernel='rbf', gamma='auto', random_state=42)
level1_classifier.fit(train_features_mfcc_scaled, train_level1_labels)

end_time = time.time()
print(f"Level 1 Training time: {end_time - start_time:.2f} seconds")

# Evaluate Level 1 model on validation set
val_level1_predictions = level1_classifier.predict(val_features_mfcc_scaled)
val_level1_accuracy = accuracy_score(val_level1_labels, val_level1_predictions)
print(f"Validation Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): {val_level1_accuracy * 100:.2f}%")

# Evaluate Level 1 model on test set
test_level1_predictions = level1_classifier.predict(test_features_mfcc_scaled)
test_level1_accuracy = accuracy_score(test_level1_labels, test_level1_predictions)
print(f"Test Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): {test_level1_accuracy * 100:.2f}%")

# Level 2 classification: 'bee' vs. 'nobee'
# NOTE: this model is trained and evaluated on samples whose TRUE label is
# 'bee' or 'nobee' (selected with the ground-truth masks), not on samples the
# level-1 model predicted as 'bee_nobee'.
print("\nLevel 2 Classification: 'bee' vs. 'nobee'")
start_time = time.time()

# Get features for 'bee_nobee' samples from training set
train_level2_features = train_features_mfcc_scaled[train_bee_nobee_mask]

level2_classifier = SVC(C=8.31, kernel='rbf', gamma='auto', random_state=42)
level2_classifier.fit(train_level2_features, train_level2_labels)

end_time = time.time()
print(f"Level 2 Training time: {end_time - start_time:.2f} seconds")

# Evaluate Level 2 model on validation set (only on true 'bee_nobee' samples)
val_level2_features = val_features_mfcc_scaled[val_bee_nobee_mask]
val_level2_predictions = level2_classifier.predict(val_level2_features)
val_level2_accuracy = accuracy_score(val_level2_labels, val_level2_predictions)
print(f"Validation Accuracy (Level 2 - 'bee' vs. 'nobee'): {val_level2_accuracy * 100:.2f}%")

# Evaluate Level 2 model on test set (only on true 'bee_nobee' samples)
test_level2_features = test_features_mfcc_scaled[test_bee_nobee_mask]
test_level2_predictions = level2_classifier.predict(test_level2_features)
test_level2_accuracy = accuracy_score(test_level2_labels, test_level2_predictions)
print(f"Test Accuracy (Level 2 - 'bee' vs. 'nobee'): {test_level2_accuracy * 100:.2f}%")

# Full hierarchical classification accuracy (Level 1 + Level 2)
# Unlike the level-2 numbers above, this routes samples with the level-1
# PREDICTIONS, so level-1 mistakes carry through to the final 3-class result.
print("\nFull Hierarchical Classification (All 3 classes)")

# Function to perform hierarchical prediction
def hierarchical_predict(features, level1_model, level2_model, group_label='bee_nobee'):
    """Predict final class labels by chaining two classifiers.

    Every sample is first classified by ``level1_model``. Samples predicted as
    ``group_label`` are then passed to ``level2_model``, whose predictions
    replace the group label in the result.

    Args:
        features: Feature matrix (numpy array) with one row per sample.
        level1_model: Fitted estimator exposing ``predict``.
        level2_model: Fitted estimator exposing ``predict``; only called when
            at least one sample is routed to it.
        group_label: Level-1 label that triggers the second stage. Defaults to
            ``'bee_nobee'``.

    Returns:
        Array of final predictions, one per row of ``features``.
    """
    level1_preds = np.asarray(level1_model.predict(features))

    # Find samples predicted as the merged group
    routed_indices = np.flatnonzero(level1_preds == group_label)
    if routed_indices.size == 0:
        return level1_preds.copy()

    # Make level 2 predictions for the routed samples only
    level2_preds = np.asarray(level2_model.predict(features[routed_indices]))

    # numpy string arrays have a fixed width: writing a longer level-2 label
    # into a plain copy of the level-1 predictions would silently truncate it.
    # Promote to a dtype wide enough for both label sets first (astype copies).
    final_preds = level1_preds.astype(np.result_type(level1_preds.dtype, level2_preds.dtype))

    # Replace the group label with the level 2 predictions
    final_preds[routed_indices] = level2_preds

    return final_preds

# Perform hierarchical prediction on validation set and score it against the
# original 3-class labels.
val_hierarchical_predictions = hierarchical_predict(
    val_features_mfcc_scaled, level1_classifier, level2_classifier
)
val_hierarchical_accuracy = accuracy_score(val_labels_mfcc, val_hierarchical_predictions)
print(f"Validation Accuracy (Full Hierarchical - All 3 classes): {val_hierarchical_accuracy * 100:.2f}%")

# Perform hierarchical prediction on test set and score it against the
# original 3-class labels.
test_hierarchical_predictions = hierarchical_predict(
    test_features_mfcc_scaled, level1_classifier, level2_classifier
)
test_hierarchical_accuracy = accuracy_score(test_labels_mfcc, test_hierarchical_predictions)
print(f"Test Accuracy (Full Hierarchical - All 3 classes): {test_hierarchical_accuracy * 100:.2f}%")

Level 1 Classification: 'noqueen' vs. 'bee_nobee'
Level 1 Training time: 2.39 seconds
Validation Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): 92.68%
Test Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): 93.30%

Level 2 Classification: 'bee' vs. 'nobee'
Level 2 Training time: 1.66 seconds
Validation Accuracy (Level 2 - 'bee' vs. 'nobee'): 91.94%
Test Accuracy (Level 2 - 'bee' vs. 'nobee'): 90.71%

Full Hierarchical Classification (All 3 classes)
Validation Accuracy (Full Hierarchical - All 3 classes): 87.45%
Test Accuracy (Full Hierarchical - All 3 classes): 87.28%


In [15]:
import os
import librosa
import numpy as np
import time
import joblib
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Paths to the data directories.
# NOTE: these are absolute, machine-specific paths; adjust them before running
# on another machine. The extraction code in this cell looks for 'bee',
# 'nobee' and 'noqueen' sub-folders inside each split directory.
train_path = 'E:/Queenless/archive/nuhive_processed/train'
val_path = 'E:/Queenless/archive/nuhive_processed/val'
test_path = 'E:/Queenless/archive/nuhive_processed/test'

# Directory used for the feature cache and, at the end of this cell, for the
# saved models and scaler; created on demand.
output_dir = 'E:/Queenless/kaggle_merged_features'
os.makedirs(output_dir, exist_ok=True)

def compute_mfcc(file_path, n_mfcc=40):
    """Load one audio file at 16 kHz and return its MFCC matrix.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        n_mfcc: Number of cepstral coefficients to request.

    Returns:
        The array produced by ``librosa.feature.mfcc`` with a type-2 DCT,
        orthonormal scaling and liftering disabled.
    """
    signal, rate = librosa.load(file_path, sr=16000)
    return librosa.feature.mfcc(
        y=signal,
        sr=rate,
        n_mfcc=n_mfcc,
        dct_type=2,
        norm='ortho',
        lifter=0,
    )

def compute_stft(file_path, n_stft=80):
    """Load one audio file at 16 kHz and return its STFT magnitude.

    Args:
        file_path: Path of the audio file passed to ``librosa.load``.
        n_stft: Maximum number of rows to keep.

    Returns:
        The magnitude of ``librosa.stft`` with default settings. When the
        result has more than ``n_stft`` rows, only the first ``n_stft`` rows
        are kept; this is a plain positional cut, with no ranking of rows by
        energy or importance.
    """
    signal, _ = librosa.load(file_path, sr=16000)
    magnitude = np.abs(librosa.stft(signal))

    # Truncate along the first axis only when there is something to drop.
    return magnitude[:n_stft, :] if magnitude.shape[0] > n_stft else magnitude

def extract_combined_features(directory, output_dir=None, dataset_type='train', reduce_dimension=True):
    """Extract (or load cached) concatenated MFCC + STFT features for one split.

    Files are read from the ``bee``, ``nobee`` and ``noqueen`` sub-folders of
    ``directory``; the sub-folder name is used as the label.

    Args:
        directory: Split directory containing the three label sub-folders.
        output_dir: Directory holding the ``combined_*_{dataset_type}.pkl``
            cache files. When falsy (the default ``None``), nothing is read
            from or written to disk.
        dataset_type: Tag used in the cache file names (e.g. ``'train'``).
        reduce_dimension: When True, each file's MFCC and STFT matrices are
            averaged over their last axis, giving one fixed-length vector per
            file. When False, the raw matrices are stacked, which requires
            every file to produce matrices of the same shape.

    Returns:
        Tuple ``(features, labels)`` of numpy arrays. ``features`` holds the
        MFCC values followed by the STFT values along axis 1.

    Note:
        The cache is keyed by ``dataset_type`` only. Changing
        ``reduce_dimension`` or the feature settings does NOT invalidate
        existing cache files; delete them manually in that case.
    """
    # Build cache paths only when a cache directory was supplied; joining with
    # None would raise TypeError.
    features_file = labels_file = None
    if output_dir:
        features_file = os.path.join(output_dir, f'combined_features_{dataset_type}.pkl')
        labels_file = os.path.join(output_dir, f'combined_labels_{dataset_type}.pkl')

    if features_file and os.path.exists(features_file) and os.path.exists(labels_file):
        print(f"Loading {dataset_type} combined data from .pkl files...")
        return joblib.load(features_file), joblib.load(labels_file)

    print(f"Extracting {dataset_type} combined features...")
    start_time = time.time()
    labels = []
    mfcc_features = []
    stft_features = []

    for label in ['bee', 'nobee', 'noqueen']:
        class_dir = os.path.join(directory, label)
        # os.listdir returns entries in arbitrary order; sort so the sample
        # order is reproducible between runs and machines.
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)

            # NOTE: each helper loads the audio file itself, so every file is
            # decoded twice.
            mfcc = compute_mfcc(file_path=file_path, n_mfcc=80)
            stft = compute_stft(file_path=file_path, n_stft=80)

            if reduce_dimension:
                # Averaging per file (instead of after stacking) keeps memory
                # low and lets clips with different frame counts coexist.
                mfcc = mfcc.mean(axis=1)
                stft = stft.mean(axis=1)

            mfcc_features.append(mfcc)
            stft_features.append(stft)
            labels.append(label)

    labels = np.array(labels)

    if labels.size:
        # Combine MFCC and STFT features
        combined_features = np.concatenate(
            [np.array(mfcc_features), np.array(stft_features)], axis=1
        )
    else:
        # No audio files found: return an empty 2-D matrix instead of failing
        # inside np.concatenate.
        combined_features = np.empty((0, 0), dtype=np.float32)

    if features_file:
        os.makedirs(output_dir, exist_ok=True)
        # Save the combined features and labels
        joblib.dump(combined_features, features_file)
        joblib.dump(labels, labels_file)

    end_time = time.time()
    print(f"Feature extraction time: {end_time - start_time:.2f} seconds")

    return combined_features, labels

# Extract (or load from cache) the combined MFCC + STFT features of each split.
print("Extracting combined MFCC and STFT features...")
train_features, train_labels = extract_combined_features(train_path, output_dir=output_dir, dataset_type='train')
val_features, val_labels = extract_combined_features(val_path, output_dir=output_dir, dataset_type='val')
test_features, test_labels = extract_combined_features(test_path, output_dir=output_dir, dataset_type='test')

# Quick sanity check of the training matrix shape.
print(f"Feature dimensions: {train_features.shape}")

# Normalize features. The scaler is fitted on the training split only and then
# applied unchanged to validation and test.
# NOTE: `scaler` rebinds the name used for the MFCC-only scaler in the earlier
# cell.
scaler = StandardScaler()
train_features_scaled = scaler.fit_transform(train_features)
val_features_scaled = scaler.transform(val_features)
test_features_scaled = scaler.transform(test_features)

# Create hierarchical labels for first level classification (noqueen vs. others)
def create_hierarchical_labels(labels):
    """Derive the two-level label sets used by the hierarchical classifier.

    Args:
        labels: Sequence (list, tuple or numpy array) of class names. Entries
            equal to ``'bee'`` or ``'nobee'`` form the ``'bee_nobee'`` group;
            every other value is mapped to ``'noqueen'``.

    Returns:
        Tuple ``(level1_labels, level2_labels, bee_nobee_mask)``:
            * ``level1_labels`` - array with ``'bee_nobee'`` or ``'noqueen'``
              for every input sample.
            * ``level2_labels`` - the original labels of the samples in the
              ``'bee_nobee'`` group only.
            * ``bee_nobee_mask`` - boolean array, True where the sample
              belongs to the ``'bee_nobee'`` group.
    """
    # Accept plain Python sequences too: boolean-mask indexing below only
    # works on numpy arrays.
    labels = np.asarray(labels)

    # First level: 'noqueen' vs. others ('bee' and 'nobee')
    bee_nobee_mask = np.isin(labels, ['bee', 'nobee'])
    level1_labels = np.where(bee_nobee_mask, 'bee_nobee', 'noqueen')

    # Second level: 'bee' vs. 'nobee' (only for samples that are not 'noqueen')
    level2_labels = labels[bee_nobee_mask]

    return level1_labels, level2_labels, bee_nobee_mask

# Create hierarchical labels for every split.
# For each split this yields: the level-1 labels for all samples, the original
# labels of the bee/nobee samples only, and the boolean mask selecting them.
train_level1_labels, train_level2_labels, train_bee_nobee_mask = create_hierarchical_labels(train_labels)
val_level1_labels, val_level2_labels, val_bee_nobee_mask = create_hierarchical_labels(val_labels)
test_level1_labels, test_level2_labels, test_bee_nobee_mask = create_hierarchical_labels(test_labels)

# Level 1 classification: 'noqueen' vs. 'bee_nobee'
# Trained on all (scaled) training samples with the binary level-1 labels.
print("\nLevel 1 Classification: 'noqueen' vs. 'bee_nobee'")
start_time = time.time()

level1_classifier = SVC(C=10, kernel='rbf', gamma='scale', random_state=42)
level1_classifier.fit(train_features_scaled, train_level1_labels)

end_time = time.time()
print(f"Level 1 Training time: {end_time - start_time:.2f} seconds")

# Evaluate Level 1 model on validation set
val_level1_predictions = level1_classifier.predict(val_features_scaled)
val_level1_accuracy = accuracy_score(val_level1_labels, val_level1_predictions)
print(f"Validation Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): {val_level1_accuracy * 100:.2f}%")

# Evaluate Level 1 model on test set
test_level1_predictions = level1_classifier.predict(test_features_scaled)
test_level1_accuracy = accuracy_score(test_level1_labels, test_level1_predictions)
print(f"Test Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): {test_level1_accuracy * 100:.2f}%")

# Level 2 classification: 'bee' vs. 'nobee'
# NOTE: this model is trained and evaluated on samples whose TRUE label is
# 'bee' or 'nobee' (selected with the ground-truth masks), not on samples the
# level-1 model predicted as 'bee_nobee'.
print("\nLevel 2 Classification: 'bee' vs. 'nobee'")
start_time = time.time()

# Get features for 'bee_nobee' samples from training set
train_level2_features = train_features_scaled[train_bee_nobee_mask]

level2_classifier = SVC(C=10, kernel='rbf', gamma='scale', random_state=42)
level2_classifier.fit(train_level2_features, train_level2_labels)

end_time = time.time()
print(f"Level 2 Training time: {end_time - start_time:.2f} seconds")

# Evaluate Level 2 model on validation set (only on true 'bee_nobee' samples)
val_level2_features = val_features_scaled[val_bee_nobee_mask]
val_level2_predictions = level2_classifier.predict(val_level2_features)
val_level2_accuracy = accuracy_score(val_level2_labels, val_level2_predictions)
print(f"Validation Accuracy (Level 2 - 'bee' vs. 'nobee'): {val_level2_accuracy * 100:.2f}%")

# Evaluate Level 2 model on test set (only on true 'bee_nobee' samples)
test_level2_features = test_features_scaled[test_bee_nobee_mask]
test_level2_predictions = level2_classifier.predict(test_level2_features)
test_level2_accuracy = accuracy_score(test_level2_labels, test_level2_predictions)
print(f"Test Accuracy (Level 2 - 'bee' vs. 'nobee'): {test_level2_accuracy * 100:.2f}%")

# Full hierarchical classification accuracy (Level 1 + Level 2)
# Unlike the level-2 numbers above, this routes samples with the level-1
# PREDICTIONS, so level-1 mistakes carry through to the final 3-class result.
print("\nFull Hierarchical Classification (All 3 classes)")

# Function to perform hierarchical prediction
def hierarchical_predict(features, level1_model, level2_model, group_label='bee_nobee'):
    """Predict final class labels by chaining two classifiers.

    Every sample is first classified by ``level1_model``. Samples predicted as
    ``group_label`` are then passed to ``level2_model``, whose predictions
    replace the group label in the result.

    Args:
        features: Feature matrix (numpy array) with one row per sample.
        level1_model: Fitted estimator exposing ``predict``.
        level2_model: Fitted estimator exposing ``predict``; only called when
            at least one sample is routed to it.
        group_label: Level-1 label that triggers the second stage. Defaults to
            ``'bee_nobee'``.

    Returns:
        Array of final predictions, one per row of ``features``.
    """
    level1_preds = np.asarray(level1_model.predict(features))

    # Find samples predicted as the merged group
    routed_indices = np.flatnonzero(level1_preds == group_label)
    if routed_indices.size == 0:
        return level1_preds.copy()

    # Make level 2 predictions for the routed samples only
    level2_preds = np.asarray(level2_model.predict(features[routed_indices]))

    # numpy string arrays have a fixed width: writing a longer level-2 label
    # into a plain copy of the level-1 predictions would silently truncate it.
    # Promote to a dtype wide enough for both label sets first (astype copies).
    final_preds = level1_preds.astype(np.result_type(level1_preds.dtype, level2_preds.dtype))

    # Replace the group label with the level 2 predictions
    final_preds[routed_indices] = level2_preds

    return final_preds

# Perform hierarchical prediction on validation set and score it against the
# original 3-class labels.
val_hierarchical_predictions = hierarchical_predict(
    val_features_scaled, level1_classifier, level2_classifier
)
val_hierarchical_accuracy = accuracy_score(val_labels, val_hierarchical_predictions)
print(f"Validation Accuracy (Full Hierarchical - All 3 classes): {val_hierarchical_accuracy * 100:.2f}%")

# Perform hierarchical prediction on test set and score it against the
# original 3-class labels.
test_hierarchical_predictions = hierarchical_predict(
    test_features_scaled, level1_classifier, level2_classifier
)
test_hierarchical_accuracy = accuracy_score(test_labels, test_hierarchical_predictions)
print(f"Test Accuracy (Full Hierarchical - All 3 classes): {test_hierarchical_accuracy * 100:.2f}%")

# Save the trained models and scaler.
# All three artefacts are needed together at inference time: new features must
# go through the same scaler before being passed to the two classifiers.
# Existing files with these names in output_dir are overwritten.
joblib.dump(level1_classifier, os.path.join(output_dir, 'level1_classifier.pkl'))
joblib.dump(level2_classifier, os.path.join(output_dir, 'level2_classifier.pkl'))
joblib.dump(scaler, os.path.join(output_dir, 'feature_scaler.pkl'))

print("\nModels saved successfully.")

Extracting combined MFCC and STFT features...
Extracting train combined features...
Feature extraction time: 173.61 seconds
Extracting val combined features...
Feature extraction time: 23.82 seconds
Extracting test combined features...
Feature extraction time: 47.78 seconds
Feature dimensions: (9653, 160)

Level 1 Classification: 'noqueen' vs. 'bee_nobee'
Level 1 Training time: 2.92 seconds
Validation Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): 92.75%
Test Accuracy (Level 1 - 'noqueen' vs. 'bee_nobee'): 92.93%

Level 2 Classification: 'bee' vs. 'nobee'
Level 2 Training time: 1.82 seconds
Validation Accuracy (Level 2 - 'bee' vs. 'nobee'): 92.61%
Test Accuracy (Level 2 - 'bee' vs. 'nobee'): 90.60%

Full Hierarchical Classification (All 3 classes)
Validation Accuracy (Full Hierarchical - All 3 classes): 87.96%
Test Accuracy (Full Hierarchical - All 3 classes): 86.85%

Models saved successfully.
