In [1]:
import os
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import gc
from concurrent.futures import ProcessPoolExecutor

# Function to process a single audio file and return log-magnitude spectrogram
def process_audio_file(file_info, max_length=100):  # Reduced max_length
    file_path, label = file_info
    try:
        # Load audio file with downsampling
        audio, _ = librosa.load(file_path, sr=4000)  # Further downsample to 4000 Hz
        # Compute Short-Time Fourier Transform (STFT) with fewer frequency bins
        stft = librosa.stft(audio, n_fft=256, hop_length=128)  # Reduced n_fft
        # Convert to magnitude spectrogram
        magnitude_spectrogram = np.abs(stft)
        # Apply log transformation to get the log-magnitude spectrogram
        log_spectrogram = librosa.amplitude_to_db(magnitude_spectrogram)

        # Ensure the log spectrogram is not too large or small
        if log_spectrogram.shape[1] < max_length:
            log_spectrogram = np.pad(log_spectrogram, ((0, 0), (0, max_length - log_spectrogram.shape[1])), mode='constant')
        else:
            log_spectrogram = log_spectrogram[:, :max_length]

        return log_spectrogram.astype(np.float32), label  # Return log-magnitude features and label
    except Exception as e:
        print(f"Error with file: {file_path} -> {str(e)}")
        return None, None  # Return None if there's an error

# Generator to yield audio files and labels (unchanged)
def audio_file_generator(fake_root_dirs, real_root_dir):
    # Yield audio files from fake directories
    for fake_root_dir in fake_root_dirs:
        for file in os.listdir(fake_root_dir):
            file_path = os.path.join(fake_root_dir, file)
            yield file_path, 1  # Label 1 for fake
    # Yield audio files from real directory
    for file in os.listdir(real_root_dir):
        file_path = os.path.join(real_root_dir, file)
        yield file_path, 0  # Label 0 for real

# Feature extraction for log spectrogram (unchanged)
def extract_spectrogram_features(fake_root_dirs, real_root_dir, max_length=250, max_workers=4):
    features = []
    labels = []

    # Get total number of files to process for tqdm
    total_files = sum(len(os.listdir(d)) for d in fake_root_dirs) + len(os.listdir(real_root_dir))

    # Process audio files using the generator with parallel processing
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(process_audio_file, 
                                         audio_file_generator(fake_root_dirs, real_root_dir)), 
                            total=total_files, desc='Processing audio files'))

    # Collect features and labels from the results
    for log_spectrogram, label in results:
        if log_spectrogram is not None:  # Check if result is valid
            features.append(log_spectrogram)
            labels.append(label)

    # Clean up memory
    del results  # Free memory by deleting the results list
    gc.collect()  # Trigger garbage collection after processing

    return np.array(features), np.array(labels)

# Specify fake and real directories (unchanged)
fake_root_dirs = [
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan_large',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_waveglow'
]
real_root_dir = '/kaggle/input/the-lj-speech-dataset/LJSpeech-1.1/wavs'

# Extract features for log spectrogram
x_spectrogram, y_spectrogram = extract_spectrogram_features(fake_root_dirs, real_root_dir)

# Flatten the 2D log spectrogram arrays for XGBoost
x_spectrogram = x_spectrogram.reshape(x_spectrogram.shape[0], -1)

# Train and evaluate Log Spectrogram model
xtrain_s, xtest_s, ytrain_s, ytest_s = train_test_split(x_spectrogram, y_spectrogram, test_size=0.3, random_state=42)
model_spectrogram = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42)
model_spectrogram.fit(xtrain_s, ytrain_s)
ypred_s = model_spectrogram.predict(xtest_s)
accuracy_s = accuracy_score(ytest_s, ypred_s)
print(f"Log Spectrogram Model Accuracy: {accuracy_s:.4f}")
print(classification_report(ytest_s, ypred_s))


Processing audio files: 100%|██████████| 52400/52400 [04:30<00:00, 193.98it/s]


Log Spectrogram Model Accuracy: 0.9939
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      3904
           1       0.99      1.00      1.00     11816

    accuracy                           0.99     15720
   macro avg       1.00      0.99      0.99     15720
weighted avg       0.99      0.99      0.99     15720



In [3]:
import joblib  # Ensure joblib is imported


joblib.dump(model_spectrogram, '/kaggle/working/xgboostlogspec.pkl')


['/kaggle/working/xgboostlogspec.pkl']