In [1]:
import os
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
import gc
from concurrent.futures import ProcessPoolExecutor

# Function to process a single audio file (unchanged)
def process_audio_file(file_info, max_length=100):
    file_path, label = file_info
    try:
        audio, _ = librosa.load(file_path, sr=4000)
        stft = librosa.stft(audio, n_fft=256, hop_length=128)
        spectrogram = np.abs(stft)

        if spectrogram.shape[1] < max_length:
            spectrogram = np.pad(spectrogram, ((0, 0), (0, max_length - spectrogram.shape[1])), mode='constant')
        else:
            spectrogram = spectrogram[:, :max_length]

        return spectrogram.astype(np.float32), label
    except Exception as e:
        print(f"Error with file: {file_path} -> {str(e)}")
        return None, None

# Generator for audio files and labels (unchanged)
def audio_file_generator(fake_root_dirs, real_root_dir):
    for fake_root_dir in fake_root_dirs:
        for file in os.listdir(fake_root_dir):
            file_path = os.path.join(fake_root_dir, file)
            yield file_path, 1
    for file in os.listdir(real_root_dir):
        file_path = os.path.join(real_root_dir, file)
        yield file_path, 0

# Feature extraction for spectrogram (unchanged)
def extract_spectrogram_features(fake_root_dirs, real_root_dir, max_length=250, max_workers=4):
    features = []
    labels = []

    total_files = sum(len(os.listdir(d)) for d in fake_root_dirs) + len(os.listdir(real_root_dir))

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(process_audio_file, 
                                         audio_file_generator(fake_root_dirs, real_root_dir)), 
                            total=total_files, desc='Processing audio files'))

    for spectrogram, label in results:
        if spectrogram is not None:
            features.append(spectrogram)
            labels.append(label)

    del results
    gc.collect()

    return np.array(features), np.array(labels)

# Specify fake and real directories (unchanged)
fake_root_dirs = [
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan_large',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_waveglow'
]
real_root_dir = '/kaggle/input/the-lj-speech-dataset/LJSpeech-1.1/wavs'

# Extract features for spectrogram (unchanged)
x_spectrogram, y_spectrogram = extract_spectrogram_features(fake_root_dirs, real_root_dir)

# Flatten the 2D spectrogram arrays for XGBoost
x_spectrogram = x_spectrogram.reshape(x_spectrogram.shape[0], -1)

# Split into training and testing
xtrain_s, xtest_s, ytrain_s, ytest_s = train_test_split(x_spectrogram, y_spectrogram, test_size=0.3, random_state=42)

# Train and evaluate Spectrogram model with regularization and early stopping
model_spectrogram = XGBClassifier(
    n_estimators=500,          # Increase number of trees
    max_depth=6,               # Max depth of trees
    learning_rate=0.01,        # Reduce learning rate to make the model learn slower
    subsample=0.8,             # Randomly sample 80% of the data to avoid overfitting
    colsample_bytree=0.8,      # Randomly sample 80% of features to avoid overfitting
    reg_alpha=0.01,            # L1 regularization (add sparsity)
    reg_lambda=0.1,            # L2 regularization
    early_stopping_rounds=50,  # Stop if no improvement after 50 rounds
    random_state=42
)

# Train the model and use early stopping based on validation data
model_spectrogram.fit(
    xtrain_s, ytrain_s,
    eval_set=[(xtest_s, ytest_s)],
    eval_metric="logloss",
    verbose=True
)

# Evaluate the model
ypred_s = model_spectrogram.predict(xtest_s)
accuracy_s = accuracy_score(ytest_s, ypred_s)
print(f"Spectrogram Model Accuracy (with regularization): {accuracy_s:.4f}")
print(classification_report(ytest_s, ypred_s))


Processing audio files: 100%|██████████| 52400/52400 [04:35<00:00, 189.91it/s]


[0]	validation_0-logloss:0.55742
[1]	validation_0-logloss:0.55338
[2]	validation_0-logloss:0.54929
[3]	validation_0-logloss:0.54526
[4]	validation_0-logloss:0.54126
[5]	validation_0-logloss:0.53743
[6]	validation_0-logloss:0.53366
[7]	validation_0-logloss:0.52993
[8]	validation_0-logloss:0.52617
[9]	validation_0-logloss:0.52253
[10]	validation_0-logloss:0.51891
[11]	validation_0-logloss:0.51540
[12]	validation_0-logloss:0.51190
[13]	validation_0-logloss:0.50855
[14]	validation_0-logloss:0.50519
[15]	validation_0-logloss:0.50185
[16]	validation_0-logloss:0.49849
[17]	validation_0-logloss:0.49526
[18]	validation_0-logloss:0.49209
[19]	validation_0-logloss:0.48900
[20]	validation_0-logloss:0.48593
[21]	validation_0-logloss:0.48293
[22]	validation_0-logloss:0.47979
[23]	validation_0-logloss:0.47686
[24]	validation_0-logloss:0.47389
[25]	validation_0-logloss:0.47098
[26]	validation_0-logloss:0.46811
[27]	validation_0-logloss:0.46537
[28]	validation_0-logloss:0.46260
[29]	validation_0-loglos

In [2]:
import joblib  # Ensure joblib is imported

joblib.dump(model_spectrogram, '/kaggle/working/XGBOSST.pkl')


['/kaggle/working/XGBOSST.pkl']