In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import os
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import gc
from concurrent.futures import ProcessPoolExecutor

# Function to process a single audio file and return log-magnitude spectrogram
def process_audio_file(file_info, max_length=100):
    file_path, label = file_info
    try:
        # Load audio file with downsampling
        audio, _ = librosa.load(file_path, sr=4000)  # Downsample to 4000 Hz
        # Compute Short-Time Fourier Transform (STFT)
        stft = librosa.stft(audio, n_fft=256, hop_length=128)
        # Compute magnitude spectrogram
        spectrogram = np.abs(stft)
        # Apply logarithmic transformation to magnitude (log spectrogram)
        log_spectrogram = librosa.amplitude_to_db(spectrogram)

        # Ensure the log spectrogram is not too large or small
        if log_spectrogram.shape[1] < max_length:
            log_spectrogram = np.pad(log_spectrogram, ((0, 0), (0, max_length - log_spectrogram.shape[1])), mode='constant')
        else:
            log_spectrogram = log_spectrogram[:, :max_length]

        return log_spectrogram.astype(np.float32), label
    except Exception as e:
        print(f"Error with file: {file_path} -> {str(e)}")
        return None, None

# Generator to yield audio files and labels
def audio_file_generator(fake_root_dirs, real_root_dir):
    # Yield audio files from fake directories
    for fake_root_dir in fake_root_dirs:
        for file in os.listdir(fake_root_dir):
            file_path = os.path.join(fake_root_dir, file)
            yield file_path, 1  # Label 1 for fake
    # Yield audio files from real directory
    for file in os.listdir(real_root_dir):
        file_path = os.path.join(real_root_dir, file)
        yield file_path, 0  # Label 0 for real

# Feature extraction for log spectrogram
def extract_spectrogram_features(fake_root_dirs, real_root_dir, max_length=250, max_workers=4):
    features = []
    labels = []
    total_files = sum(len(os.listdir(d)) for d in fake_root_dirs) + len(os.listdir(real_root_dir))

    # Process audio files using parallel processing
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(process_audio_file, 
                                          audio_file_generator(fake_root_dirs, real_root_dir)), 
                            total=total_files, desc='Processing audio files'))

    # Collect features and labels
    for log_spectrogram, label in results:
        if log_spectrogram is not None:
            features.append(log_spectrogram)
            labels.append(label)

    # Clean up memory
    del results
    gc.collect()

    return np.array(features), np.array(labels)

# Specify fake and real directories
fake_root_dirs = [
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_melgan_large',
    '/kaggle/input/wavefake-test/generated_audio/ljspeech_waveglow'
]
real_root_dir = '/kaggle/input/the-lj-speech-dataset/LJSpeech-1.1/wavs'

# Extract features for log spectrogram
x_spectrogram, y_spectrogram = extract_spectrogram_features(fake_root_dirs, real_root_dir)

# Flatten the 2D log spectrogram arrays for Random Forest
x_spectrogram = x_spectrogram.reshape(x_spectrogram.shape[0], -1)

# Train and evaluate the model using Random Forest
xtrain_s, xtest_s, ytrain_s, ytest_s = train_test_split(x_spectrogram, y_spectrogram, test_size=0.3, random_state=42)
model_spectrogram = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
model_spectrogram.fit(xtrain_s, ytrain_s)
ypred_s = model_spectrogram.predict(xtest_s)

# Evaluate the model
accuracy_s = accuracy_score(ytest_s, ypred_s)
print(f"Log Spectrogram Model Accuracy with Random Forest: {accuracy_s:.4f}")
print(classification_report(ytest_s, ypred_s))


Processing audio files: 100%|██████████| 52400/52400 [04:41<00:00, 186.20it/s]


Log Spectrogram Model Accuracy with Random Forest: 0.8513
              precision    recall  f1-score   support

           0       1.00      0.40      0.57      3904
           1       0.83      1.00      0.91     11816

    accuracy                           0.85     15720
   macro avg       0.92      0.70      0.74     15720
weighted avg       0.88      0.85      0.83     15720

