# Cross-Dataset Validation: FoR → In-the-Wild

This notebook evaluates the generalization capability of audio deepfake detection models trained on the FoR (Fake or Real) dataset when tested on the In-the-Wild dataset.

**Objective**: Assess how well models trained on controlled/synthetic data generalize to real-world deepfake audio samples.

## Dataset Overview
- **Training Dataset**: FoR (Fake or Real) - controlled dataset with known TTS systems
- **Test Dataset**: In-the-Wild - real-world deepfake audio collected from various sources

In [1]:
import os
import sys
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, 
    roc_curve, 
    auc,
)
import warnings
warnings.filterwarnings('ignore')

from utils.utils import train_and_evaluate_linear_svm

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

## 1. Configuration and Paths

In [None]:
#fake or real dataset features uesd for training
FOR_FEATURES_DIR = PROJECT_ROOT / "FoR_dataset" / "features"

#original itw dataset
ITW_META_PATH = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "modified_meta.csv"
ITW_DATASET_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "release_in_the_wild"
ITW_FEATURES_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "features"
ITW_FEATURES = ITW_FEATURES_DIR / "itw_features_40_2048_512_128.parquet"

#loud normalized itw dataset
ITW_NORMALIZED_DATASET_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "release_in_the_wild_normalized"
ITW_NORMALIZED_FEATURES_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "normalized_features"
ITW_NORMALIZED_FEATURES = ITW_NORMALIZED_FEATURES_DIR / "itw_features_40_2048_512_128_loudness_normalized.parquet"

#silence trimmed itw dataset
ITW_TRIMMED_DATASET_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "release_in_the_wild_trimmed"
ITW_TRIMMED_FEATURES_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "normalized_features"
ITW_TRIMMED_FEATURES = ITW_TRIMMED_FEATURES_DIR / "itw_features_40_2048_512_128_trimmed.parquet"


#silence trimmed and loud normalized itw dataset
ITW_TRIMMED_NORMALIZED_DATASET_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "release_in_the_wild_trimmed_normalized"
ITW_TRIMMED_NORMALIZED_FEATURES_DIR = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "normalized_features"
ITW_TRIMMED_NORMALIZED_FEATURES = ITW_TRIMMED_NORMALIZED_FEATURES_DIR / "itw_features_40_2048_512_128_trimmed_loudness_normalized.parquet"


#silence trimmed and loud normalized itw dataset with no mean and different config
ITW_TRIMMED_NORMALIZED_DATASET_DIR_NO_MEAN = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "release_in_the_wild_trimmed_normalized"
ITW_TRIMMED_NORMALIZED_FEATURES_DIR_NO_MEAN = PROJECT_ROOT / "in-the-wild-audio-deepfake" / "normalized_features"
ITW_TRIMMED_NORMALIZED_FEATURES_NO_MEAN = ITW_TRIMMED_NORMALIZED_FEATURES_DIR / "itw_features_20_128_256_128_trimmed_loudness_normalized.parquet"

print(f"FoR Features Directory: {FOR_FEATURES_DIR}")
print(f"In-the-Wild Dataset Directory: {ITW_DATASET_DIR}")
print(f"In-the-Wild Metadata: {ITW_META_PATH}")

FoR Features Directory: c:\Users\geon9\MSc\audio-deepfake-detection\FoR_dataset\features
In-the-Wild Dataset Directory: c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\release_in_the_wild
In-the-Wild Metadata: c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\modified_meta.csv


## 2. Load FoR Training Data

In [None]:
train_df = pd.read_parquet(FOR_FEATURES_DIR / "training_features_mean_20_128_256_128.parquet")
train_df.dropna(inplace=True)

print(f"FoR Training samples: {len(train_df)}")
print(f"Features: {train_df.shape[1] - 2}")
print(f"\nLabel distribution:")
print(train_df['label'].value_counts())

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\geon9\\MSc\\audio-deepfake-detection\\FoR_dataset\\features\\training_features_20_128_256_128.parquet'

In [None]:
X_train = train_df.drop(columns=["label", "filename"])
y_train = train_df["label"].map({"real": 0, "fake": 1})

print(f"X_train shape: {X_train.shape}")
print(f"y_train distribution: {y_train.value_counts().to_dict()}")

## 3. Extract Features from In-the-Wild Dataset

We need to extract the same features from the In-the-Wild dataset that were used for the FoR dataset.

In [None]:
#set desired itw features path
ITW_FEATURES_PATH = ITW_NORMALIZED_FEATURES

In [28]:
if ITW_NORMALIZED_FEATURES_DIR.exists() and ITW_NORMALIZED_FEATURES.exists():
    print(f"Loading cached features from {ITW_NORMALIZED_FEATURES}")
    itw_df_normalized = pd.read_parquet(ITW_NORMALIZED_FEATURES)
    print(f"Loaded {len(itw_df_normalized)} samples")
else:
    if not ITW_NORMALIZED_FEATURES_DIR.exists():
        print(f"Creating directory {ITW_NORMALIZED_FEATURES_DIR}")
        ITW_NORMALIZED_FEATURES_DIR.mkdir(parents=True, exist_ok=True)
    if not ITW_NORMALIZED_FEATURES.exists():
        print(f"Features not found. Will extract in next cell.")
        itw_df_normalized = None

Loading cached features from c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\normalized_features\itw_features_40_2048_512_128_loudness_normalized.parquet
Loaded 31779 samples


In [24]:
if itw_df is None:
    from data_preprocessing.feature_extraction import extract_features_from_folder
    
    N_MFCC = 20
    N_FFT = 128
    HOP_LENGTH = 256
    N_MELS = 128
    
    feature_config = {
        "rmse": {},
        "zero_crossing_rate": {},
        "spectral_centroid": {},
        "spectral_bandwidth": {},
        "spectral_flatness": {},
        "spectral_rolloff": {},
        "mfcc": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta2": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "pitch_yin": {"fmin": 50, "fmax": 300},
        "mel_spectrogram": {"n_mels": N_MELS},
    }
    
    print("Extracting features from In-the-Wild dataset...")
    ITW_DATASET_PATH = ITW_TRIMMED_NORMALIZED_DATASET_DIR_NO_MEAN
    print(f"Dataset path: {ITW_DATASET_PATH}")
    
    itw_df = extract_features_from_folder(
        folder_path=str(ITW_DATASET_PATH),
        feature_config=feature_config,
        sample_rate=16000,
        num_workers=5
    )
    
    itw_df.to_parquet(ITW_FEATURES_PATH, index=False)
    print(f"Saved features to {ITW_FEATURES_PATH}")

Extracting features from In-the-Wild dataset...
Dataset path: c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\release_in_the_wild_trimmed_normalized
Using 5 workers...


Extracting features: 100%|██████████| 31779/31779 [02:21<00:00, 224.81it/s]


Saved features to c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\normalized_features\itw_features_20_128_256_128_trimmed_loudness_normalized.parquet


In [None]:
print(f"In-the-Wild samples: {len(itw_df)}")
print(f"Features: {itw_df.shape[1] - 2}")
print(f"\nLabel distribution:")
print(itw_df['label'].value_counts())
print(f"\nMissing values: {itw_df.isna().sum().max()}")

In [None]:
itw_df_clean = itw_df.dropna()
print(f"Samples after dropping NaN: {len(itw_df_clean)} (dropped {len(itw_df) - len(itw_df_clean)})")

X_test_itw = itw_df_clean.drop(columns=["label", "filename"])
y_test_itw = itw_df_clean["label"].map({"real": 0, "fake": 1})

print(f"\nX_test_itw shape: {X_test_itw.shape}")
print(f"y_test_itw distribution: {y_test_itw.value_counts().to_dict()}")

## 4. Feature Alignment Check

Ensure both datasets have the same features in the same order.

In [None]:
train_features = set(X_train.columns)
test_features = set(X_test_itw.columns)

missing_in_test = train_features - test_features
extra_in_test = test_features - train_features

print(f"Features in training set: {len(train_features)}")
print(f"Features in test set: {len(test_features)}")
print(f"Missing in test: {missing_in_test}")
print(f"Extra in test: {extra_in_test}")

common_features = list(train_features & test_features)
X_train_aligned = X_train[common_features]
X_test_aligned = X_test_itw[common_features]

print(f"\nAligned feature count: {len(common_features)}")

## 5. Train Models on FoR Dataset

In [None]:
print("Note: The Linear SVM pipeline includes StandardScaler internally.")
print(f"X_train_aligned shape: {X_train_aligned.shape}")
print(f"X_test_aligned shape: {X_test_aligned.shape}")

In [None]:
svc_params = {
    "C": 100,
    "class_weight": "balanced",
    "max_iter": 20000,
    "random_state": 42
}

train_path = str(FOR_FEATURES_DIR / "training_features_40_2048_512_128.parquet")
test_path = str(FOR_FEATURES_DIR / "testing_features_40_2048_512_128.parquet")

print("Training Linear SVM on FoR dataset...")
pipeline, for_metrics, _, feature_names, metadata_extra = train_and_evaluate_linear_svm(
    train_path, test_path, svc_params
)

print(f"\nFoR Test Set Metrics:")
for k, v in for_metrics.items():
    print(f"  {k}: {v:.4f}")

In [None]:
trained_models = {"Linear SVM": pipeline}
print("Linear SVM model ready for cross-dataset evaluation!")

## 6. Evaluate on In-the-Wild Dataset (Cross-Dataset Validation)

In [None]:
train_data_path = os.path.join(FOR_FEATURES_DIR, "training_features_40_2048_512_128.parquet")
test_data_path = os.path.join(ITW_FEATURES)

df = pd.read_parquet(train_data_path)
df_test = pd.read_parquet(test_data_path)
#print(df.head())
print(df_test.head())
print(df_test.shape)
#df.shape()

svc_params = {
            "C": 100,
            "class_weight": "balanced",
            "max_iter": 20000,
            "random_state": 42
        }
pipeline, metrics, svc_params, feature_names, metadata_extra = train_and_evaluate_linear_svm(train_data_path, test_data_path, svc_params)

In [None]:
print(metrics)

## 7. Results Summary

In [None]:
print("Cross-Dataset Validation Results (FoR → In-the-Wild)")
print("="*60)
print(f"Model: Linear SVM")
print(f"Accuracy:  {metrics['accuracy']:.4f}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall:    {metrics['recall']:.4f}")
print(f"F1 Score:  {metrics['f1']:.4f}")
print(f"AUC-ROC:   {metrics['roc_auc']:.4f}")

## 8. Visualizations

In [None]:
itw_test_df = pd.read_parquet(ITW_FEATURES_PATH)
itw_test_df.dropna(inplace=True)
X_itw = itw_test_df.drop(columns=["label", "filename"], errors="ignore")
y_itw = itw_test_df["label"].map({"real": 0, "fake": 1}).values

y_pred_itw = pipeline.predict(X_itw)
y_scores_itw = pipeline.decision_function(X_itw)

fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_itw, y_pred_itw)
sns.heatmap(
    cm, 
    annot=True, 
    fmt='d', 
    cmap='Blues',
    xticklabels=['Real', 'Fake'],
    yticklabels=['Real', 'Fake'],
    ax=ax
)
ax.set_title("Linear SVM - Cross-Dataset Validation")
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')

plt.suptitle('Confusion Matrix (FoR → In-the-Wild)', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
fpr_itw, tpr_itw, _ = roc_curve(y_itw, y_scores_itw)
roc_auc_itw = auc(fpr_itw, tpr_itw)

plt.figure(figsize=(10, 8))
plt.plot(fpr_itw, tpr_itw, label=f"Linear SVM (AUC = {roc_auc_itw:.3f})", linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('ROC Curve - Cross-Dataset Validation (FoR → In-the-Wild)', fontsize=14)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']
values = [metrics['accuracy'], metrics['precision'], metrics['recall'], metrics['f1'], metrics['roc_auc']]
colors = ['steelblue', 'forestgreen', 'coral', 'darkorange', 'mediumpurple']

bars = ax.bar(metric_names, values, color=colors)

ax.set_ylabel('Score', fontsize=12)
ax.set_title('Linear SVM - Cross-Dataset Validation (FoR → In-the-Wild)', fontsize=14)
ax.set_ylim([0, 1.1])

for bar, val in zip(bars, values):
    ax.annotate(f'{val:.3f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom', fontsize=11)

plt.tight_layout()
plt.show()

## 9. Baseline Comparison: Same-Dataset Performance

Compare cross-dataset performance with same-dataset (FoR test set) performance.

In [None]:
for_test_path = FOR_FEATURES_DIR / "testing_features_40_2048_512_128.parquet"

if for_test_path.exists():
    test_for_df = pd.read_parquet(for_test_path)
    test_for_df.dropna(inplace=True)
    
    X_test_for = test_for_df.drop(columns=["label", "filename"])[common_features]
    y_test_for = test_for_df["label"].map({"real": 0, "fake": 1})
    
    print(f"FoR Test samples: {len(test_for_df)}")
    print(f"Label distribution: {y_test_for.value_counts().to_dict()}")
else:
    print("FoR test features not found")
    X_test_for = None

## 10. Error Analysis

In [None]:
error_df = itw_df_clean.copy()
error_df['predicted'] = y_pred_itw
error_df['true_label'] = y_itw
error_df['correct'] = error_df['predicted'] == error_df['true_label']
error_df['score'] = y_scores_itw

print(f"Model: Linear SVM")
print(f"\nCorrect predictions: {error_df['correct'].sum()} ({error_df['correct'].mean()*100:.2f}%)")
print(f"Incorrect predictions: {(~error_df['correct']).sum()} ({(~error_df['correct']).mean()*100:.2f}%)")

In [None]:
false_positives = error_df[(error_df['true_label'] == 0) & (error_df['predicted'] == 1)]
false_negatives = error_df[(error_df['true_label'] == 1) & (error_df['predicted'] == 0)]

print(f"False Positives (Real predicted as Fake): {len(false_positives)}")
print(f"False Negatives (Fake predicted as Real): {len(false_negatives)}")

print(f"\nTop False Positives (highest score):")
print(false_positives.nlargest(5, 'score')[['filename', 'score']].to_string(index=False))

print(f"\nTop False Negatives (lowest score):")
print(false_negatives.nsmallest(5, 'score')[['filename', 'score']].to_string(index=False))

## 11. Re-extract features fot Loudness Normalized In-The-Wild Dataset

In [27]:
if ITW_NORMALIZED_FEATURES_DIR.exists() and ITW_NORMALIZED_FEATURES.exists():
    print(f"Loading cached features from {ITW_NORMALIZED_FEATURES}")
    itw_df_normalized = pd.read_parquet(ITW_NORMALIZED_FEATURES)
    print(f"Loaded {len(itw_df_normalized)} samples")
else:
    if not ITW_NORMALIZED_FEATURES_DIR.exists():
        print(f"Creating directory {ITW_NORMALIZED_FEATURES_DIR}")
        ITW_NORMALIZED_FEATURES_DIR.mkdir(parents=True, exist_ok=True)
    if not ITW_NORMALIZED_FEATURES.exists():
        print(f"Features not found. Will extract in next cell.")
        itw_df_normalized = None

Loading cached features from c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\normalized_features\itw_features_40_2048_512_128_loudness_normalized.parquet
Loaded 31779 samples


In [None]:
if itw_df_normalized is None and not ITW_NORMALIZED_FEATURES.exists():
    from data_preprocessing.feature_extraction import extract_features_from_folder
    
    N_MFCC = 40
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 128
    
    feature_config = {
        "rmse": {},
        "zero_crossing_rate": {},
        "spectral_centroid": {},
        "spectral_bandwidth": {},
        "spectral_flatness": {},
        "spectral_rolloff": {},
        "mfcc": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta2": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "pitch_yin": {"fmin": 50, "fmax": 300},
        "mel_spectrogram": {"n_mels": N_MELS},
    }
    
    print("Extracting features from In-the-Wild dataset...")
    print(f"Dataset path: {ITW_NORMALIZED_DATASET_DIR}")
    
    itw_df_normalized = extract_features_from_folder(
        folder_path=str(ITW_NORMALIZED_DATASET_DIR),
        feature_config=feature_config,
        sample_rate=16000,
        num_workers=5
    )
    
    itw_df_normalized.to_parquet(ITW_NORMALIZED_FEATURES, index=False)
    print(f"Saved features to {ITW_NORMALIZED_FEATURES}")
else:
    print(f"Loading cached features from {ITW_NORMALIZED_FEATURES}")
    itw_df_normalized = pd.read_parquet(ITW_NORMALIZED_FEATURES)
    print(f"Loaded {len(itw_df_normalized)} samples")


In [None]:
itw_df_normalized_clean = itw_df_normalized.dropna()
print(f"Samples after dropping NaN: {len(itw_df_normalized_clean)} (dropped {len(itw_df_normalized) - len(itw_df_normalized_clean)})")

X_test_itw_normalized = itw_df_normalized_clean.drop(columns=["label", "filename"])
y_test_itw_normalized = itw_df_normalized_clean["label"].map({"real": 0, "fake": 1})

print(f"\nX_test_itw_normalized shape: {X_test_itw_normalized.shape}")
print(f"y_test_itw_normalized distribution: {y_test_itw_normalized.value_counts().to_dict()}")

In [None]:
train_features = set(X_train.columns)
test_features = set(X_test_itw_normalized.columns)

missing_in_test = train_features - test_features
extra_in_test = test_features - train_features

print(f"Features in training set: {len(train_features)}")
print(f"Features in test set: {len(test_features)}")
print(f"Missing in test: {missing_in_test}")
print(f"Extra in test: {extra_in_test}")

common_features = list(train_features & test_features)
X_train_aligned = X_train[common_features]
X_test_aligned = X_test_itw_normalized[common_features]

print(f"\nAligned feature count: {len(common_features)}")

In [None]:
train_data_path = os.path.join(FOR_FEATURES_DIR, "training_features_40_2048_512_128.parquet")
test_data_path = os.path.join(ITW_NORMALIZED_FEATURES)

df = pd.read_parquet(train_data_path)
df_test = pd.read_parquet(test_data_path)
#print(df.head())
print(df_test.head())
print(df_test.shape)
#df.shape()

svc_params = {
            "C": 100,
            "class_weight": "balanced",
            "max_iter": 20000,
            "random_state": 42
        }
pipeline, metrics, svc_params, feature_names, metadata_extra = train_and_evaluate_linear_svm(train_data_path, test_data_path, svc_params)

In [None]:
print(metrics)

## 12. Re-extract features for Trimmed Silence In-The-Wild Dataset

In [8]:
if ITW_TRIMMED_FEATURES_DIR.exists() and ITW_TRIMMED_FEATURES.exists():
    print(f"Loading cached features from {ITW_TRIMMED_FEATURES}")
    itw_df_trimmed = pd.read_parquet(ITW_TRIMMED_FEATURES)
    print(f"Loaded {len(itw_df_trimmed)} samples")
else:
    if not ITW_TRIMMED_FEATURES_DIR.exists():
        print(f"Creating directory {ITW_TRIMMED_FEATURES_DIR}")
        ITW_TRIMMED_FEATURES_DIR.mkdir(parents=True, exist_ok=True)
    if not ITW_TRIMMED_FEATURES.exists():
        print(f"Features not found. Will extract in next cell.")
        itw_df_trimmed = None

In [10]:
if itw_df_trimmed is None and not ITW_TRIMMED_FEATURES.exists():
    from data_preprocessing.feature_extraction import extract_features_from_folder
    
    N_MFCC = 40
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 128
    
    feature_config = {
        "rmse": {},
        "zero_crossing_rate": {},
        "spectral_centroid": {},
        "spectral_bandwidth": {},
        "spectral_flatness": {},
        "spectral_rolloff": {},
        "mfcc": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta2": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "pitch_yin": {"fmin": 50, "fmax": 300},
        "mel_spectrogram": {"n_mels": N_MELS},
    }
    
    print("Extracting features from In-the-Wild dataset...")
    print(f"Dataset path: {ITW_TRIMMED_DATASET_DIR}")
    
    itw_df_trimmed = extract_features_from_folder(
        folder_path=str(ITW_TRIMMED_DATASET_DIR),
        feature_config=feature_config,
        sample_rate=16000,
        num_workers=None
    )
    
    itw_df_trimmed.to_parquet(ITW_TRIMMED_FEATURES, index=False)
    print(f"Saved features to {ITW_TRIMMED_FEATURES}")
else:
    print(f"Loading cached features from {ITW_TRIMMED_FEATURES}")
    itw_df_trimmed = pd.read_parquet(ITW_TRIMMED_FEATURES)
    print(f"Loaded {len(itw_df_trimmed)} samples")


Extracting features from In-the-Wild dataset...
Dataset path: c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\release_in_the_wild_trimmed
Using 23 workers...


Extracting features: 100%|██████████| 31779/31779 [01:43<00:00, 307.59it/s]


Saved features to c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\normalized_features\itw_features_40_2048_512_128_trimmed.parquet


In [11]:
itw_df_trimmed_clean = itw_df_trimmed.dropna()
print(f"Samples after dropping NaN: {len(itw_df_trimmed_clean)} (dropped {len(itw_df_trimmed) - len(itw_df_trimmed_clean)})")

X_test_itw_trimmed = itw_df_trimmed_clean.drop(columns=["label", "filename"])
y_test_itw_trimmed = itw_df_trimmed_clean["label"].map({"real": 0, "fake": 1})

print(f"\nX_test_itw_trimmed shape: {X_test_itw_trimmed.shape}")
print(f"y_test_itw_trimmed distribution: {y_test_itw_trimmed.value_counts().to_dict()}")

In [12]:
train_features = set(X_train.columns)
test_features = set(X_test_itw_trimmed.columns)

missing_in_test = train_features - test_features
extra_in_test = test_features - train_features

print(f"Features in training set: {len(train_features)}")
print(f"Features in test set: {len(test_features)}")
print(f"Missing in test: {missing_in_test}")
print(f"Extra in test: {extra_in_test}")

common_features = list(train_features & test_features)
X_train_aligned = X_train[common_features]
X_test_aligned = X_test_itw_trimmed[common_features]

print(f"\nAligned feature count: {len(common_features)}")

In [13]:
train_data_path = os.path.join(FOR_FEATURES_DIR, "training_features_40_2048_512_128.parquet")
test_data_path = os.path.join(ITW_TRIMMED_FEATURES)

df = pd.read_parquet(train_data_path)
df_test = pd.read_parquet(test_data_path)
#print(df.head())
print(df_test.head())
print(df_test.shape)
#df.shape()

svc_params = {
            "C": 100,
            "class_weight": "balanced",
            "max_iter": 20000,
            "random_state": 42
        }
pipeline, metrics, svc_params, feature_names, metadata_extra = train_and_evaluate_linear_svm(train_data_path, test_data_path, svc_params)

  label   filename  mfcc_mean_0  mfcc_std_0  mfcc_mean_1  mfcc_std_1  \
0  real    100.wav  -194.060593   59.522217    92.874870   36.978516   
1  real   1000.wav  -409.435333   92.160080   102.045868   54.739365   
2  real  10001.wav  -275.553925   88.372208    85.472183   47.544376   
3  real  10003.wav  -408.774170   92.514168   153.550095   30.752758   
4  real  10004.wav  -315.357819  108.508636    97.408119   67.490761   

   mfcc_mean_2  mfcc_std_2  mfcc_mean_3  mfcc_std_3  ...  \
0   -16.974052   43.922073    28.749113   31.790537  ...   
1     3.808117   33.426067    13.269380   32.298374  ...   
2   -11.114737   30.097361    -3.776425   27.965250  ...   
3     3.010990   15.028880    11.507483   13.281430  ...   
4    -1.837851   29.000122    28.668243   25.292135  ...   

   mel_spectrogram_mean_123  mel_spectrogram_std_123  \
0                  0.008237                 0.019864   
1                  0.000216                 0.000533   
2                  0.004186           

In [15]:
print(metrics)

{'accuracy': 0.670910187625004, 'precision': 0.6441542542338299, 'recall': 0.2680193564818745, 'f1': 0.5773682440118115, 'roc_auc': 0.5897906113038175}


## 13. Re-extract features for Trimmed Silence Loudness Normalized In-The-Wild Dataset

In [29]:
if ITW_TRIMMED_NORMALIZED_DATASET_DIR.exists() and ITW_TRIMMED_NORMALIZED_FEATURES.exists():
    print(f"Loading cached features from {ITW_TRIMMED_NORMALIZED_FEATURES}")
    itw_df_trimmed_normalized = pd.read_parquet(ITW_TRIMMED_NORMALIZED_FEATURES)
    print(f"Loaded {len(itw_df_trimmed)} samples")
else:
    if not ITW_TRIMMED_NORMALIZED_DATASET_DIR.exists():
        print(f"Creating directory {ITW_TRIMMED_NORMALIZED_DATASET_DIR}")
        ITW_TRIMMED_NORMALIZED_DATASET_DIR.mkdir(parents=True, exist_ok=True)
    if not ITW_TRIMMED_NORMALIZED_FEATURES.exists():
        print(f"Features not found. Will extract in next cell.")
        itw_df_trimmed_normalized = None

Features not found. Will extract in next cell.


In [30]:
if itw_df_trimmed_normalized is None and not ITW_TRIMMED_NORMALIZED_FEATURES.exists():
    from data_preprocessing.feature_extraction import extract_features_from_folder
    
    N_MFCC = 40
    N_FFT = 2048
    HOP_LENGTH = 512
    N_MELS = 128
    
    feature_config = {
        "rmse": {},
        "zero_crossing_rate": {},
        "spectral_centroid": {},
        "spectral_bandwidth": {},
        "spectral_flatness": {},
        "spectral_rolloff": {},
        "mfcc": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "mfcc_delta2": {"n_mfcc": N_MFCC, "n_fft": N_FFT, "hop_length": HOP_LENGTH},
        "pitch_yin": {"fmin": 50, "fmax": 300},
        "mel_spectrogram": {"n_mels": N_MELS},
    }
    
    print("Extracting features from In-the-Wild dataset...")
    print(f"Dataset path: {ITW_TRIMMED_NORMALIZED_DATASET_DIR}")
    
    itw_df_trimmed_normalized = extract_features_from_folder(
        folder_path=str(ITW_TRIMMED_NORMALIZED_DATASET_DIR),
        feature_config=feature_config,
        sample_rate=16000,
        num_workers=None
    )
    
    itw_df_trimmed_normalized.to_parquet(ITW_TRIMMED_NORMALIZED_FEATURES, index=False)
    print(f"Saved features to {ITW_TRIMMED_NORMALIZED_FEATURES}")
else:
    print(f"Loading cached features from {ITW_TRIMMED_NORMALIZED_FEATURES}")
    itw_df_trimmed_normalized = pd.read_parquet(ITW_TRIMMED_NORMALIZED_FEATURES)
    print(f"Loaded {len(itw_df_trimmed_normalized)} samples")


Extracting features from In-the-Wild dataset...
Dataset path: c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\release_in_the_wild_trimmed_normalized
Using 23 workers...


Extracting features: 100%|██████████| 31779/31779 [02:03<00:00, 257.03it/s]


Saved features to c:\Users\geon9\MSc\audio-deepfake-detection\in-the-wild-audio-deepfake\normalized_features\itw_features_40_2048_512_128_trimmed_loudness_normalized.parquet


In [31]:
itw_df_trimmed_normalized_clean = itw_df_trimmed_normalized.dropna()
print(f"Samples after dropping NaN: {len(itw_df_trimmed_normalized_clean)} (dropped {len(itw_df_trimmed_normalized) - len(itw_df_trimmed_normalized_clean)})")

X_test_itw_trimmed_normalized = itw_df_trimmed_normalized_clean.drop(columns=["label", "filename"])
y_test_itw_trimmed_normalized = itw_df_trimmed_normalized_clean["label"].map({"real": 0, "fake": 1})

print(f"\nX_test_itw_trimmed_normalized shape: {X_test_itw_trimmed_normalized.shape}")
print(f"y_test_itw_trimmed_normalized distribution: {y_test_itw_trimmed_normalized.value_counts().to_dict()}")

In [32]:
train_features = set(X_train.columns)
test_features = set(X_test_itw_trimmed_normalized.columns)

missing_in_test = train_features - test_features
extra_in_test = test_features - train_features

print(f"Features in training set: {len(train_features)}")
print(f"Features in test set: {len(test_features)}")
print(f"Missing in test: {missing_in_test}")
print(f"Extra in test: {extra_in_test}")

common_features = list(train_features & test_features)
X_train_aligned = X_train[common_features]
X_test_aligned = X_test_itw_trimmed_normalized[common_features]

print(f"\nAligned feature count: {len(common_features)}")

In [33]:
train_data_path = os.path.join(FOR_FEATURES_DIR, "training_features_40_2048_512_128.parquet")
test_data_path = os.path.join(ITW_TRIMMED_NORMALIZED_FEATURES)

df = pd.read_parquet(train_data_path)
df_test = pd.read_parquet(test_data_path)
#print(df.head())
print(df_test.head())
print(df_test.shape)
#df.shape()

svc_params = {
            "C": 100,
            "class_weight": "balanced",
            "max_iter": 20000,
            "random_state": 42
        }
pipeline, metrics, svc_params, feature_names, metadata_extra = train_and_evaluate_linear_svm(train_data_path, test_data_path, svc_params)

  label   filename  mfcc_mean_0  mfcc_std_0  mfcc_mean_1  mfcc_std_1  \
0  real    100.wav  -156.601151   59.341663    94.034180   36.503689   
1  real   1000.wav  -279.209595   91.934013   102.445534   54.297623   
2  real  10001.wav  -177.487381   85.977005    86.514336   47.363522   
3  real  10003.wav  -271.900391   92.171707   154.030838   30.958628   
4  real  10004.wav  -210.755768  109.072182    97.891685   67.348640   

   mfcc_mean_2  mfcc_std_2  mfcc_mean_3  mfcc_std_3  ...  \
0   -18.125744   43.623184    29.887274   31.707443  ...   
1     3.412116   33.465878    13.645151   32.440147  ...   
2   -11.831045   29.752623    -2.856759   27.914225  ...   
3     2.446726   15.398170    12.026384   13.221119  ...   
4    -2.266813   28.922297    29.088736   25.278456  ...   

   mel_spectrogram_mean_123  mel_spectrogram_std_123  \
0                  0.017062                 0.041203   
1                  0.002931                 0.007223   
2                  0.030415           

In [34]:
print(metrics)

{'accuracy': 0.6885297945966539, 'precision': 0.6889400921658986, 'recall': 0.3046098989727481, 'f1': 0.6046045357066804, 'roc_auc': 0.611229898776435}
