# Deepfake Audio Detection: Local Inference

This notebook demonstrates how to run local inference on a fine-tuned Wav2Vec2 model for deepfake detection.

## Step 1: Environment Setup

Install required dependencies and configure the environment. This includes PyTorch with CUDA support, Transformers, and audio processing libraries.

In [None]:
# Install (and with -U, upgrade) every package listed in requirements.txt into the
# kernel's environment; -q keeps pip's output quiet.
%pip install -r requirements.txt -Uq

In [None]:
# Install/upgrade torch, torchcodec, torchaudio and xformers, letting pip search the
# PyTorch "cu128" wheel index in addition to the default package index.
%pip install --extra-index-url https://download.pytorch.org/whl/cu128 torch torchcodec torchaudio xformers -Uq

## Step 2: Simple Local Inference

This section shows how to load the fine-tuned model from Hugging Face and run inference on your own audio files. It is intended for users who just want to test the model without going through the full training pipeline.

**Requirements:**
- Audio file (mp3, wav, or flac format)
- 16 kHz sample rate recommended (audio at any other rate is automatically resampled to 16 kHz when it is loaded)

**Quick start:**
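1. Put your audio files in the `audio_samples/` folder (multiple-files cell), or replace `"sample_audio.mp3"` with the path to your audio file (blog-post cell)
2. Run the cells below in order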
3. Get prediction: real or fake audio

In [None]:
# Download the model from Hugging Face Hub (Python equivalent of: hf download <repo_id>)
from pathlib import Path

REPO_ID = "garystafford/wav2vec2-deepfake-voice-detector"

# Local target directory, relative to the notebook's working directory (change it
# if you prefer a different location). The "/" in the repo id is replaced so the
# id maps to a single folder name.
local_dir = Path("model_download") / REPO_ID.replace("/", "__")

try:
    from huggingface_hub import snapshot_download
except ImportError as e:
    raise ImportError(
        "Missing dependency 'huggingface_hub'. Install it with: %pip install huggingface_hub"
    ) from e

# NOTE: `local_dir_use_symlinks` and `resume_download` are intentionally NOT passed.
# Both are deprecated in huggingface_hub: current releases already write real files
# into `local_dir` and always resume partial downloads, so the flags only trigger
# warnings (or a TypeError on releases that no longer accept them).
download_path = snapshot_download(
    repo_id=REPO_ID,
    local_dir=str(local_dir),
    # Only fetch files matching these patterns; Markdown files are skipped.
    allow_patterns=["*.json", "*.safetensors", "*.bin", "*.txt", "*.model", "*.py"],
    ignore_patterns=["*.md"],
    revision="main",
    cache_dir=None,
    token=None,  # set to True / token string if the repo is private
)

print(f"Downloaded '{REPO_ID}' to: {download_path}")

### Local Inference: Multiple Files

In [None]:
import glob
from pathlib import Path

import librosa
import torch
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

# ============================================================================
# STEP 1: Load the fine-tuned model from Hugging Face
# ============================================================================
# Two checkpoint ids are kept side by side so either one can be selected below.
model_name_base = "Gustking/wav2vec2-large-xlsr-deepfake-audio-classification"
model_name_ft = "garystafford/wav2vec2-deepfake-voice-detector"
model_name = model_name_ft  # point this at model_name_base to use the base model

print("Loading model from Hugging Face...")
# .eval() returns the module itself, so the model is switched to evaluation
# mode as soon as it has been loaded.
model = AutoModelForAudioClassification.from_pretrained(model_name).eval()
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)

# Prefer the GPU when CUDA is usable; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
print(f"Model loaded on: {device}\n")


# Decision threshold handed to predict_audio() by the file loop in this cell
THRESHOLD = 0.40


# ============================================================================
# STEP 2: Define inference function
# ============================================================================
def predict_audio(audio_path, threshold=0.5):
    """
    Classify a single audio file as "real" or "fake".

    Args:
        audio_path: Path to the audio file (mp3, wav, flac)
        threshold: Minimum "fake" probability needed to label the file as
            fake (default 0.5)

    Returns:
        Dictionary with three keys:
            "prediction": "fake" or "real"
            "confidence": probability of the predicted class
            "probabilities": {"real": ..., "fake": ...} softmax outputs
    """
    # Decode the file as mono audio resampled to 16kHz; the returned sample
    # rate is not needed because it is fixed by the `sr` argument.
    waveform, _ = librosa.load(audio_path, sr=16000, mono=True)

    # Turn the waveform into model inputs (PyTorch tensors)
    encoded = feature_extractor(
        waveform, sampling_rate=16000, return_tensors="pt", padding=True
    )

    # Tensors must live on the same device as the model
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**batch).logits

    # Softmax over the class dimension, then take the first (only) item.
    # NOTE(review): assumes class index 0 = real and 1 = fake - confirm
    # against model.config.id2label.
    class_probs = torch.softmax(logits, dim=-1)[0]
    prob_real = class_probs[0].item()
    prob_fake = class_probs[1].item()

    # Apply the decision threshold; confidence follows the chosen class
    if prob_fake >= threshold:
        prediction, confidence = "fake", prob_fake
    else:
        prediction, confidence = "real", prob_real

    return {
        "prediction": prediction,
        "confidence": confidence,
        "probabilities": {"real": prob_real, "fake": prob_fake},
    }


# ============================================================================
# STEP 3: Test audio files
# ============================================================================

# Collect every supported file under audio_samples/. The list is sorted because
# glob makes no ordering guarantee, and a stable order keeps the report
# reproducible between runs.
audio_files = sorted(
    glob.glob("audio_samples/*.flac")
    + glob.glob("audio_samples/*.wav")
    + glob.glob("audio_samples/*.mp3")
)
print(f"\n\nTesting {len(audio_files)} files:\n")

if not audio_files:
    print("No .flac/.wav/.mp3 files found in 'audio_samples/'.")

for audio_file in audio_files:
    result = predict_audio(audio_file, threshold=THRESHOLD)
    # Escapes keep the source ASCII-safe: U+1F534 is a red circle (fake),
    # U+1F7E2 is a green circle (real), U+2192 is a right arrow.
    pred_symbol = "\U0001F534" if result["prediction"] == "fake" else "\U0001F7E2"
    # Path(...).name strips the directory with either "/" or "\\" separators
    file_name = Path(audio_file).name
    print(
        f"{pred_symbol} {file_name:40s} \u2192 {result['prediction']:5s} ({result['confidence']:.1%})"
    )

print("\nDone.")

### Local Inference Code from Blog Post

In [None]:
import torch
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification
import librosa

# Fetch the feature extractor and the classifier from the Hugging Face Hub
model_id = "garystafford/wav2vec2-deepfake-voice-detector"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_id)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id)

# Run on the GPU when CUDA is usable; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# .to() returns the module itself, so the calls can be chained; eval() then
# switches the model to evaluation mode.
model = model.to(device).eval()


# Inference function
def predict_audio(audio_path, threshold=0.4):
    """
    Classify one audio file as "real" or "fake".

    Args:
        audio_path: Path to the audio file to score.
        threshold: Minimum "fake" probability required to label the file
            as fake (default 0.4).

    Returns:
        Dictionary with:
            "prediction": "fake" or "real".
            "confidence": probability of the predicted class.
            "probabilities": {"real": ..., "fake": ...} softmax outputs.
    """
    # Load as mono and resample to 16kHz; the returned sample rate is not
    # needed because it is fixed by the `sr` argument.
    audio, _ = librosa.load(audio_path, sr=16000, mono=True)

    # Extract features
    inputs = feature_extractor(
        audio, sampling_rate=16000, return_tensors="pt", padding=True
    )
    # Tensors must live on the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference without gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    # Convert to probabilities.
    # NOTE(review): assumes class index 0 = real and 1 = fake - confirm
    # against model.config.id2label.
    probs = torch.nn.functional.softmax(logits, dim=-1)
    prob_real = probs[0][0].item()
    prob_fake = probs[0][1].item()

    # Make prediction
    prediction = "fake" if prob_fake >= threshold else "real"

    # Report the probability of the class that was actually predicted. With a
    # threshold other than 0.5, max(prob_real, prob_fake) can belong to the
    # class that was NOT chosen (e.g. prob_fake=0.45 at threshold 0.4).
    confidence = prob_fake if prediction == "fake" else prob_real

    return {
        "prediction": prediction,
        "confidence": confidence,
        "probabilities": {"real": prob_real, "fake": prob_fake},
    }


# Test on audio file
result = predict_audio("sample_audio.mp3", threshold=0.40)
print(f"Prediction: {result['prediction']} ({result['confidence']:.2%})")