# AI Voice Detector Training
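Run this notebook on Google Colab with a GPU runtime (Runtime > Change runtime type > T4 GPU). It downloads a Kaggle dataset of human and AI-generated speech, extracts Wav2Vec2 embeddings from each clip, trains a random forest classifier on them, and saves the result as `trained_model.joblib`.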

In [None]:
!pip install kagglehub transformers torch librosa scikit-learn joblib numpy

In [None]:
import kagglehub
import os
from pathlib import Path

# Fetch the Kaggle speech dataset and keep the returned location as a Path
# so later cells can glob inside it.
path = kagglehub.dataset_download(
    "kambingbersayaphitam/speech-dataset-of-human-and-ai-generated-voices"
)
DATASET_DIR = Path(path)
print("Dataset downloaded to:", DATASET_DIR)

In [None]:
import torch
import librosa
import numpy as np
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import joblib

# Run on the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Pretrained Wav2Vec2 checkpoint: `processor` prepares raw waveforms for the
# network, `model` (moved to the selected device) produces the hidden states.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
model = model.to(device)

def extract_features(filepath):
    """Return a mean-pooled Wav2Vec2 embedding for one audio file.

    The file is decoded with librosa at 16 kHz, run through the module-level
    ``processor`` and ``model`` on ``device``, and the model's
    ``last_hidden_state`` is averaged over dimension 1.

    Args:
        filepath: Path to an audio file that ``librosa.load`` can read.

    Returns:
        The pooled embedding for the file as a NumPy array (first element of
        the batch), or ``None`` when the clip is empty or any step fails.
        Failures are printed together with the offending path so that
        skipped samples can be traced afterwards.
    """
    try:
        # Load audio (resample to 16k for Wav2Vec2); the returned rate is
        # not needed because it is fixed by the `sr` argument.
        audio, _ = librosa.load(filepath, sr=16000)

        # A zero-length clip cannot be embedded; report it explicitly rather
        # than relying on an opaque downstream error.
        if len(audio) == 0:
            print(f"Error processing {filepath}: audio is empty")
            return None

        # Tokenize
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
        input_values = inputs.input_values.to(device)

        # Extract (no gradients needed for feature extraction)
        with torch.no_grad():
            hidden_states = model(input_values).last_hidden_state
            # Mean pooling
            embeddings = hidden_states.mean(dim=1)

        # Single-file batch: return its only row.
        return embeddings.cpu().numpy()[0]
    except Exception as e:
        # Best-effort by design: one unreadable file must not abort the whole
        # run, but the message names the file so the skip is diagnosable.
        print(f"Error processing {filepath}: {e}")
        return None

In [None]:
# Prepare Data
# Collect .wav/.mp3 files that sit directly inside any directory whose name
# contains "Real" / "Fake". The lists are sorted because rglob does not
# guarantee an order, and sample order determines the (unshuffled) CV folds.
real_files = sorted(list(DATASET_DIR.rglob("*Real*/*.wav")) + list(DATASET_DIR.rglob("*Real*/*.mp3")))
fake_files = sorted(list(DATASET_DIR.rglob("*Fake*/*.wav")) + list(DATASET_DIR.rglob("*Fake*/*.mp3")))

print(f"Found {len(real_files)} Real samples")
print(f"Found {len(fake_files)} Fake samples")

X = []
y = []
skipped_count = 0

# One pass per class; labels: 0 = HUMAN, 1 = AI.
for class_name, class_files, class_label in (("Real", real_files, 0), ("Fake", fake_files, 1)):
    print(f"Processing {class_name} files...")
    for f in class_files:
        emb = extract_features(f)
        if emb is None:
            # extract_features already printed the reason; just keep count.
            skipped_count += 1
            continue
        X.append(emb)
        y.append(class_label)

if skipped_count:
    print(f"Skipped {skipped_count} files that could not be processed")

# Fail here with a clear message instead of a cryptic error during training
# when the glob patterns matched nothing or every file of a class failed.
if len(set(y)) < 2:
    raise ValueError(
        f"Need at least one usable Real and one usable Fake sample under {DATASET_DIR}; "
        f"got {y.count(0)} Real and {y.count(1)} Fake"
    )

X = np.array(X)
y = np.array(y)
print(f"Final dataset shape: {X.shape}")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest over the pooled embeddings; the fixed seed keeps runs repeatable.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# 5-fold cross-validated accuracy, reported as mean with a two-sigma spread.
scores = cross_val_score(clf, X, y, cv=5)
print(f"Accuracy: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

# Final fit on every available sample; this is the estimator that gets exported.
clf.fit(X, y)

# Bundle the estimator with identifying metadata and write it to disk.
output_data = dict(
    model=clf,
    type='rf_wav2vec2',
    version='1.0',
)
joblib.dump(output_data, "trained_model.joblib")
print("Model saved as trained_model.joblib - Download this file!")