In [1]:
# DeepFake Audio Detection using MFCC Features

In [2]:
# Import required libraries
import os
import glob
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

In [3]:
## Data Loading and Exploration

In [4]:
# Set paths
genuine_dir = "real_audio"
deepfake_dir = "deepfake_audio"

# List audio files
genuine_files = glob.glob(os.path.join(genuine_dir, "*.wav"))
deepfake_files = glob.glob(os.path.join(deepfake_dir, "*.wav"))

print(f"Number of genuine audio files: {len(genuine_files)}")
print(f"Number of deepfake audio files: {len(deepfake_files)}")

# Display first few files from each directory
print("\nSample genuine files:")
for file in genuine_files[:3]:
    print(f"- {os.path.basename(file)}")

print("\nSample deepfake files:")
for file in deepfake_files[:3]:
    print(f"- {os.path.basename(file)}")

Number of genuine audio files: 26
Number of deepfake audio files: 27

Sample genuine files:
- 2.wav
- 21.wav
- 22.wav

Sample deepfake files:
- 1.wav
- 10.wav
- 11.wav


In [5]:
## Feature Extraction

In [6]:
# Test feature extraction on one file
sample_file = genuine_files[0]
print(f"Testing with file: {os.path.basename(sample_file)}")

# Load audio
audio_data, sr = librosa.load(sample_file, sr=None)

# Extract features
mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)

print(f"\nFeature shapes:")
print(f"MFCC shape: {mfccs.shape}")
print(f"Spectral Contrast shape: {spectral_contrast.shape}")
print(f"Chroma shape: {chroma.shape}")

Testing with file: 2.wav

Feature shapes:
MFCC shape: (13, 129)
Spectral Contrast shape: (7, 129)
Chroma shape: (12, 129)


In [7]:
## Process All Files

In [8]:
# Initialize lists for features and labels
X = []
y = []

# Process genuine files
for audio_path in genuine_files:
    try:
        # Load and extract features
        audio_data, sr = librosa.load(audio_path, sr=None)
        
        # Extract features
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
        chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)
        
        # Calculate means
        mfccs_mean = np.mean(mfccs.T, axis=0)
        spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
        chroma_mean = np.mean(chroma.T, axis=0)
        
        # Combine features
        combined_features = np.concatenate((mfccs_mean, spectral_contrast_mean, chroma_mean))
        
        X.append(combined_features)
        y.append(0)  # 0 for genuine
        
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

# Process deepfake files
for audio_path in deepfake_files:
    try:
        # Load and extract features
        audio_data, sr = librosa.load(audio_path, sr=None)
        
        # Extract features
        mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
        spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
        chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)
        
        # Calculate means
        mfccs_mean = np.mean(mfccs.T, axis=0)
        spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
        chroma_mean = np.mean(chroma.T, axis=0)
        
        # Combine features
        combined_features = np.concatenate((mfccs_mean, spectral_contrast_mean, chroma_mean))
        
        X.append(combined_features)
        y.append(1)  # 1 for deepfake
        
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")

X = np.array(X)
y = np.array(y)

print(f"Dataset shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Class distribution: {np.bincount(y)}")

Dataset shape: (53, 32)
Number of features: 32
Class distribution: [26 27]


In [9]:
## Data Preprocessing

In [10]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

Training set shape: (42, 32)
Testing set shape: (11, 32)


In [11]:
## Model Training and Evaluation

In [12]:
# Train models with different kernels
kernels = ['linear', 'rbf', 'poly']
results = {}

for kernel in kernels:
    # Train model
    svm = SVC(kernel=kernel, random_state=42, probability=True)
    svm.fit(X_train_scaled, y_train)
    
    # Evaluate
    y_pred = svm.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    results[kernel] = {
        'accuracy': accuracy,
        'model': svm,
        'predictions': y_pred
    }
    
    print(f"\nResults for {kernel} kernel:")
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Get best model
best_kernel = max(results.items(), key=lambda x: x[1]['accuracy'])[0]
best_model = results[best_kernel]['model']

print(f"\nBest model: {best_kernel} kernel with accuracy: {results[best_kernel]['accuracy']:.4f}")


Results for linear kernel:
Accuracy: 0.7273

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.71      0.83      0.77         6

    accuracy                           0.73        11
   macro avg       0.73      0.72      0.72        11
weighted avg       0.73      0.73      0.72        11


Confusion Matrix:
[[3 2]
 [1 5]]

Results for rbf kernel:
Accuracy: 0.7273

Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.71      0.83      0.77         6

    accuracy                           0.73        11
   macro avg       0.73      0.72      0.72        11
weighted avg       0.73      0.73      0.72        11


Confusion Matrix:
[[3 2]
 [1 5]]

Results for poly kernel:
Accuracy: 0.8182

Classification Report:
              precision    recall  f1-score   support

           0       1.

In [13]:
## Save Model

In [14]:
# Save model and scaler
joblib.dump(best_model, "enhanced_svm_model.pkl")
joblib.dump(scaler, "enhanced_scaler.pkl")
print("Model and scaler saved successfully!")

Model and scaler saved successfully!


In [15]:
## Test Model

In [16]:
def test_audio(audio_path):
    # Load and process audio
    audio_data, sr = librosa.load(audio_path, sr=None)
    
    # Extract features
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)
    
    # Calculate means
    mfccs_mean = np.mean(mfccs.T, axis=0)
    spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
    chroma_mean = np.mean(chroma.T, axis=0)
    
    # Combine features
    features = np.concatenate((mfccs_mean, spectral_contrast_mean, chroma_mean))
    
    # Scale features
    features_scaled = scaler.transform(features.reshape(1, -1))
    
    # Predict
    prediction = best_model.predict(features_scaled)
    probability = best_model.predict_proba(features_scaled)[0]
    
    result = "genuine" if prediction[0] == 0 else "deepfake"
    confidence = probability[prediction[0]] * 100
    
    return f"The audio is classified as {result} with {confidence:.2f}% confidence"

# Test with a sample file
test_file = "real_audio/21.wav"  # Replace with actual test file path
print(test_audio(test_file))

The audio is classified as genuine with 96.40% confidence
