# DeepFake Audio Detection using MFCC, Spectral Contrast, and Chroma Features

In [26]:
# Import required libraries
import os
import glob
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [27]:
## Data Loading and Exploration

In [28]:
# Set paths
genuine_dir = "real_audio"
deepfake_dir = "deepfake_audio"

# List audio files.
# glob.glob() yields matches in arbitrary, filesystem-dependent order. The
# lists are sorted so that the sample order -- and therefore the seeded
# train/test split performed later in the notebook -- is the same on every
# machine and every run.
genuine_files = sorted(glob.glob(os.path.join(genuine_dir, "*.wav")))
deepfake_files = sorted(glob.glob(os.path.join(deepfake_dir, "*.wav")))

print(f"Number of genuine audio files: {len(genuine_files)}")
print(f"Number of deepfake audio files: {len(deepfake_files)}")

# Display first few files from each directory
for heading, files in (
    ("\nSample genuine files:", genuine_files),
    ("\nSample deepfake files:", deepfake_files),
):
    print(heading)
    for file in files[:3]:
        print(f"- {os.path.basename(file)}")

Number of genuine audio files: 28
Number of deepfake audio files: 29

Sample genuine files:
- 2.wav
- 21.wav
- 22.wav

Sample deepfake files:
- 1.wav
- 10.wav
- 11.wav


In [29]:
## Feature Extraction

In [30]:
# Sanity-check the feature pipeline on one genuine clip before the full run
sample_file = genuine_files[0]
print(f"Testing with file: {os.path.basename(sample_file)}")

# sr=None keeps the file's own sampling rate instead of resampling
audio_data, sr = librosa.load(sample_file, sr=None)

# Frame-level features; each result is a 2-D array (see printed shapes)
mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)

# Collect the report lines first, then emit them in one go
shape_report = [
    f"MFCC shape: {mfccs.shape}",
    f"Spectral Contrast shape: {spectral_contrast.shape}",
    f"Chroma shape: {chroma.shape}",
]
print(f"\nFeature shapes:")
print("\n".join(shape_report))

Testing with file: 2.wav

Feature shapes:
MFCC shape: (13, 129)
Spectral Contrast shape: (7, 129)
Chroma shape: (12, 129)


In [31]:
# Inspect row index 1 of the MFCC matrix computed above (one value per frame)
mfccs[1]

array([110.98104  , 124.308205 , 125.77492  , 139.12149  , 142.5968   ,
       121.15544  , 106.72789  , 116.64188  , 132.10913  , 120.67848  ,
       103.57238  , 109.26511  , 107.32579  , 101.04616  , 104.75711  ,
       117.84329  ,  94.80696  ,  81.006424 ,  93.52504  ,  65.56325  ,
        13.509817 , -15.664465 ,  -7.859823 ,  50.064995 ,  92.541626 ,
        46.84996  ,   6.117628 , -13.147669 ,   4.1916666,  71.09328  ,
       121.19969  , 125.125336 , 123.256966 , 112.307816 ,  64.22829  ,
        24.728376 ,  -6.2048264, -18.366722 ,   6.095236 ,  79.736206 ,
       127.98952  , 115.89934  , 118.50145  , 120.33327  , 102.3519   ,
        90.82309  ,  90.83484  , 103.466354 , 122.49209  , 114.05806  ,
       136.58801  , 140.2486   , 136.08212  , 152.21625  , 148.69955  ,
       142.88342  , 149.73776  , 142.38037  , 128.85352  , 113.64252  ,
       108.19478  , 113.51654  , 109.33081  ,  38.54744  ,  33.68409  ,
        55.053444 ,  84.11131  ,  95.67732  ,  96.47676  ,  93.8

In [32]:
## Process All Files

In [33]:
def extract_mean_features(audio_path):
    """Return one time-averaged feature vector for a single audio file.

    The file is loaded at its own sampling rate (``sr=None``). MFCCs
    (``n_mfcc=13``), spectral contrast and chroma are computed per frame,
    each is averaged over the frame axis, and the three mean vectors are
    concatenated in that order.

    Args:
        audio_path: Path of the audio file to load.

    Returns:
        1-D ``numpy.ndarray`` holding the concatenated per-feature means.

    Raises:
        Whatever ``librosa`` raises for unreadable or invalid audio; the
        caller decides how to handle it.
    """
    audio_data, sr = librosa.load(audio_path, sr=None)

    frame_features = (
        librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13),
        librosa.feature.spectral_contrast(y=audio_data, sr=sr),
        librosa.feature.chroma_stft(y=audio_data, sr=sr),
    )

    # Each matrix is (n_features, n_frames); transposing and averaging over
    # axis 0 collapses the frame axis and keeps one value per feature.
    return np.concatenate([np.mean(feature.T, axis=0) for feature in frame_features])


# Initialize lists for features and labels
X = []
y = []

# Label convention: 0 for genuine, 1 for deepfake
for label, file_list in ((0, genuine_files), (1, deepfake_files)):
    for audio_path in file_list:
        try:
            combined_features = extract_mean_features(audio_path)
        except Exception as e:
            # Best effort: report the unreadable file and keep going
            print(f"Error processing {audio_path}: {e}")
            continue

        X.append(combined_features)
        y.append(label)

# Fail with a clear message instead of an IndexError on X.shape[1] below
if not X:
    raise RuntimeError(
        "No audio files could be processed; check that the audio directories "
        "exist and contain readable .wav files."
    )

X = np.array(X)
y = np.array(y)

print(f"Dataset shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Class distribution: {np.bincount(y)}")

Dataset shape: (57, 32)
Number of features: 32
Class distribution: [28 29]


In [34]:
## Data Preprocessing

In [35]:

# Label of the first sample (0 = genuine, 1 = deepfake)
y[0]

0

In [36]:
# Raw (unscaled) feature vector of the first sample; values follow the
# concatenation order used above: MFCC means, spectral-contrast means, chroma means
X[0]

array([-188.71859741,   93.33286285,   16.54830742,   20.61961746,
        -21.47842789,   -6.78594542,  -19.66224861,   -6.61803722,
        -10.62405396,   -9.78885746,  -16.24817848,    0.54749703,
         -9.83342648,   22.96271403,   15.92412325,   18.0747372 ,
         17.0788109 ,   16.74899691,   16.90173891,   18.42033942,
          0.35680041,    0.32371879,    0.29096279,    0.27365458,
          0.27435064,    0.27167463,    0.32886067,    0.36258084,
          0.47570193,    0.60425293,    0.52554685,    0.50603938])

In [37]:
# Hold out 20% of the samples for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the scaler on the training portion only, then apply the same
# transformation to both portions
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Testing set shape:", X_test_scaled.shape)

Training set shape: (45, 32)
Testing set shape: (12, 32)


In [38]:
# First training sample before scaling (X_train, not X_train_scaled)
X_train[0]

array([-3.28472961e+02,  1.45085388e+02, -8.57061481e+00,  3.30707397e+01,
        9.10528469e+00, -6.08886957e+00, -2.97273159e+00, -9.12046623e+00,
       -1.31005116e+01, -8.00512981e+00, -1.65830498e+01, -5.28410316e-01,
       -1.91628289e+00,  2.60099876e+01,  1.93268223e+01,  2.29305940e+01,
        2.01788594e+01,  1.92786839e+01,  2.36084617e+01,  2.26102482e+01,
        4.49965179e-01,  3.26792032e-01,  2.28594303e-01,  1.78528398e-01,
        1.77021176e-01,  2.20385313e-01,  2.39351675e-01,  2.76793361e-01,
        2.75653124e-01,  2.90136188e-01,  4.44750637e-01,  4.96096939e-01])

In [39]:
## Model Training and Evaluation

In [40]:
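# NOTE: this commented-out cell is superseded by evaluate_model() and the
# kernel loop in the cells below; it is kept only for reference.
# # Train models with different kernels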
# kernels = ['linear', 'rbf', 'poly']
# results = {}

# for kernel in kernels:
#     # Train model
#     svm = SVC(kernel=kernel, random_state=42, probability=True)
#     svm.fit(X_train_scaled, y_train)
    
#     # Evaluate
#     y_pred = svm.predict(X_test_scaled)
#     accuracy = accuracy_score(y_test, y_pred)
#     results[kernel] = {
#         'accuracy': accuracy,
#         'model': svm,
#         'predictions': y_pred
#     }
    
#     print(f"\nResults for {kernel} kernel:")
#     print(f"Accuracy: {accuracy:.4f}")
#     print("\nClassification Report:")
#     print(classification_report(y_test, y_pred))
#     print("\nConfusion Matrix:")
#     print(confusion_matrix(y_test, y_pred))

# # Get best model
# best_kernel = max(results.items(), key=lambda x: x[1]['accuracy'])[0]
# best_model = results[best_kernel]['model']

# print(f"\nBest model: {best_kernel} kernel with accuracy: {results[best_kernel]['accuracy']:.4f}")

In [41]:
def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model`` on the training data and report its test-set metrics.

    The estimator is fitted in place, used to predict ``X_test``, and the
    accuracy, classification report and confusion matrix are printed under
    a heading containing ``model_name``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        model_name: Label used in the printed heading.

    Returns:
        dict with keys ``'accuracy'`` (test accuracy), ``'model'`` (the
        fitted estimator) and ``'predictions'`` (predictions for ``X_test``).
    """
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score = accuracy_score(y_test, predictions)

    # Printed report
    print(f"\nResults for {model_name}:")
    print(f"Accuracy: {score:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions))

    outcome = dict(accuracy=score, model=model, predictions=predictions)
    return outcome


In [42]:
kernels = ['linear', 'rbf', 'poly']

# Train and evaluate one SVC per kernel; entries are keyed "SVM_<kernel>".
# probability=True is needed so the fitted model exposes predict_proba.
results = {
    f"SVM_{kernel}": evaluate_model(
        SVC(kernel=kernel, random_state=42, probability=True),
        X_train_scaled,
        y_train,
        X_test_scaled,
        y_test,
        f"SVM_{kernel}",
    )
    for kernel in kernels
}



Results for SVM_linear:
Accuracy: 0.5833

Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.67      0.62         6
           1       0.60      0.50      0.55         6

    accuracy                           0.58        12
   macro avg       0.59      0.58      0.58        12
weighted avg       0.59      0.58      0.58        12


Confusion Matrix:
[[4 2]
 [3 3]]

Results for SVM_rbf:
Accuracy: 0.7500

Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.83      0.77         6
           1       0.80      0.67      0.73         6

    accuracy                           0.75        12
   macro avg       0.76      0.75      0.75        12
weighted avg       0.76      0.75      0.75        12


Confusion Matrix:
[[5 1]
 [2 4]]

Results for SVM_poly:
Accuracy: 0.8333

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0

In [43]:
# Logistic-regression baseline on the same scaled split
log_reg = LogisticRegression(max_iter=1000, random_state=42)
results["Logistic_Regression"] = evaluate_model(
    log_reg,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    model_name="Logistic Regression",
)


Results for Logistic Regression:
Accuracy: 0.6667

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         6
           1       0.67      0.67      0.67         6

    accuracy                           0.67        12
   macro avg       0.67      0.67      0.67        12
weighted avg       0.67      0.67      0.67        12


Confusion Matrix:
[[4 2]
 [2 4]]


In [44]:
# Decision-tree baseline on the same scaled split
dt = DecisionTreeClassifier(random_state=42)
results["Decision_Tree"] = evaluate_model(
    dt,
    X_train_scaled,
    y_train,
    X_test_scaled,
    y_test,
    model_name="Decision Tree",
)


Results for Decision Tree:
Accuracy: 0.7500

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.71      0.83      0.77         6

    accuracy                           0.75        12
   macro avg       0.76      0.75      0.75        12
weighted avg       0.76      0.75      0.75        12


Confusion Matrix:
[[4 2]
 [1 5]]


In [45]:
# Pick the entry with the highest stored accuracy; on a tie the model that
# was added to `results` first wins.
# NOTE: the stored accuracy is the test-set accuracy, so the score reported
# for the winner is an optimistic estimate.
best_model_name = max(results, key=lambda name: results[name]['accuracy'])
best_entry = results[best_model_name]
best_model = best_entry['model']
best_accuracy = best_entry['accuracy']
print(f"\nBest model: {best_model_name} with accuracy: {best_accuracy:.4f}")


Best model: SVM_poly with accuracy: 0.8333


In [46]:
# Parallel lists of model names and their accuracies, in insertion order
model_names = list(results)
accuracies = [results[name]['accuracy'] for name in model_names]

In [47]:
## Save Model

In [48]:
# Persist the selected classifier together with the scaler fitted on the
# training data; both are needed to score new audio later.
for artifact, filename in (
    (best_model, "enhanced_svm_model.pkl"),
    (scaler, "enhanced_scaler.pkl"),
):
    joblib.dump(artifact, filename)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!


In [49]:
## Test Model

In [50]:
def test_audio(audio_path, model=None, feature_scaler=None):
    """Classify one audio file as genuine or deepfake.

    The file goes through the same steps used to build the training set:
    load at the native sampling rate, compute MFCC (``n_mfcc=13``), spectral
    contrast and chroma, average each over frames, concatenate, then scale.

    Args:
        audio_path: Path of the audio file to classify.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``. Defaults to the notebook-level ``best_model``.
        feature_scaler: Fitted scaler exposing ``transform``. Defaults to
            the notebook-level ``scaler``.

    Returns:
        A sentence stating the predicted class ("genuine" for label 0,
        "deepfake" otherwise) and the model's probability for that class as
        a percentage.
    """
    # Fall back to the objects created earlier in the notebook
    if model is None:
        model = best_model
    if feature_scaler is None:
        feature_scaler = scaler

    # Load and process audio
    audio_data, sr = librosa.load(audio_path, sr=None)

    # Extract features
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=13)
    spectral_contrast = librosa.feature.spectral_contrast(y=audio_data, sr=sr)
    chroma = librosa.feature.chroma_stft(y=audio_data, sr=sr)

    # Calculate means (same expression as in training so the values match)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
    chroma_mean = np.mean(chroma.T, axis=0)

    # Combine features in the training order: MFCC, spectral contrast, chroma
    features = np.concatenate((mfccs_mean, spectral_contrast_mean, chroma_mean))

    # Scale features; the scaler expects a 2-D (n_samples, n_features) array
    features_scaled = feature_scaler.transform(features.reshape(1, -1))

    # Predict
    predicted_label = model.predict(features_scaled)[0]
    class_probabilities = model.predict_proba(features_scaled)[0]

    # predict_proba columns are ordered like model.classes_, so look up the
    # column of the predicted label rather than using the label as an index.
    # NOTE: for SVC with probability=True the probabilities come from a
    # separate calibration step and may occasionally disagree with predict().
    class_index = list(model.classes_).index(predicted_label)
    confidence = class_probabilities[class_index] * 100

    result = "genuine" if predicted_label == 0 else "deepfake"

    return f"The audio is classified as {result} with {confidence:.2f}% confidence"

# Smoke-test the classifier on one file.
# NOTE: this path lies inside the genuine-audio directory used to build the
# dataset, so the clip may have been seen during training.
test_file = "real_audio/21.wav"  # Replace with actual test file path
verdict = test_audio(test_file)
print(verdict)

The audio is classified as genuine with 91.56% confidence
