In [31]:
import os
import librosa
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt

# Define dataset paths
real_audio_path = "./archive/dev/real"
fake_audio_path = "./archive/dev/fake"

# Load real and fake audio files
real_files = [os.path.join(real_audio_path, f) for f in os.listdir(real_audio_path) if f.endswith('.wav')]
fake_files = [os.path.join(fake_audio_path, f) for f in os.listdir(fake_audio_path) if f.endswith('.wav')]

print(f"Total Real Audio Files: {len(real_files)}")
print(f"Total Fake Audio Files: {len(fake_files)}")


Total Real Audio Files: 2548
Total Fake Audio Files: 10295


In [2]:
# now embed a watermark in real audio using audioseal
import librosa
import torch
from audioseal import AudioSeal
model = AudioSeal.load_generator("audioseal_wm_16bits")

In [8]:
# ab ispe algo lga ke dekhte hai mfcc features pe
import os
import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical


In [10]:
# basically ye tera binary classification problem hai ya to fake hoga ya real hoga
def extract_features(file_path, n_mfcc=13):
    try:
        y, sr = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        return np.mean(mfccs, axis=1)  # Take mean of each MFCC coefficient
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None


In [16]:
import random

# Define paths for real and fake audio files
real_audio_path = "./archive/dev/real/"
fake_audio_path = "./archive/dev/fake/"

# real_files = random.sample(os.listdir(real_audio_path), 50)
# fake_files = random.sample(os.listdir(fake_audio_path), 50)


real_files = os.listdir(real_audio_path)  
fake_files = os.listdir(fake_audio_path)
# Prepare dataset list
data = []
labels = []

# Process real audios
for file in real_files:
    file_path = os.path.join(real_audio_path, file)
    features = extract_features(file_path)
    if features is not None:
        data.append(features)
        labels.append(1)  # 1 for real audio

# Process fake audios
for file in fake_files:
    file_path = os.path.join(fake_audio_path, file)
    features = extract_features(file_path)
    if features is not None:
        data.append(features)
        labels.append(0)  # 0 for spoofed audio

# Convert to NumPy arrays
X = np.array(data)
y = np.array(labels)
print("Feature Shape:", X.shape, "Labels Shape:", y.shape)


Feature Shape: (12843, 13) Labels Shape: (12843,)


In [13]:
import os
import librosa
import numpy as np
import pandas as pd
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


In [17]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a classifier (Random Forest / SVM)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf = SVC(kernel='linear', C=1.0)  # Use SVM instead if needed

# Train the model
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9793694044375243
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      2036
           1       0.98      0.92      0.95       533

    accuracy                           0.98      2569
   macro avg       0.98      0.96      0.97      2569
weighted avg       0.98      0.98      0.98      2569



In [18]:
# now let us apply svm on it
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model= SVC(kernel='linear', C=1.0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8871156091864538
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      2036
           1       0.75      0.68      0.72       533

    accuracy                           0.89      2569
   macro avg       0.83      0.81      0.82      2569
weighted avg       0.88      0.89      0.89      2569



In [24]:
import librosa
import torch
import soundfile as sf
from audioseal import AudioSeal
import numpy as np

# Load model
model = AudioSeal.load_generator("audioseal_wm_16bits")

def addWaterMark(audioPath, outputPath):
    wav, sr = librosa.load(audioPath, sr=16000)

    # Convert wav to a PyTorch tensor and add batch + channel dimensions
    wav_tensor = torch.tensor(wav, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # Shape: (1, 1, T)

    # Pass the tensor to the model
    watermark = model.get_watermark(wav_tensor, sr)
    

    # Convert watermark to NumPy
    watermark_np = watermark.squeeze().detach().cpu().numpy()  # Remove extra dimensions & move to CPU

    # Ensure both arrays have the same shape
    if watermark_np.shape != wav.shape:
        watermark_np = np.resize(watermark_np, wav.shape)  # Resize watermark if needed

    # Add the watermark to the original audio
    watermarked_audio = wav + watermark_np

    # Save the watermarked file
    sf.write(outputPath, watermarked_audio, sr)

# Example Usage
addWaterMark("./archive/dev/real/B_0000_5_A.wav", "./watermark/watermarked.wav")


In [26]:
# now i have to extract features from both of them and look at the difference of water_marked and not watermarked audio
original_features = extract_features("./archive/dev/real/B_0000_5_A.wav")
original_features


array([-130.33653  ,  130.84656  ,   -1.77593  ,   36.66478  ,
          2.545489 ,   20.664501 ,    0.7507467,   -0.776362 ,
          3.1035469,    3.3764336,    6.7592845,    3.764939 ,
          2.588349 ], dtype=float32)

In [28]:
watermarked_features = extract_features("./watermark/watermarked.wav")
watermarked_features

array([-130.48146   ,  130.727     ,   -1.7407396 ,   36.790726  ,
          2.5953507 ,   20.539675  ,    0.5089865 ,   -0.94782275,
          3.1446311 ,    3.571531  ,    6.891216  ,    3.6549296 ,
          2.3008022 ], dtype=float32)

In [34]:
# hlka hlka change hua hai to ab agr hum lgye algo to kya frk aata hai lets see
from tqdm import tqdm

# Assuming real_files is a list of audio file paths
cnt=0
for audioPath in tqdm(real_files, desc="Processing Audio Files"):
    # print(audioPath)
    output_path = f"./watermark/watermarked_{cnt}.wav"
    addWaterMark(audioPath, output_path)
    cnt=cnt+1

Processing Audio Files: 100%|██████████| 2548/2548 [29:25<00:00,  1.44it/s] 


In [35]:
# ab waatermarked audio files pe model lgate hai featues extract krne ke baaad
import random

# Define paths for real and fake audio files
real_audio_path = "./watermark/"
fake_audio_path = "./archive/dev/fake/"

# real_files = random.sample(os.listdir(real_audio_path), 50)
# fake_files = random.sample(os.listdir(fake_audio_path), 50)


real_files = os.listdir(real_audio_path)  
fake_files = os.listdir(fake_audio_path)
# Prepare dataset list
data2 = []
labels2 = []

# Process real audios
for file in real_files:
    file_path = os.path.join(real_audio_path, file)
    features = extract_features(file_path)
    if features is not None:
        data2.append(features)
        labels2.append(1)  # 1 for real audio

# Process fake audios
for file in fake_files:
    file_path = os.path.join(fake_audio_path, file)
    features = extract_features(file_path)
    if features is not None:
        data2.append(features)
        labels2.append(0)  # 0 for spoofed audio

# Convert to NumPy arrays
X = np.array(data)
y = np.array(labels)
print("Feature Shape:", X.shape, "Labels Shape:", y.shape)


Feature Shape: (12843, 13) Labels Shape: (12843,)


In [36]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose a classifier (Random Forest / SVM)
clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf = SVC(kernel='linear', C=1.0)  # Use SVM instead if needed

# Train the model
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9793694044375243
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      2036
           1       0.98      0.92      0.95       533

    accuracy                           0.98      2569
   macro avg       0.98      0.96      0.97      2569
weighted avg       0.98      0.98      0.98      2569



In [37]:
# lets apply svm now
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model= SVC(kernel='linear', C=1.0)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.8871156091864538
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.94      0.93      2036
           1       0.75      0.68      0.72       533

    accuracy                           0.89      2569
   macro avg       0.83      0.81      0.82      2569
weighted avg       0.88      0.89      0.89      2569

