In [3]:
import torch
print(torch.__version__)
import soundfile as sf

2.6.0+cpu


In [5]:
import os
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import numpy as np
import librosa
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [7]:
from pydub import AudioSegment

def convert_mp3_to_wav(mp3_path, wav_path, target_sr=16000):
    audio = AudioSegment.from_mp3(mp3_path)
    audio = audio.set_frame_rate(target_sr).set_channels(1)
    audio.export(wav_path, format="wav")
    print(f"Converted {mp3_path} to {wav_path} at {target_sr} Hz.")




In [9]:
# Preprocessing Function
def preprocess_audio(input_dir, output_dir, sr=16000, n_mels=128):
    os.makedirs(output_dir, exist_ok=True)
    for label in ['real', 'fake']:
        input_class_dir = os.path.join(input_dir, label)
        output_class_dir = os.path.join(output_dir, label)
        os.makedirs(output_class_dir, exist_ok=True)

        for file in os.listdir(input_class_dir):
            if file.endswith('.wav'):
                y, sr = librosa.load(os.path.join(input_class_dir, file), sr=sr)
                mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
                mel_db = librosa.power_to_db(mel, ref=np.max)
                output_file = os.path.join(output_class_dir, file.replace('.wav', '.npy'))
                np.save(output_file, mel_db)



In [11]:
class PrecomputedAudioDataset(Dataset):
    def __init__(self, data_dir, target_shape=(128, 300)):
        self.file_paths = []
        self.labels = []
        self.target_shape = target_shape  # Fixed size for all spectrograms

        for label in ['real', 'fake']:
            class_dir = os.path.join(data_dir, label)
            for filename in os.listdir(class_dir):
                if filename.endswith('.npy'):
                    self.file_paths.append(os.path.join(class_dir, filename))
                    self.labels.append(0 if label == 'real' else 1)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        spec = np.load(self.file_paths[idx])

        # Get target shape
        target_mels, target_frames = self.target_shape

        # Padding or cropping logic
        if spec.shape[1] < target_frames:
            pad_width = target_frames - spec.shape[1]
            spec = np.pad(spec, ((0, 0), (0, pad_width)), mode='constant')
        else:
            spec = spec[:, :target_frames]

        # Convert to tensor and add channel dimension
        spec_tensor = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return spec_tensor, label_tensor

In [13]:
# Model
class DeepFakeAudioDetector(nn.Module):
    def __init__(self):
        super(DeepFakeAudioDetector, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, 3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, 3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        self.fc = nn.Linear(64, 2)

    def forward(self, x):
        x = self.cnn(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

In [31]:
# Train and Validate with Early Stopping & Checkpointing
def train_model(data_dir, epochs=20, batch_size=8, lr=1e-4, patience=5):
    dataset = PrecomputedAudioDataset(data_dir)
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_set, val_set = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_set, batch_size=batch_size, num_workers=0)

    device = torch.device("cpu")
    model = DeepFakeAudioDetector().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, val_acc = validate(model, val_loader, device, criterion)

        print(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        # Early Stopping & Checkpoint
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), 'C://HAMMAD/AI Data/DeepFake Audio/best_deepfake_detector.pth')
            print("Saved new best model!\n")
        else:
            patience_counter += 1
            print(f"Patience: {patience_counter}/{patience}")
            if patience_counter >= patience:
                print("Early stopping triggered!")
                break

    # Plot final confusion matrix
    plot_confusion_matrix(model, val_loader, device)

In [17]:
# Validation with Loss & Accuracy
def validate(model, val_loader, device, criterion):
    model.eval()
    total_loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    avg_loss = total_loss / len(val_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

In [19]:
# Confusion Matrix Plot
def plot_confusion_matrix(model, val_loader, device):
    model.load_state_dict(torch.load('best_deepfake_detector.pth'))
    model.eval()

    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            predictions = outputs.argmax(dim=1).cpu().numpy()
            y_pred.extend(predictions)
            y_true.extend(labels.numpy())

    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Real', 'Fake'])
    disp.plot(cmap='Blues')
    plt.title("Confusion Matrix")
    plt.show()

In [21]:
# Usage
if __name__ == "__main__":
    preprocess_audio(
        "C://HAMMAD/AI Data/archive/for-original/for-original/training",  # Dataset folder
        "C://HAMMAD/AI Data/pectrogram deepfake"  # Processed spectrogram folder
    )



    # Step 2: Train with early stopping & checkpointing
    train_model("C://HAMMAD/AI Data/pectrogram deepfake", epochs=20, batch_size=8, lr=1e-4, patience=5)

  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


Epoch 1 | Train Loss: 0.4453 | Val Loss: 0.3676 | Val Acc: 87.21%
Saved new best model!

Epoch 2 | Train Loss: 0.3018 | Val Loss: 0.2720 | Val Acc: 87.21%
Saved new best model!

Epoch 3 | Train Loss: 0.2490 | Val Loss: 0.2527 | Val Acc: 87.21%
Saved new best model!

Epoch 4 | Train Loss: 0.2154 | Val Loss: 0.2040 | Val Acc: 89.03%
Saved new best model!

Epoch 5 | Train Loss: 0.1928 | Val Loss: 0.2094 | Val Acc: 91.38%
Patience: 1/5
Epoch 6 | Train Loss: 0.1768 | Val Loss: 0.1842 | Val Acc: 92.69%
Saved new best model!

Epoch 7 | Train Loss: 0.1668 | Val Loss: 0.1649 | Val Acc: 90.60%
Saved new best model!

Epoch 8 | Train Loss: 0.1610 | Val Loss: 0.1549 | Val Acc: 91.64%
Saved new best model!

Epoch 9 | Train Loss: 0.1501 | Val Loss: 0.1490 | Val Acc: 95.30%
Saved new best model!

Epoch 10 | Train Loss: 0.1449 | Val Loss: 0.1494 | Val Acc: 92.95%
Patience: 1/5
Epoch 11 | Train Loss: 0.1344 | Val Loss: 0.1341 | Val Acc: 96.34%
Saved new best model!

Epoch 12 | Train Loss: 0.1317 | Val L

FileNotFoundError: [Errno 2] No such file or directory: 'best_deepfake_detector.pth'

In [29]:
# 1. Create the model
model = DeepFakeAudioDetector()

# 2. Load the trained weights
model.load_state_dict(torch.load('C://HAMMAD/AI Data/DeepFake Audio/best_deepfake_detector.pth'))

# 3. Set model to evaluation mode
model.eval()


def predict_single_file(file_path, model, target_sr=16000, target_shape=(128, 300)):
    import os
    import torch
    import numpy as np
    import librosa

    device = torch.device("cpu")

    if file_path.endswith('.mp3'):
        y, sr = librosa.load(file_path, sr=target_sr)
        wav_path = os.path.splitext(file_path)[0] + "_converted.wav"
        sf.write(wav_path, y, sr)
        file_path = wav_path
        print(f"Converted mp3 to wav: {file_path}")

    y, sr = librosa.load(file_path, sr=target_sr)
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    target_mels, target_frames = target_shape
    if mel_spec_db.shape[1] < target_frames:
        pad_width = target_frames - mel_spec_db.shape[1]
        mel_spec_db = np.pad(mel_spec_db, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mel_spec_db = mel_spec_db[:, :target_frames]

    input_tensor = torch.tensor(mel_spec_db, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        prediction = torch.argmax(output, dim=1).item()
        label = "REAL" if prediction == 0 else "FAKE"
        print(f"Prediction: Audio file is '{label}'")

# 4. Ask user for input path
file_path = input("Enter path to audio file (.mp3 or .wav): ")

# 5. Predict!
predict_single_file(file_path, model)

Enter path to audio file (.mp3 or .wav):  C://HAMMAD/AI Data/New folder/Ai audio check.mp3


Converted mp3 to wav: C://HAMMAD/AI Data/New folder/Ai audio check_converted.wav
Prediction: Audio file is 'FAKE'
