In [None]:
# Install fairseq; the next cell imports it to load the pre-trained checkpoint in SSLModel.
# NOTE(review): the version is not pinned, so a fresh run may resolve a different release.
!pip install fairseq

In [None]:
import random
from typing import Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import fairseq
import argparse

class SSLModel(nn.Module):
    """Front-end that wraps a model loaded from a fairseq checkpoint.

    The checkpoint is loaded once at construction time and moved to ``device``.
    ``extract_feat`` then returns the ``'x'`` entry of the model output.

    Args:
        device: Device the wrapped model is moved to and inputs are sent to.
        cp_path: Optional path to the fairseq checkpoint. When omitted, the
            notebook's original hard-coded location (``DEFAULT_CP_PATH``) is used.
    """

    # Original hard-coded checkpoint location, kept as the default.
    DEFAULT_CP_PATH = '/kaggle/input/w2v2_scoof/pytorch/default/1/xlsr2_300m.pt'

    def __init__(self, device, cp_path=None):
        super().__init__()
        if cp_path is None:
            cp_path = self.DEFAULT_CP_PATH

        # The task is built explicitly and handed to the loader together with the checkpoint.
        task_arg = argparse.Namespace(task='audio_pretraining')
        task = fairseq.tasks.setup_task(task_arg)
        model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path], task=task)

        # The loader returns a list of models; only the first one is used.
        self.model = model[0].to(device)  # Move the model to the specified device only once
        self.device = device
        # Hard-coded feature width; assumed to match the checkpoint -- TODO confirm.
        self.out_dim = 1024

    def extract_feat(self, input_data):
        """Run the wrapped model in feature-extraction mode.

        Args:
            input_data: Tensor of rank 2, or rank 3 in which case only index 0
                of the last axis is kept.

        Returns:
            The ``'x'`` entry of the wrapped model's output
            (presumably ``[batch, length, dim]`` -- verify against fairseq).
        """
        # Ensure input is on the correct device
        input_data = input_data.to(self.device)

        # Adjust input shape to (batch, length) if necessary
        input_tmp = input_data[:, :, 0] if input_data.ndim == 3 else input_data

        emb = self.model(input_tmp, mask=False, features_only=True)['x']
        return emb


class PSFAN_Backend(nn.Module):
    """Four-stage dilated Conv1d classifier with sigmoid gating.

    Every stage applies a dilated 3-tap convolution, multiplies the result by a
    sigmoid gate computed from a 1x1 -> 3x3 -> 1x1 convolution stack, and halves
    the time axis with max pooling. The time-averaged output of every stage is
    concatenated and passed through two linear layers.

    Input:  (batch, input_channels, time)
    Output: (batch, num_classes)
    """

    def __init__(self, input_channels=128, num_classes=2):
        super().__init__()

        # (in_channels, out_channels, dilation) of the four stages, in order.
        stage_specs = (
            (input_channels, 128, 1),
            (128, 128, 2),
            (128, 256, 3),
            (256, 256, 4),
        )

        # Attribute names and creation order follow the established layout
        # (conv{i}, conv1x1_{2i-1}, conv3x3_{i}, conv1x1_{2i}, attention{i}, pool{i})
        # so that state-dict keys and seeded initialisation are unchanged.
        for stage, (c_in, c_out, dilation) in enumerate(stage_specs, start=1):
            setattr(self, f"conv{stage}",
                    nn.Conv1d(c_in, c_out, kernel_size=3, dilation=dilation, padding=dilation))
            setattr(self, f"conv1x1_{2 * stage - 1}", nn.Conv1d(c_out, c_out, kernel_size=1))
            setattr(self, f"conv3x3_{stage}", nn.Conv1d(c_out, c_out, kernel_size=3, padding=1))
            setattr(self, f"conv1x1_{2 * stage}", nn.Conv1d(c_out, c_out, kernel_size=1))
            setattr(self, f"attention{stage}", nn.Sigmoid())
            setattr(self, f"pool{stage}", nn.MaxPool1d(kernel_size=3, stride=2, padding=1))

        # One global-average-pooling layer per stage output.
        for stage in range(1, len(stage_specs) + 1):
            setattr(self, f"gap{stage}", nn.AdaptiveAvgPool1d(1))

        # Classifier head: concatenated stage summaries (128 + 128 + 256 + 256) -> 16 -> classes.
        summary_width = sum(c_out for _, c_out, _ in stage_specs)
        self.fc_concat = nn.Linear(summary_width, 16)
        self.fc_out = nn.Linear(16, num_classes)

        self.activation = nn.LeakyReLU(0.02)

    def _run_stage(self, stage, features):
        """Apply stage ``stage``; return (pooled feature map, time-averaged summary)."""
        hidden = getattr(self, f"conv{stage}")(features)

        # Gate path: the even-numbered 1x1 conv comes first, the odd-numbered one last.
        gate = getattr(self, f"conv1x1_{2 * stage}")(hidden)
        gate = getattr(self, f"conv3x3_{stage}")(gate)
        gate = getattr(self, f"conv1x1_{2 * stage - 1}")(gate)
        gate = getattr(self, f"attention{stage}")(gate)

        hidden = getattr(self, f"pool{stage}")(gate * hidden)
        # GAP leaves a trailing axis of size 1; drop it to get (batch, channels).
        summary = getattr(self, f"gap{stage}")(hidden).squeeze(-1)
        return hidden, summary

    def forward(self, x):
        """Classify a (batch, channels, time) feature map; returns (batch, num_classes)."""
        stage_summaries = []
        for stage in (1, 2, 3, 4):
            x, summary = self._run_stage(stage, x)
            stage_summaries.append(summary)

        # Shape: (batch, 768)
        x_concat = torch.cat(stage_summaries, dim=1)

        hidden = self.activation(self.fc_concat(x_concat))
        return self.fc_out(hidden)


class Model(nn.Module):
    """End-to-end classifier: SSL front-end -> linear projection -> PSFAN back-end.

    Args:
        device: Device on which every sub-module is placed and to which inputs
            are moved in ``forward``.
    """

    def __init__(self, device):
        super().__init__()
        self.device = device

        # Front-end feature extractor (loads its checkpoint on construction).
        self.ssl_model = SSLModel(device)

        # Project the front-end features down to the 128 channels the back-end expects.
        projection = nn.Linear(self.ssl_model.out_dim, 128)
        self.LL = projection.to(device)

        # Conv1d back-end producing the two-class output.
        backend = PSFAN_Backend(input_channels=128, num_classes=2)
        self.backend = backend.to(device)

    def forward(self, x):
        """Return the back-end output for input ``x``."""
        on_device = x.to(self.device)

        ssl_features = self.ssl_model.extract_feat(on_device)
        projected = self.LL(ssl_features)

        # Conv1d wants channels before time, so swap axes 1 and 2.
        channels_first = projected.transpose(1, 2)

        return self.backend(channels_first)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the full network on the chosen device.
model = Model(device)

In [None]:
# Restore saved weights into `model`; map_location places the loaded tensors on `device`.
# NOTE(review): presumably these are the fine-tuned weights for the whole network -- confirm.
model.load_state_dict(torch.load('/kaggle/input/w2v2_scoof/pytorch/default/1/w2v2_scoof.pth', map_location=device))

In [None]:
import os

def get_data_for_dataset(path):
    """Read utterance ids and binary labels from a whitespace-separated file.

    For every non-blank line, the second field is taken as the utterance id
    and the last field as the class name. ``"bonafide"`` maps to label 1,
    anything else to 0.

    Args:
        path: Path of the text file to read.

    Returns:
        Tuple ``(ids_list, label_list)`` of equal length, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    ids_list = []
    label_list = []
    with open(path, "r") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue
            utt_id, label_name = fields[1], fields[-1]
            ids_list.append(utt_id)
            label_list.append(1 if label_name == "bonafide" else 0)
    return ids_list, label_list

def get_data_for_evaldataset(path):
    """List the entries of directory ``path`` in sorted order.

    ``os.listdir`` returns names in arbitrary order, so the result is sorted
    to make the evaluation order (and the score file built from it)
    reproducible across runs and machines.

    Args:
        path: Directory containing the evaluation files.

    Returns:
        Sorted list of entry names (not full paths).
    """
    ids_list = sorted(os.listdir(path))
    return ids_list

In [None]:
from torch.utils.data import Dataset, DataLoader
import soundfile as sf

def pad_random(x, max_len=64600):
    """Return exactly ``max_len`` samples of ``x`` along its first axis.

    A longer signal is cropped at a random offset drawn with
    ``np.random.randint``; a shorter (or equal-length) signal is repeated
    end-to-end and then truncated.

    Args:
        x: Array whose first axis is time. The tiling branch assumes a 1-D
            signal -- TODO confirm callers never pass multi-channel audio.
        max_len: Number of samples to return.

    Returns:
        Array holding ``max_len`` samples.

    Raises:
        ValueError: If ``x`` is empty and therefore cannot be repeated.
    """
    x_len = x.shape[0]

    if x_len > max_len:
        # randint's upper bound is exclusive; the +1 lets the final window
        # (the one ending on the last sample) be selected as well.
        stt = np.random.randint(x_len - max_len + 1)
        return x[stt:stt + max_len]

    if x_len == 0:
        raise ValueError("pad_random: cannot pad an empty signal")

    # One repetition more than fits, so the truncation below always has enough samples.
    num_repeats = max_len // x_len + 1
    padded_x = np.tile(x, num_repeats)[:max_len]
    return padded_x


def pad(x, max_len=64600):
    """Deterministically return ``max_len`` samples of ``x`` along its first axis.

    A signal that is long enough is truncated from the start; a shorter one is
    repeated end-to-end and then truncated.

    Args:
        x: Array whose first axis is time. The tiling branch assumes a 1-D
            signal -- TODO confirm callers never pass multi-channel audio.
        max_len: Number of samples to return.

    Returns:
        Array holding ``max_len`` samples.

    Raises:
        ValueError: If ``x`` is empty and ``max_len`` is positive.
    """
    x_len = x.shape[0]
    if x_len >= max_len:
        return x[:max_len]

    if x_len == 0:
        raise ValueError("pad: cannot pad an empty signal")

    # One repetition more than fits, so the truncation below always has enough samples.
    num_repeats = max_len // x_len + 1
    padded_x = np.tile(x, num_repeats)[:max_len]
    return padded_x

class EvalDataset(Dataset):
    """Dataset yielding ``(waveform tensor, file name)`` pairs for evaluation.

    Args:
        ids: Sequence of file names located under ``dir_path``.
        dir_path: Directory that contains the audio files.
        pad_fn: Callable ``(audio, length)`` used to bring every clip to a
            fixed length.
        cut: Target length handed to ``pad_fn``.
    """

    def __init__(self, ids, dir_path, pad_fn=pad_random, cut=64600):
        self.ids, self.dir_path = ids, dir_path
        self.cut, self.pad_fn = cut, pad_fn

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        path_to_wav = f"{self.dir_path}/{self.ids[index]}"
        # The sample rate returned by the reader is not used.
        waveform, _rate = sf.read(path_to_wav)
        fixed_length = self.pad_fn(waveform, self.cut)
        return Tensor(fixed_length), self.ids[index]

In [None]:
# Run configuration shared by the cells below.
# Of these keys, only "batch_size" and "num_workers" are read by the code visible in this
# notebook (inside get_dataloaders); the other entries are not referenced here.
# NOTE(review): "model" and "d_args" look like leftovers from a different architecture, and
# "device" is not what selects the device (the separate `device` variable is) -- confirm
# before relying on them.
config = {
  "model": "ResCapsGuard",
  "batch_size": 8,
  "d_args": {
      "nb_samp": 64600,
      "first_conv": 128,
      "filts": [70, [1, 32], [32, 32], [32, 64], [64, 64]]
  },
  "device": "cuda:0",
  "num_class": 2,
  "gpu_id": 0,
  "dropout": 0.05,
  "random_size": 0.01,
  "num_iterations": 2,
  "gamma": 0.5,
  "step_size": 10,
  "produced_file": "pruduced_file.txt",
  "num_workers": 6
}

In [None]:
def get_dataloaders(datasets, config):
    """Wrap the available dataset splits in ``DataLoader`` objects.

    Args:
        datasets: Mapping that may contain the keys ``"train"``, ``"dev"`` and
            ``"eval"``. Missing or falsy entries are skipped.
        config: Mapping providing ``"batch_size"`` and ``"num_workers"``.

    Returns:
        Dict from split name to its ``DataLoader``; only the training loader
        shuffles.
    """
    # (split name, shuffle?) in the order the loaders are inserted.
    split_plan = (("train", True), ("dev", False), ("eval", False))

    dataloaders = {}
    for split, should_shuffle in split_plan:
        split_data = datasets.get(split)
        if not split_data:
            continue
        dataloaders[split] = DataLoader(
            split_data,
            batch_size=config["batch_size"],
            shuffle=should_shuffle,
            num_workers=config["num_workers"],
        )

    return dataloaders

In [None]:
# Extra packages for the evaluation cells.
# NOTE(review): `soundfile` is already imported by an earlier cell, so on a fresh kernel this
# install runs too late. The `progressbar` package does not appear to be imported anywhere in
# this notebook (a local `progressbar` function is defined instead) -- confirm it is needed.
!pip install progressbar
!pip install soundfile

In [None]:
import sys
import time
import torch.nn as nn

def progressbar(it, prefix="", size=60, out=sys.stdout):  # Python3.6+
    """Yield the items of ``it`` while drawing a one-line text progress bar.

    After each item has been consumed, the bar is redrawn on the same line
    (carriage return, no newline) with the item count, the elapsed time and
    an estimate of the remaining time. A blank line is printed at the end.

    Args:
        it: Sized iterable (``len(it)`` must work).
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Stream the bar is written to.
    """
    total = len(it)
    started_at = time.time()

    def as_clock(seconds):
        # Minutes are zero-padded to two digits, seconds printed as SS.ss.
        minutes, secs = divmod(seconds, 60)
        return f"{int(minutes):02}:{secs:05.2f}"

    def redraw(done):
        filled = int(size * done / total)
        # Remaining time extrapolates the average time per item so far.
        remaining = ((time.time() - started_at) / done) * (total - done)
        elapsed = time.time() - started_at
        elapsed_str = as_clock(elapsed)
        remaining_str = as_clock(remaining)

        print(f"{prefix}[{u'█' * filled}{('.' * (size - filled))}] {done}/{total} time {elapsed_str} / {remaining_str}",
              end='\r', file=out, flush=True)

    done = 0
    for item in it:
        yield item
        done += 1
        redraw(done)
    print("\n", flush=True, file=out)

# Called with parentheses: the bare `@torch.inference_mode` form is only accepted by
# newer PyTorch releases, while the called form works wherever inference_mode exists.
@torch.inference_mode()
def produce_submit_file(data_loader,
                            model,
                            device,
                            save_path,
                            random=False,
                            dropout=0):
    """
    Score every utterance served by ``data_loader`` with ``model``.

    Nothing is written to disk here; the caller receives the ids and scores
    and is responsible for saving them.

    args:
        data_loader: loader yielding ``(batch, utterance ids)`` pairs
        model: network whose output has one column per class
        device: device the input batches are moved to
        save_path: not used by this function (kept for interface compatibility)
        random: not used by this function (kept for interface compatibility)
        dropout: not used by this function (kept for interface compatibility)
    returns:
        (fname_list, score_list): utterance ids and, for each one, the softmax
        probability of class index 1
    """

    # turning model into evaluation mode
    model.eval()

    # list of utterance ids and list of scores for the matching id
    fname_list = []
    score_list = []

    # inference (gradient tracking is already disabled by the decorator)
    for batch_x, utt_id in progressbar(data_loader, prefix='computing cm score'):
        batch_x = batch_x.to(device)

        # Calling the module (rather than .forward) keeps registered hooks working.
        batch_out = model(batch_x)
        prob = nn.functional.softmax(batch_out, dim=1)

        # 1 - for bonafide speech class
        batch_score = prob[:, 1].detach().cpu().numpy().ravel()

        # add outputs
        fname_list.extend(utt_id)
        score_list.extend(batch_score.tolist())
    assert len(fname_list) == len(score_list)

    return fname_list, score_list

In [None]:
# Directory holding the evaluation audio files.
path_wav = '/kaggle/input/safe-speak-2024-audio-spoof-detection-hackathon/wavs'
# Passed below as `save_path`, which produce_submit_file does not read.
out_path = 'output_hz'
# Every entry of the directory becomes one evaluation item.
eval_ids = get_data_for_evaldataset(path_wav)

# Use the deterministic `pad` instead of the dataset's default `pad_random`.
eval_dataset = EvalDataset(eval_ids, path_wav, pad)
# NOTE(review): `eval_dataset` is rebound from the dataset to a dict keyed by split name,
# which is the shape get_dataloaders expects.
eval_dataset = {
    "eval": eval_dataset
}
dataloader = get_dataloaders(eval_dataset, config)

# Run inference; returns the file names and their class-1 probabilities.
fname_list, score_list = produce_submit_file(dataloader["eval"], model, device, out_path)

In [None]:
import pandas as pd
out_path = 'output_score.csv'

# Strip only a trailing ".wav"; any other part of the file name is left untouched.
submission_ids = [
    fn[:-len(".wav")] if fn.endswith(".wav") else fn
    for fn in fname_list
]

# Build the table directly instead of writing a space-separated file and re-parsing it:
# the re-parse re-infers column types (e.g. an ID such as "000123" would come back as 123)
# and breaks on names that contain spaces.
df = pd.DataFrame({"ID": submission_ids, "score": score_list})
df.to_csv(out_path, index=False)
print("Scores saved to {}".format(out_path))