## Generalization

In this notebook, we measure the generalization ability of our various models. We use models trained on ASVspoof data and test their performance on FakeAVCeleb.

In [None]:
from azureml.fsspec import AzureMachineLearningFileSystem
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import os
from transformer import FakeAVCeleb
import torch

## AST

In [None]:
# AST configuration: remote filesystem handle, checkpoint path and feature extractor.
# NOTE(review): the datastore URI below looks truncated/redacted -- confirm the full URI.
root_dir = "azureml:"
fs = AzureMachineLearningFileSystem(root_dir)
# Used later both as the remote path that is listed and as the local download directory.
model_path = "checkpoint/"
# Feature extractor loaded from the AST AudioSet checkpoint on the Hub.
feature_extractor = AutoFeatureExtractor.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")

## WAV2VEC

In [None]:
# Wav2Vec2 configuration. Running this cell rebinds `model_path` and
# `feature_extractor` from the AST cell, so only the most recently executed
# of the two configuration cells is in effect for the rest of the notebook.
model_path = "/home/azureuser/checkpoint/"
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

In [None]:
# Clip length cap: preprocess_function keeps at most
# feature_extractor.sampling_rate * max_duration samples per clip.
max_duration = 6

In [None]:
def preprocess_function(examples):
    """Clip each audio array in a batch and run the feature extractor on it.

    Every entry of ``examples["audio"]`` has its ``"array"`` cut to at most
    ``feature_extractor.sampling_rate * max_duration`` samples. The clipped
    arrays are then passed to the module-level ``feature_extractor`` with
    truncation disabled and no ``max_length``, and its output is returned.
    """
    sample_rate = feature_extractor.sampling_rate
    sample_cap = int(sample_rate * max_duration)

    clipped = []
    for item in examples["audio"]:
        clipped.append(item["array"][:sample_cap])

    return feature_extractor(
        clipped,
        sampling_rate=sample_rate,
        max_length=None,
        truncation=False,
    )


In [None]:
from transformer import ASVSpoofDataset
# Load a small ASVspoof sample.
# NOTE(review): `max_size` presumably caps the number of examples -- confirm
# against the ASVSpoofDataset implementation. `avdata` is not referenced by
# any other cell visible in this notebook.
avdata = ASVSpoofDataset(max_size=10).load_data()

In [None]:
# FakeAVCeleb evaluation data used by the cells below.
# NOTE(review): `max_size` presumably caps the number of examples -- confirm.
dataset = FakeAVCeleb(max_size=1000).load_data()

In [None]:
# Run preprocess_function over the dataset in batches, dropping the raw
# "audio" and "filename" columns from the result.
# NOTE(review): the evaluation loop further down extracts features from
# `dataset` directly and does not read `encoded_dataset`.
encoded_dataset = dataset.map(preprocess_function, remove_columns=["audio", "filename"], batched=True)


In [None]:
# Class names declared by the dataset's "label" feature, and how many there are.
# `num_labels` is passed to the model loader below.
labels = dataset.features["label"].names
num_labels = len(labels)

# Lookup tables between the FakeAVCeleb category codes and string class ids,
# in both directions. The GBDT section of this notebook maps 'bonafide' to
# 'C' and 'spoof' to 'D'.
label2id = {'C': '0', 'D': '1'}
id2label = {class_id: code for code, class_id in label2id.items()}

In [None]:
# Lookup tables between the ASVspoof label names and string class ids,
# in both directions.
av_label2id = {'bonafide': '0', 'spoof': '1'}
av_id2label = {class_id: name for name, class_id in av_label2id.items()}

In [None]:
# The local download directory reuses the same path string as the remote
# checkpoint location.
local_dir = model_path # "../temp_model"

In [None]:
# Make sure the local target directory exists before downloading into it.
os.makedirs(local_dir, exist_ok=True)

# Copy every file listed under the remote checkpoint path into local_dir.
for remote_path in fs.ls(model_path, detail=False, recursive=True):
    if fs.isfile(remote_path):  # the listing may contain directories; skip them
        fs.get(remote_path, local_dir)

In [None]:
# Load the downloaded checkpoint as an audio-classification model, attaching
# the FakeAVCeleb label tables to its configuration.
# NOTE(review): with ignore_mismatched_sizes=True, a checkpoint whose
# classifier shape differs from `num_labels` presumably loads with a freshly
# initialised head instead of raising -- confirm `num_labels` matches the
# checkpoint, otherwise the evaluation below is not measuring the trained model.
model_kwargs = dict(
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)
loaded_model = AutoModelForAudioClassification.from_pretrained(local_dir, **model_kwargs)

In [None]:
import random

# Ground-truth labels and model predictions, appended in matching order.
y_true = []
y_pred = []

# Sample evaluation indices from the dataset that was actually loaded, so an
# index can never fall outside it and index 0 is eligible too. At most 1000
# examples are evaluated.
num_eval_samples = min(1000, len(dataset))
idxs = random.sample(range(len(dataset)), num_eval_samples)

for idx in idxs:
    # Fetch the row once and reuse it for both the audio and the label.
    example = dataset[idx]
    inputs = feature_extractor(
        example["audio"]["array"],
        sampling_rate=feature_extractor.sampling_rate,
        return_tensors="pt",
    )  # .to("cuda:0")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = loaded_model(**inputs).logits
    # One example per forward pass, so the argmax over the class axis is a scalar.
    predicted_class_id = torch.argmax(logits, dim=-1).item()

    # The label is stored inverted here; the reporting cell that follows
    # inverts it again before computing metrics.
    y_true.append(1 - example["label"])
    y_pred.append(predicted_class_id)


In [None]:
from sklearn.metrics import classification_report

# The evaluation loop stores `1 - label`, so invert once more to recover the
# dataset's own label ids. The result is bound to a new name instead of
# overwriting `y_true`, so re-running this cell does not flip the labels again.
y_true_restored = [1 - y for y in y_true]
print(classification_report(y_true_restored, y_pred))

### GBDT Comparison

In [None]:
from pickle import load
import sklearn

# Load the previously trained booster from disk.
# NOTE(security): unpickling executes arbitrary code from the file -- only
# load pickles that come from a trusted source.
with open("booster_audio_len_6_max_depth_8_n_est_400.pkl", "rb") as f:
    booster = load(f)

### Load FakeAVCeleb Data for Booster

In [None]:
import os
import sys
sys.path.insert(0, '../') 


import numpy as np
import pandas as pd
import librosa
from azureml.fsspec import AzureMachineLearningFileSystem
from xgbooster.generate import Features
from sklearn.model_selection import train_test_split


# Settings read by load_fakeavceleb_data below.
# slice_size: each clip is cut to int(sr * slice_size) samples; None disables the cut.
slice_size = 6
# Object whose make_features(segment, sr) builds the feature vector for one clip.
feature_generator = Features()
# When True, load_fakeavceleb_data skips the train/test split.
train_only = True

def load_fakeavceleb_data():
    """Load FakeAVCeleb audio and build the feature matrix for the booster.

    Reads ``metadata.csv`` from the datastore, loads every file listed in its
    ``new_filename`` column with librosa, cuts each clip to
    ``int(sr * slice_size)`` samples (unless ``slice_size`` is ``None``) and
    passes the samples to ``feature_generator.make_features``.

    Returns:
        A ``(X, y)`` pair of NumPy arrays: the per-file features and the
        values of the metadata ``category`` column, in the same order.
    """
    # NOTE(review): the datastore URI looks truncated/redacted -- confirm.
    train_dir = "azureml://"
    fs = AzureMachineLearningFileSystem(train_dir)

    metadata = pd.read_csv(f"{train_dir}/metadata.csv")
    filenames = [f"{train_dir}/{name}" for name in metadata['new_filename'].to_list()]
    labels = metadata['category'].to_list()

    train_features = []
    for path in filenames:
        # Audio files are binary, so open them in "rb"; text mode would try
        # to decode the bytes as characters before librosa sees them.
        with fs.open(path, 'rb') as f:
            segment, sr = librosa.load(f)

        if slice_size is not None:
            segment = segment[:int(sr * slice_size)]

        train_features.append(feature_generator.make_features(segment, sr))

    X_train = np.array(train_features)
    y_train = np.array(labels)
    print(f"loaded train audio, y_train contains {len(y_train)} samples")

    if not train_only:
        # NOTE(review): only the training part of the split is returned; the
        # held-out 33% is discarded because callers expect a 2-tuple.
        X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=0.33, random_state=0)
    else:
        print("skipping loading test audio")
    return X_train, y_train

# Features and category labels for the FakeAVCeleb clips. Despite the
# "train" names, these arrays are what the booster is evaluated on below.
X_train, y_train = load_fakeavceleb_data()



In [None]:
# Booster predictions for every FakeAVCeleb feature row. This rebinds
# `y_pred`, replacing the predictions collected in the earlier evaluation cell.
y_pred = booster.predict(X_train)

In [None]:
# Translate the booster's 'spoof' / 'bonafide' outputs into the 'D' / 'C'
# codes so they can be compared with the FakeAVCeleb category labels.
trans = {'spoof': 'D', 'bonafide': 'C'}
converted_y_pred = [trans[prediction] for prediction in y_pred]

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics for the booster: metadata categories as ground truth
# against the translated predictions.
print(classification_report(y_train, converted_y_pred))