# Attacks on Neural Networks in a Lightweight Speech Pseudonymization Pipeline

## Imports

In [4]:
import ASR_2024_anonymization_module_learning.speaker_anonymization as pipeline
import ASR_2024_anonymization_module_learning.speaker_anonymization.optimize as pipeline_optimize
import ASR_2024_anonymization_module_learning.speaker_anonymization.asr as eaf
import ASR_2024_anonymization_module_learning.speaker_anonymization.spi as freef

import util
from backdoored_dataset import BackdooredVCTK
from attacks.jingleback import JingleBack
from metrics import attack_success_rate, clean_accuracy_drop

import os
import warnings

import torch
from torch.utils.data import DataLoader

from torchattacks.attacks.fgsm import FGSM
from torchattacks.attacks.pgd import PGD

## Preparation

In [2]:
# Dataset-selection filters shared by both configurations; every one is None.
# presumably None means "do not filter on this attribute" - confirm in pipeline.config.Config
_speaker_filters = dict(gender=None, min_age=None, max_age=None, accent=None, region=None)

# Configuration passed to the training-side code: 5 trials, 10 speakers x 10 samples each.
pipeline_config_train = pipeline.config.Config(
    num_trials=5, n_speakers=10, n_samples_per_speaker=10, **_speaker_filters
)

# Configuration passed to the test-side dataset: 1 trial, 10 speakers x 100 samples each.
pipeline_config_test = pipeline.config.Config(
    num_trials=1, n_speakers=10, n_samples_per_speaker=100, **_speaker_filters
)

# Make sure the backdoored-data folder and its train/test sub-folders exist.
_backdoor_root = pipeline_config_train.BACKDOORED_FOLDER
for _folder in (
    _backdoor_root,
    os.path.join(_backdoor_root, "train"),
    os.path.join(_backdoor_root, "test"),
):
    os.makedirs(_folder, exist_ok=True)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including ones that may matter; consider narrowing the filter.
warnings.filterwarnings("ignore")

# Fixed seed so repeated runs are comparable.
util.set_global_seed(3131)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Report which GPU was picked up.
if device.type == "cuda":
    print(torch.cuda.get_device_name())

cuda
NVIDIA GeForce GTX 1050


## Creating the Backdoor: JingleBack Attack

In [3]:
# Label passed to the attack as its target and reused later when measuring attack success.
TARGET_SPEAKER_ID = 0

# presumably source_label=None means "any source speaker" - confirm in attacks.jingleback
jingleback_attack = JingleBack(
    source_label=None,
    target_label=TARGET_SPEAKER_ID,
)

# Test-side dataset built with the JingleBack attack at poisoning_rate=1.0.
# presumably this applies the trigger to every test sample - confirm in backdoored_dataset
backdoored_test_set = BackdooredVCTK(
    jingleback_attack,
    poisoning_rate=1.0,
    train=False,
    pipeline_config=pipeline_config_test,
)

# One sample per batch, in a fixed order.
backdoored_test_loader = DataLoader(
    backdoored_test_set,
    batch_size=1,
    shuffle=False,
)

2024-06-05 10:57:44,432 - INFO - Loading data from cache: d:/Datasets/vctk/cache\cache_10_100_None_None_None_None_None.pkl


## Assessing the Clean Models

In [8]:
# Build the models without any backdoored training data and measure how often the
# triggered test samples are already classified as TARGET_SPEAKER_ID.
#
# Alternative (kept for reference) that goes through the optimisation entry point and
# additionally returns clean WER / ASV accuracy:
# asr_processor, asr_model, asv_model, clean_wer, clean_asv_acc, loss = pipeline_optimize.optimize_audio_effects(pipeline_config_train, stop_after_model_evaluation=True)
asr_processor, asr_model = eaf.load_pretrained_model(pipeline_config_train)
asv_model = freef.SpeakerIdentificationModel(num_speakers=pipeline_config_train.N_SPEAKERS, CONFIG=pipeline_config_train)

print(type(asr_model))
print(type(asv_model))

# NOTE(review): the recorded run of this cell aborted during evaluation with
# "Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size:
# [1, 1, 1, 42363]". Presumably the loader's batches carry an extra singleton dimension
# for the Wav2Vec2 model - confirm in metrics.attack_success_rate / BackdooredVCTK.
clean_asr_asr = attack_success_rate(asr_model, backdoored_test_loader, target_label=TARGET_SPEAKER_ID, source_label=None, device=device)
# Fixed: this previously passed asr_model a second time, so the "ASV" figure was just a
# repeat of the ASR one. The speaker-identification model is evaluated here instead.
clean_asv_asr = attack_success_rate(asv_model, backdoored_test_loader, target_label=TARGET_SPEAKER_ID, source_label=None, device=device)

# print("WER:", clean_wer)
# print("ASV Acc:", clean_asv_acc)
print("ASR ASR:", clean_asr_asr)
print("ASV ASR:", clean_asv_asr)

Some weights of the model checkpoint at Somebody433/fine-tuned-vctkdataset were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Somebody433/fine-tuned-vctkdataset and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRA

Initialized model with 10 speakers for fine-tuning.
<class 'transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC'>
<class 'ASR_2024_anonymization_module_learning.speaker_anonymization.spi.SpeakerIdentificationModel'>


  0%|          | 0/1000 [00:00<?, ?it/s]


RuntimeError: Expected 2D (unbatched) or 3D (batched) input to conv1d, but got input of size: [1, 1, 1, 42363]

## Assessing Backdoored Models

In [None]:
# Sweep over poisoning rates: for each rate, build a poisoned training set, run the
# pipeline's model evaluation on it and measure the backdoor's attack success rate on
# the triggered test loader.
poisoning_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
word_error_rates = []
asc_accuracies = []  # holds the ASV accuracies (mean_asv_acc); name kept for compatibility
attack_success_rates = []

for poisoning_rate in poisoning_rates:
    backdoored_train_set = BackdooredVCTK(jingleback_attack, poisoning_rate=poisoning_rate, train=True, pipeline_config=pipeline_config_train)
    # Not consumed inside this cell (the dataset itself is handed to the pipeline below);
    # kept because other cells may rely on it.
    backdoored_train_loader = DataLoader(backdoored_train_set, batch_size=10, shuffle=True)

    asr_processor, asr_model, asv_model, mean_wer, mean_asv_acc, loss = pipeline_optimize.optimize_audio_effects(pipeline_config_train, backdoored_vctk=backdoored_train_set, stop_after_model_evaluation=True)
    # Fixed: the attack targets a speaker label (TARGET_SPEAKER_ID), so its success rate is
    # measured on the speaker-identification model. This previously passed asr_model, the
    # call form whose recorded run failed in conv1d on the loader's batches.
    # NOTE(review): `asr` here means "attack success rate", not "speech recognition".
    asr = attack_success_rate(asv_model, backdoored_test_loader, target_label=TARGET_SPEAKER_ID, source_label=None, device=device)

    word_error_rates.append(mean_wer)
    asc_accuracies.append(mean_asv_acc)
    attack_success_rates.append(asr)

    print("Poisoning Rate:", poisoning_rate)
    print("    Word Error Rate:", mean_wer)
    print("    ASV Accuracy:", mean_asv_acc)
    print("    Attack Success Rate:", asr)

## Evasion Attacks: FGSM & PGD

In [None]:
# FGSM evasion attack with a perturbation budget of eps = 8/255.
# NOTE(review): `model` is not defined in any cell visible above - confirm which model
# (and which attribute holding the underlying network) is meant before running this cell.
fgsm = FGSM(
    model.neural_network,
    eps=8 / 255,
)

# NOTE: Targeted-by-label mode means that, when attacking the model, the target label
# must be passed manually by the caller: fgsm(audio, target_label).
fgsm.set_mode_targeted_by_label()

In [None]:
# PGD evasion attack: 10 steps of size alpha = 2/255 within a budget of eps = 8/255,
# starting from a random point.
# Fixed: this attack used to be assigned to `fgsm`, which silently replaced the FGSM
# attack object created in the previous cell. It now has its own name.
# NOTE(review): `model` is not defined in any cell visible above - confirm which model
# (and which attribute holding the underlying network) is meant before running this cell.
pgd = PGD(model.neural_network, eps=8/255, alpha=2/255, steps=10, random_start=True)
pgd.set_mode_targeted_by_label() #NOTE: This means that, when attacking the model, you should pass the target label manually/yourself. So pgd(audio, target_label).