# Baseline adversarial attacks for LMS CNN and ESC-10
This notebook performs four typical adversarial attacks (FGSM, BIM, DeepFool, and Carlini & Wagner) on a CNN classifier trained for ESC-10.

Target model: [Ahmed et al., 2020](https://www.researchgate.net/publication/344519283_Automatic_Environmental_Sound_Recognition_AESR_Using_Convolutional_Neural_Network)

Data: [ESC-10](https://github.com/karolpiczak/ESC-50)

Attacks: [FGSM](https://arxiv.org/abs/1412.6572), [BIM](https://arxiv.org/abs/1607.02533), [DeepFool](https://arxiv.org/abs/1511.04599), [Carlini & Wagner](https://arxiv.org/abs/1608.04644)

Attack implementations based on [torchattacks](https://github.com/Harry24k/adversarial-attacks-pytorch)

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchsummary import summary

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import Audio
from tqdm.auto import tqdm

import random
import os
import shutil

import librosa
from scipy.io import wavfile

In [2]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and asks cuDNN to use deterministic
    kernels so that repeated runs give the same results.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. subprocesses);
    # the hash seed of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    # cuDNN autotuning may select different kernels from run to run; switch it
    # off and request deterministic algorithms instead.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [17]:
# Dataset layout: <DATA_DIR>/<DATASET>/meta/esc10.csv and <DATA_DIR>/<DATASET>/audio
DATA_DIR, DATASET = "Data", "ESC-10"
DATASET_PATH = os.path.join(DATA_DIR, DATASET)
META_PATH = os.path.join(DATA_DIR, DATASET, "meta", "esc10.csv")
AUDIO_PATH = os.path.join(DATA_DIR, DATASET, "audio")

# Location of the saved checkpoint file
WEIGHTS_DIR = "Weights"
CHECKPT_PATH = os.path.join(WEIGHTS_DIR, "cnn_best.pth")

In [5]:
# Deserialize onto the CPU so a checkpoint written on a GPU machine can still
# be opened on a CPU-only one. Anything that needs to live on `device` can be
# moved there explicitly afterwards (load_state_dict copies into the model's
# own parameters regardless of where the loaded tensors sit).
checkpoint = torch.load(CHECKPT_PATH, map_location="cpu")

In [19]:
# Sample rate (Hz) requested when loading the audio clips
SR = 16_000

## Target model
___

In [6]:
class CNN(nn.Module):
    """Four-stage convolutional classifier with a two-layer dense head.

    Takes single-channel inputs and produces scores for 10 classes. The first
    dense layer is hard-wired to ``128 * 7 * 7`` features, which is what the
    four conv/pool stages yield for a 128 x 128 input.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor: channels 1 -> 16 -> 32 -> 64 -> 128.
        # Only the first stage uses "valid" padding.
        self.l1 = self._conv_stage(1, 16, padding="valid")
        self.l2 = self._conv_stage(16, 32, padding="same")
        self.l3 = self._conv_stage(32, 64, padding="same")
        self.l4 = self._conv_stage(64, 128, padding="same")

        # Dense head: flatten -> 512 hidden units -> 10 class scores.
        self.l5 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.5),
            nn.Linear(in_features=128 * 7 * 7, out_features=512),
            nn.ReLU(),
        )
        self.l6 = nn.Sequential(
            nn.Dropout(p=0.6),
            nn.Linear(in_features=512, out_features=10),
        )

        self.sf = nn.Softmax(dim=1)

    @staticmethod
    def _conv_stage(in_channels, out_channels, padding):
        """Build one 3x3 conv -> 2x2 max-pool -> dropout(0.25) -> ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=(3, 3),
                padding=padding,
            ),
            nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
            nn.Dropout(p=0.25),
            nn.ReLU(),
        )

    def forward(self, input_data):
        """Run the network.

        Returns:
            A ``(logits, probs)`` tuple, where ``probs`` is the softmax of
            ``logits`` over dimension 1.
        """
        features = input_data
        for stage in (self.l1, self.l2, self.l3, self.l4, self.l5):
            features = stage(features)

        logits = self.l6(features)
        return logits, self.sf(logits)

In [7]:
# Instantiate the target model on the selected device and print a
# layer-by-layer summary for a single-channel 128 x 128 input.
cnn = CNN()
cnn = cnn.to(device)
summary(cnn, input_size=(1, 128, 128))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 126, 126]             160
         MaxPool2d-2           [-1, 16, 63, 63]               0
           Dropout-3           [-1, 16, 63, 63]               0
              ReLU-4           [-1, 16, 63, 63]               0
            Conv2d-5           [-1, 32, 63, 63]           4,640
         MaxPool2d-6           [-1, 32, 31, 31]               0
           Dropout-7           [-1, 32, 31, 31]               0
              ReLU-8           [-1, 32, 31, 31]               0
            Conv2d-9           [-1, 64, 31, 31]          18,496
        MaxPool2d-10           [-1, 64, 15, 15]               0
          Dropout-11           [-1, 64, 15, 15]               0
             ReLU-12           [-1, 64, 15, 15]               0
           Conv2d-13          [-1, 128, 15, 15]          73,856
        MaxPool2d-14            [-1, 12

## Clean examples
___
The attacks are performed on correctly classified test samples from ESC-10; fold 5 of the dataset is used as the test fold.

In [8]:
# Dataset target id -> class name. The position of each entry also defines
# the class index `y` (0..9) used for the model's outputs.
target_to_label = {
    0: "dog",
    41: "chainsaw",
    12: "crackling_fire",
    40: "helicopter",
    10: "rain",
    20: "crying_baby",
    38: "clock_tick",
    21: "sneezing",
    1: "rooster",
    11: "sea_waves",
}
# The remaining lookups are derived from the table above so they cannot drift.
label_to_target = {label: target for target, label in target_to_label.items()}
target_to_y = {target: y for y, target in enumerate(target_to_label)}
y_to_target = {y: target for target, y in target_to_y.items()}

In [9]:
class ESC10(Dataset):
    """Dataset over a metadata table that holds precomputed spectrograms.

    Args:
        meta: ``pandas.DataFrame`` with a ``"spectrogram"`` column (NumPy
            arrays) and a ``"target"`` column whose values are keys of
            ``target_to_y``.
        transformation: Optional callable applied to each spectrogram tensor
            before it is returned.
    """

    def __init__(self, meta, transformation=None):
        self.meta = meta
        self.transformation = transformation

    def __len__(self):
        """Return the number of rows in the metadata table."""
        return len(self.meta)

    def __getitem__(self, index):
        """Return ``(spectrogram, class_index)`` for the row at position ``index``.

        The spectrogram is converted to a tensor with a leading channel
        dimension; the class index is looked up through ``target_to_y``.
        """
        # Positional access: samplers hand out 0..len-1, which only coincides
        # with the DataFrame's labels when its index is a fresh RangeIndex.
        # `iloc` keeps this correct for filtered or re-indexed tables too.
        spec = torch.from_numpy(self.meta["spectrogram"].iloc[index]).unsqueeze(0)
        if self.transformation is not None:
            spec = self.transformation(spec)
        target = self.meta["target"].iloc[index]

        return spec, target_to_y[target]

In [10]:
# Normalisation statistics read from the loaded checkpoint
esc10_mean, esc10_std = checkpoint["mean"], checkpoint["std"]

In [16]:
# Read the ESC-10 metadata and keep only fold 5, re-indexed from 0 and
# stripped of the columns that are not needed below.
dataset = pd.read_csv(META_PATH)
is_fold_5 = dataset["fold"] == 5
clean_samples_meta = (
    dataset.loc[is_fold_5]
    .reset_index(drop=True)
    .drop(columns=["fold", "category", "src_file", "take"])
)
clean_samples_meta

Unnamed: 0,filename,target
0,5-151085-A-20.wav,20
1,5-170338-A-41.wav,41
2,5-170338-B-41.wav,41
3,5-171653-A-41.wav,41
4,5-177957-A-40.wav,40
...,...,...
75,5-233160-A-1.wav,1
76,5-234879-A-1.wav,1
77,5-234879-B-1.wav,1
78,5-235671-A-38.wav,38


In [20]:
# Build one dB-scaled mel spectrogram per clip, in the same order as the rows
# of clean_samples_meta.
spectrograms = []

for filename in clean_samples_meta["filename"]:
    wavpath = os.path.join(AUDIO_PATH, filename)
    # Load the clip at SR; the sample rate returned by librosa is discarded.
    waveform, _ = librosa.load(wavpath, sr=SR)
    # NOTE(review): `sr` is not forwarded here, so librosa uses its default
    # sample rate for the mel filterbank rather than SR. Presumably the
    # checkpoint was trained with this exact call — confirm before changing.
    mel = librosa.feature.melspectrogram(y=waveform, n_fft=1024, win_length=800, hop_length=400)
    # Convert power to decibels relative to this clip's own maximum.
    mel_db = librosa.power_to_db(mel, ref=np.max)
    # Pad or trim axis 1 to exactly 128 columns.
    mel_db = librosa.util.fix_length(mel_db, axis=1, size=128)
    spectrograms.append(mel_db)

# Attach the spectrograms as a new column; the bare name displays the table.
clean_samples_meta["spectrogram"] = spectrograms
clean_samples_meta

Unnamed: 0,filename,target,spectrogram
0,5-151085-A-20.wav,20,"[[-41.91432, -41.51622, -52.942284, -64.83085,..."
1,5-170338-A-41.wav,41,"[[-31.564884, -37.13315, -39.922874, -39.72587..."
2,5-170338-B-41.wav,41,"[[-40.27848, -48.45422, -53.904114, -59.57376,..."
3,5-171653-A-41.wav,41,"[[-50.640633, -48.505817, -49.735634, -50.4925..."
4,5-177957-A-40.wav,40,"[[-22.829388, -7.4630165, -0.92951584, -4.8401..."
...,...,...,...
75,5-233160-A-1.wav,1,"[[-80.0, -80.0, -80.0, -80.0, -80.0, -80.0, -8..."
76,5-234879-A-1.wav,1,"[[-43.87448, -56.608856, -55.900734, -53.76341..."
77,5-234879-B-1.wav,1,"[[-55.79444, -49.66845, -45.2035, -53.8517, -4..."
78,5-235671-A-38.wav,38,"[[-14.458471, -8.760626, -8.501469, -19.959671..."


In [21]:
def tf_nrm(x):
    """Standardise ``x`` using the module-level ``esc10_mean`` and ``esc10_std``.

    Written as a named function rather than a lambda bound to a name so that
    it has a meaningful ``__name__`` and can be pickled, which a ``DataLoader``
    needs when it hands the dataset to worker processes.

    Args:
        x: Value (e.g. a spectrogram tensor) to normalise.

    Returns:
        ``(x - esc10_mean) / esc10_std``.
    """
    return (x - esc10_mean) / esc10_std

In [22]:
# Wrap the fold-5 table in a Dataset (normalised on access) and iterate it in
# its stored order, 20 samples per batch.
clean_samples_data = ESC10(meta=clean_samples_meta, transformation=tf_nrm)
clean_samples_loader = DataLoader(
    dataset=clean_samples_data,
    batch_size=20,
    shuffle=False,
)