# Run experiments

In [1]:
from audiointerp.dataset.esc50 import ESC50dataset, ESC50contaminated
from audiointerp.model.cnn14 import TransferCnn14
from audiointerp.fit import Trainer
from audiointerp.processing.spectrogram import LogMelSTFTSpectrogram
from audiointerp.interpretation.saliency import SaliencyInterpreter
from audiointerp.interpretation.gradcam import GradCAMInterpreter
from audiointerp.interpretation.shap import SHAPInterpreter
from audiointerp.interpretation.lime import LIMEInterpreter
import torchaudio
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T_audio
import torchvision.transforms as T_vision
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import random
import numpy as np
from IPython.display import Audio
from audiointerp.predict import Predict
from audiointerp.metrics import Metrics

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset location, target sample rate and ESC-50 fold split.
# NOTE(review): root_dir is an absolute path from the original machine — adjust per environment.
root_dir = "/root/ESC50"
# Sample rate handed to the datasets and to the spectrogram extractor (presumably Hz).
sr = 32000
# train_folds / valid_folds are not referenced in the visible cells; only test_folds is used.
train_folds = [1, 2, 3]
valid_folds = [4]
test_folds = [5]

In [3]:
# Settings forwarded to LogMelSTFTSpectrogram when the feature extractors are built.
n_fft = 1024
hop_length = 320
win_length = 1024
# Same value as model_kwargs["num_bins"] in the model config — presumably the two must stay in sync.
n_mels = 64
f_min = 50
f_max = 14000
top_db = None

In [4]:
# Feature extractor used by the evaluation datasets and the SHAP background dataset.
# return_phase / return_full_db are off here; the extractor built later for Predict turns both on.
feature_extractor = LogMelSTFTSpectrogram(
    n_fft=n_fft, win_length=win_length, hop_length=hop_length,
    sample_rate=sr, n_mels=n_mels, f_min=f_min, f_max=f_max, top_db=top_db,
    return_phase=False, return_full_db=False
)

In [5]:
# Clean test split (fold 5) with features computed by feature_extractor.
test_data = ESC50dataset(root_dir=root_dir, sr=sr, folds=test_folds, normalize="peak", feature_extractor=feature_extractor)
# Same split built with ESC50contaminated and a horse-whinny clip.
# NOTE(review): how `alpha` weights the contaminating audio is defined in ESC50contaminated — confirm there.
test_data_noisy = ESC50contaminated(root_dir=root_dir, sr=sr, folds=test_folds, normalize="peak", feature_extractor=feature_extractor,
                                    path_to_contaminating_audio="noises/149024__foxen10__horse_whinny.wav", alpha=0.6)
# Loader settings handed to Trainer for test_data and reused for the noisy loader.
test_loader_kwargs = {"batch_size": 32, "shuffle": False}

In [6]:
# Prefer the second GPU (the device this experiment was recorded on); fall back to the
# first GPU or the CPU so the notebook still runs on machines without a "cuda:1" device.
device = torch.device(
    "cuda:1" if torch.cuda.device_count() > 1
    else "cuda:0" if torch.cuda.is_available()
    else "cpu"
)
# Model class, its constructor kwargs and the pretrained-weights path, all passed to Trainer below.
model_cls = TransferCnn14
model_kwargs = {"num_classes": 50, "num_bins": 64}
model_pretrain_weights_path = "weights/Cnn14_mAP=0.431.pth"

# Optimizer settings passed to Trainer; note that train_data is None in the Trainer call below.
optimizer_cls = optim.Adam
optimizer_kwargs = {"lr": 1e-4}

# Loss class and mixup switches passed to Trainer (mixup is disabled).
criterion_cls = nn.CrossEntropyLoss
use_mixup = False
mixup_alpha = 0.0

In [7]:
# Build the Trainer with no train/validation data: only the clean test set and its
# loader settings are supplied, so this instance is used for evaluation in this notebook.
model_trainer = Trainer(
    model_cls=model_cls,
    train_data=None,
    train_loader_kwargs=None,
    criterion_cls=criterion_cls,
    optimizer_cls=optimizer_cls,
    model_kwargs=model_kwargs,
    model_pretrain_weights_path=model_pretrain_weights_path,
    optimizer_kwargs=optimizer_kwargs,
    device=device,
    valid_data=None,
    valid_loader_kwargs=None,
    test_data=test_data,
    test_loader_kwargs=test_loader_kwargs,
    use_mixup=use_mixup,
    mixup_alpha=mixup_alpha
)

Random seed set to: 42


In [8]:
# Load the saved state dict into the Trainer's model. map_location remaps the stored tensors
# onto `device`, so the load also works when the checkpoint was saved from a device that
# does not exist on this machine.
model_trainer.model.load_state_dict(torch.load("logmel_cnn14.pth", map_location=device))

<All keys matched successfully>

In [9]:
# Evaluate on the clean test set given to the Trainer; the recorded output is (loss, accuracy).
model_trainer.test()

Test Loss: 0.2639, Test Acc: 0.9225


(0.2638887568563223, 0.9225)

In [10]:
# Loader over the contaminated test split, using the same batch size / ordering as the clean one.
test_loader_noisy = DataLoader(test_data_noisy, **test_loader_kwargs)

In [11]:
# Evaluate on the contaminated split; compare the recorded (loss, accuracy) with the clean run above.
model_trainer.test(test_loader_noisy)

Test Loss: 1.1552, Test Acc: 0.7300


(1.1551564621925354, 0.73)

In [12]:
# Contaminated test split built WITHOUT a feature_extractor argument; the Predict wrappers
# below receive their own extractor. NOTE(review): presumably items are raw waveforms here
# (the next cell plays item[0] with Audio) — confirm in ESC50contaminated.
test_data_noisy_predict = ESC50contaminated(root_dir=root_dir, sr=sr, folds=test_folds, normalize="peak",
                                    path_to_contaminating_audio="noises/149024__foxen10__horse_whinny.wav", alpha=0.6)

In [13]:
# Listen to one contaminated example (index 77) as a sanity check.
Audio(test_data_noisy_predict[77][0], rate=sr)

In [14]:
# Short alias for the Trainer's model, used by the Predict wrappers and direct forward passes below.
model = model_trainer.model

___

In [15]:
# Value passed as `silence_val` to the Predict calls below.
# NOTE(review): its exact meaning is defined inside Predict — confirm there.
silence_val = -100.

In [16]:
# Folds the SHAP background samples are drawn from (disjoint from test_folds = [5]).
shap_background_folds = [1, 2, 3]

In [17]:
def get_balanced_background(dataloader, num_samples_per_class=2, device="cpu", num_classes=None):
    """Collect a class-balanced batch of samples (used below as SHAP background data).

    Walks ``dataloader`` and keeps the first ``num_samples_per_class`` samples seen
    for every label. Kept samples are grouped by label, labels in first-seen order,
    and stacked along a new leading dimension.

    Args:
        dataloader: Iterable yielding ``(batch_x, batch_y)`` pairs. ``.item()`` is
            called on every element of ``batch_y``, so each must be a one-element
            tensor; all kept samples must share one shape so they can be stacked.
        num_samples_per_class: Maximum number of samples kept per label (>= 1).
        device: Device the stacked result is moved to.
        num_classes: Optional number of distinct labels expected. When given,
            iteration stops after the first batch in which that many labels have
            reached ``num_samples_per_class``; when ``None`` (default) the whole
            loader is consumed, exactly as before.

    Returns:
        A tensor of shape ``(n_kept, *sample_shape)`` located on ``device``.

    Raises:
        ValueError: If ``num_samples_per_class`` is smaller than 1, or if the
            loader yields no samples at all.
    """
    if num_samples_per_class < 1:
        raise ValueError(
            f"num_samples_per_class must be >= 1, got {num_samples_per_class}"
        )

    # Plain dict: insertion order keeps labels in the order they were first seen.
    samples_by_class = {}
    full_classes = 0

    for batch_x, batch_y in dataloader:
        for x, y in zip(batch_x, batch_y):
            bucket = samples_by_class.setdefault(y.item(), [])
            if len(bucket) >= num_samples_per_class:
                continue
            bucket.append(x)
            if len(bucket) == num_samples_per_class:
                full_classes += 1
        # Every expected class is saturated: no need to load the remaining batches.
        if num_classes is not None and full_classes >= num_classes:
            break

    background_tensors = [x for bucket in samples_by_class.values() for x in bucket]
    if not background_tensors:
        raise ValueError("dataloader yielded no samples; cannot build a background batch")

    return torch.stack(background_tensors, dim=0).to(device)

In [18]:
# Second extractor with the same STFT/mel settings as feature_extractor, but with
# return_phase=True and return_full_db=True; this is the one handed to the Predict wrappers.
feature_extractor_predict = LogMelSTFTSpectrogram(
    n_fft=n_fft, win_length=win_length, hop_length=hop_length,
    sample_rate=sr, n_mels=n_mels, f_min=f_min, f_max=f_max, top_db=top_db,
    return_phase=True, return_full_db=True
)

In [19]:
# One-item-per-batch loader over the extractor-free contaminated dataset, consumed by predict_set below.
test_loader_predict = DataLoader(test_data_noisy_predict, batch_size=1, shuffle=False)
# SHAP background: features from the background folds, reduced to at most 2 samples per class
# and moved to `device`. shuffle=False keeps the selection deterministic across runs.
train_data_shap = ESC50dataset(root_dir=root_dir, sr=sr, folds=shap_background_folds, normalize="peak", feature_extractor=feature_extractor)
train_loader_shap = DataLoader(train_data_shap, batch_size=100, shuffle=False)
shap_background = get_balanced_background(train_loader_shap, num_samples_per_class=2, device=device)

In [20]:
# One Predict wrapper per interpretation method; all share the model, the phase-returning
# extractor and the device. Grad-CAM targets model.base.conv_block6.conv2, LIME draws
# 1000 samples, and SHAP receives the class-balanced background tensor built above.
predict_saliency = Predict(model, feature_extractor_predict, interp_method_cls=SaliencyInterpreter, interp_method_kwargs={}, device=device)
predict_gradcam = Predict(model, feature_extractor_predict, interp_method_cls=GradCAMInterpreter, interp_method_kwargs={"target_layers": [model.base.conv_block6.conv2]}, device=device)
predict_lime = Predict(model, feature_extractor_predict, interp_method_cls=LIMEInterpreter, interp_method_kwargs={"num_samples": 1000}, device=device)
predict_shap = Predict(model, feature_extractor_predict, interp_method_cls=SHAPInterpreter, interp_method_kwargs={"background_data": shap_background}, device=device)

In [21]:
# Saliency over the whole contaminated test loader; the recorded output reports CSVs
# written under results/logmel_cnn14/saliency_horse/csvs.
results_saliency = predict_saliency.predict_set(test_loader_predict, 'saliency_horse.csv', compute_first=True,
                                                silence_val=silence_val, model_type="logmel_cnn14", save_dir="results")

Все CSV-файлы сохранены в results/logmel_cnn14/saliency_horse/csvs


In [22]:
# Grad-CAM over the whole contaminated test loader; the recorded output reports CSVs
# written under results/logmel_cnn14/gradcam_horse/csvs.
results_gradcam = predict_gradcam.predict_set(test_loader_predict, 'gradcam_horse.csv', compute_first=True,
                                                silence_val=silence_val, model_type="logmel_cnn14", save_dir="results")

Все CSV-файлы сохранены в results/logmel_cnn14/gradcam_horse/csvs


In [23]:
# LIME over the whole contaminated test loader (one 1000-step progress bar per item in the
# recorded output); CSVs are reported under results/logmel_cnn14/lime_horse/csvs.
results_lime = predict_lime.predict_set(test_loader_predict, 'lime_horse.csv', compute_first=True,
                                        silence_val=silence_val, model_type="logmel_cnn14", save_dir="results")

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 231.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 236.67it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 236.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 234.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 233.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 233.15it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 236.41it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 233.99it/s]
100%|███████████████████████████

Все CSV-файлы сохранены в results/logmel_cnn14/lime_horse/csvs


In [24]:
# SHAP over the whole contaminated test loader, with the same arguments as the other
# predict_set runs; the recorded output logs one "Done extracting shap values" line per item.
results_shap = predict_shap.predict_set(test_loader_predict, 'shap_horse.csv', compute_first=True,
                                        silence_val=silence_val, model_type="logmel_cnn14", save_dir="results")

Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap values
Done extracting shap

In [19]:
def _load_peak_normalized(path, target_sr):
    """Load an audio file, resample it to ``target_sr`` and scale its peak to 1.

    Args:
        path: Path of the audio file handed to ``torchaudio.load``.
        target_sr: Sample rate the waveform is resampled to.

    Returns:
        ``(waveform, source_sr)``: the resampled, peak-normalized waveform and the
        sample rate the file was stored with.
    """
    waveform, source_sr = torchaudio.load(path)
    waveform = torchaudio.functional.resample(waveform, source_sr, target_sr)
    peak = waveform.abs().max()
    # An all-zero clip would otherwise be turned into NaNs by the division.
    if peak > 0:
        waveform = waveform / peak
    return waveform, source_sr


# SR keeps the source sample rate of the most recently loaded file, as before.
wav, SR = _load_peak_normalized("samples/crow.wav", sr)
wav2, SR = _load_peak_normalized("samples/sea_waves.wav", sr)

In [20]:
# Mix 80% crow with 20% sea waves, then peak-normalize the result in place.
# The element-wise sum requires wav and wav2 to have the same (or broadcastable) shape.
wavv = 0.8 * wav + 0.2 * wav2
wavv /= wavv.abs().max()

In [21]:
# Predicted class index for the crow + sea-waves mix. Only a forward pass is needed,
# so run it under no_grad to avoid building an autograd graph.
with torch.no_grad():
    print(model(feature_extractor.to(device)(wavv[None, :].to(device))).argmax())

tensor(9, device='cuda:1')


In [22]:
# SHAP explanation for the clean crow clip.
# NOTE(review): model_type="kakakak" looks like a placeholder (the predict_set runs use "logmel_cnn14") — confirm.
res_crow_shap_deep = predict_shap.predict(wav.to(device), "crow_shap_deep", sr, feature_type="mel", model_type="kakakak", silence_val=silence_val)

Done extracting shap values


In [23]:
# SHAP explanation for the crow + sea-waves mix.
# NOTE(review): model_type="kakakak" looks like a placeholder (the predict_set runs use "logmel_cnn14") — confirm.
res_crow_sea_shap_deep = predict_shap.predict(wavv.to(device), "crow_sea_shap_deep", sr, feature_type="mel", model_type="kakakak", silence_val=silence_val)

Done extracting shap values


In [24]:
# Grad-CAM explanation for the clean crow clip.
# NOTE(review): model_type="kakakak" looks like a placeholder (the predict_set runs use "logmel_cnn14") — confirm.
res_crow_gc = predict_gradcam.predict(wav.to(device), "crow_gradcam", sr, model_type="kakakak", feature_type="mel", silence_val=silence_val)



In [25]:
# Grad-CAM explanation for the crow + sea-waves mix.
# NOTE(review): model_type="kakakak" looks like a placeholder (the predict_set runs use "logmel_cnn14") — confirm.
res_crow_sea_gc = predict_gradcam.predict(wavv.to(device), "crow_sea_gradcam", sr, model_type="kakakak", feature_type="mel", silence_val=silence_val)



In [18]:
# LIME explanation for the clean crow clip.
# NOTE(review): the recorded NameError comes from running this cell (execution count 18) before
# the cell that defines `wav` (execution count 19); rerun top to bottom to refresh the output.
# NOTE(review): model_type="kakakak" looks like a placeholder — confirm.
res_crow_lime = predict_lime.predict(wav.to(device), "crow_lime", sr, feature_type="mel", model_type="kakakak", silence_val=silence_val)

NameError: name 'wav' is not defined

In [33]:
# LIME explanation for the crow + sea-waves mix. silence_val is passed explicitly so this
# call uses the same setting as every other Predict call in this notebook.
# NOTE(review): model_type="kakakak" looks like a placeholder — confirm.
res_crow_sea_lime = predict_lime.predict(wavv.to(device), "crow_sea_lime", sr, feature_type="mel", model_type="kakakak", silence_val=silence_val)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:04<00:00, 241.27it/s]


