# Run experiments

In [1]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import torchaudio.transforms as T_audio
import torchvision.transforms as T_vision
from IPython.display import Audio
from torch.utils.data import DataLoader

from audiointerp.dataset.esc50 import ESC50dataset, ESC50contaminated
from audiointerp.fit import Trainer
from audiointerp.interpretation.gradcam import GradCAMInterpreter
from audiointerp.interpretation.saliency import SaliencyInterpreter
from audiointerp.metrics import Metrics
from audiointerp.model.cnn14 import TransferCnn14
from audiointerp.predict import Predict
from audiointerp.processing.spectrogram import LogMelSTFTSpectrogram

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Master switch for the expensive part of the notebook: the training cell and the
# learning-curve cell only run when this is True. The checkpoint-loading cell runs
# regardless, so with False a previously saved "logmel_cnn14.pth" must already exist.
TRAINING = False

In [3]:
def plot_learning_curves(train_losses, val_losses, train_accs=None, val_accs=None):
    """Draw per-epoch loss and accuracy curves in two side-by-side panels.

    Args:
        train_losses: Sequence of training losses, one value per epoch. Its
            length defines the x-axis (epochs 1..N).
        val_losses: Sequence of validation losses, or None / empty to skip it.
        train_accs: Optional sequence of training accuracies.
        val_accs: Optional sequence of validation accuracies.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """

    def _has_values(series):
        # Explicit None/length test instead of truthiness: `if series:` raises
        # ValueError for multi-element NumPy arrays and tensors, and an empty
        # sequence would make `plot` fail on mismatched x/y lengths.
        return series is not None and len(series) > 0

    epochs = range(1, len(train_losses) + 1)
    fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: losses (training loss is always drawn).
    ax_loss.plot(epochs, train_losses, label="Train Loss")
    if _has_values(val_losses):
        ax_loss.plot(epochs, val_losses, label="Val Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.set_ylabel("Loss")
    ax_loss.set_title("Loss Curve")
    ax_loss.legend()

    # Right panel: accuracies (both optional).
    drew_accuracy = False
    for values, label in ((train_accs, "Train Acc"), (val_accs, "Val Acc")):
        if _has_values(values):
            ax_acc.plot(epochs, values, label=label)
            drew_accuracy = True
    ax_acc.set_xlabel("Epoch")
    ax_acc.set_ylabel("Accuracy")
    ax_acc.set_title("Accuracy Curve")
    if drew_accuracy:
        # Calling legend() on an axis with no labelled artists only emits a warning.
        ax_acc.legend()

    fig.tight_layout()
    plt.show()

In [4]:
# Dataset location and fold split.
# The ESC-50 root can be overridden through the ESC50_ROOT environment variable so the
# notebook is runnable on other machines; the original path stays as the default.
root_dir = os.environ.get("ESC50_ROOT", "/home/yuliya/ESC50")
sr = 32000  # sample rate handed to the datasets and the spectrogram extractors
train_folds = [1, 2, 3]
valid_folds = [4]
test_folds = [5]

# A fold listed in more than one split would leak evaluation data into training.
assert len(set(train_folds + valid_folds + test_folds)) == (
    len(train_folds) + len(valid_folds) + len(test_folds)
), "train/valid/test folds must not overlap"

In [5]:
# Spectrogram front-end settings, forwarded by name to the LogMelSTFTSpectrogram
# extractors. Units are presumably samples for the window/hop sizes and Hz for
# f_min/f_max — confirm against LogMelSTFTSpectrogram.
n_fft = win_length = 1024
hop_length = 320
n_mels = 64
f_min, f_max = 50, 14000
top_db = 80

In [6]:
# Feature extractor attached to the train/valid/test datasets.
# It is built with both return_phase and return_full_db switched off.
feature_extractor_fit = LogMelSTFTSpectrogram(
    sample_rate=sr,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    n_mels=n_mels,
    f_min=f_min,
    f_max=f_max,
    top_db=top_db,
    return_phase=False,
    return_full_db=False,
)

In [7]:
# Feature extractor handed to the Predict objects. Same spectrogram settings as the
# training extractor, but with return_phase and return_full_db switched on
# (presumably needed to map masked spectrograms back to audio — confirm in Predict).
feature_extractor_predict = LogMelSTFTSpectrogram(
    sample_rate=sr,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    n_mels=n_mels,
    f_min=f_min,
    f_max=f_max,
    top_db=top_db,
    return_phase=True,
    return_full_db=True,
)

In [8]:
# Spectrogram-level augmentation pipeline (only the training dataset receives it):
# one frequency mask followed by one time mask, each with a mask parameter of 20.
feature_augs = nn.Sequential(
    T_audio.FrequencyMasking(freq_mask_param=20),
    T_audio.TimeMasking(time_mask_param=20),
)

In [9]:
# Training split: the only dataset that receives the masking augmentations.
train_data = ESC50dataset(
    root_dir=root_dir,
    sr=sr,
    folds=train_folds,
    normalize="peak",
    feature_extractor=feature_extractor_fit,
    feature_augs=feature_augs,
)

# Validation and clean test splits: same extractor, no augmentation.
valid_data = ESC50dataset(
    root_dir=root_dir,
    sr=sr,
    folds=valid_folds,
    normalize="peak",
    feature_extractor=feature_extractor_fit,
)
test_data = ESC50dataset(
    root_dir=root_dir,
    sr=sr,
    folds=test_folds,
    normalize="peak",
    feature_extractor=feature_extractor_fit,
)

# Test split built with the contaminating recording "samples/sea_waves.wav".
test_data_noisy = ESC50contaminated(
    root_dir=root_dir,
    sr=sr,
    folds=test_folds,
    normalize="peak",
    feature_extractor=feature_extractor_fit,
    path_to_contaminating_audio="samples/sea_waves.wav",
)

In [10]:
# DataLoader settings per split: only the training loader shuffles.
train_loader_kwargs = dict(batch_size=32, shuffle=True)
valid_loader_kwargs = dict(batch_size=32, shuffle=False)
test_loader_kwargs = dict(batch_size=32, shuffle=False)

In [11]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Model: class, constructor arguments and the pretrained weights file given to the Trainer.
model_cls = TransferCnn14
model_kwargs = dict(num_classes=50, num_bins=64)
model_pretrain_weights_path = "weights/Cnn14_mAP=0.431.pth"

# Optimisation: Adam with a 1e-4 learning rate and cross-entropy loss.
optimizer_cls = optim.Adam
optimizer_kwargs = dict(lr=1e-4)
criterion_cls = nn.CrossEntropyLoss

# Mixup switches forwarded to the Trainer.
use_mixup, mixup_alpha = True, 0.2

In [12]:
# Assemble the Trainer from the configuration above. All arguments are passed by
# keyword, grouped here by concern (model, optimisation, data, mixup).
model_trainer = Trainer(
    # model
    model_cls=model_cls,
    model_kwargs=model_kwargs,
    model_pretrain_weights_path=model_pretrain_weights_path,
    device=device,
    # optimisation
    criterion_cls=criterion_cls,
    optimizer_cls=optimizer_cls,
    optimizer_kwargs=optimizer_kwargs,
    # data
    train_data=train_data,
    train_loader_kwargs=train_loader_kwargs,
    valid_data=valid_data,
    valid_loader_kwargs=valid_loader_kwargs,
    test_data=test_data,
    test_loader_kwargs=test_loader_kwargs,
    # mixup
    use_mixup=use_mixup,
    mixup_alpha=mixup_alpha,
)

Random seed set to: 42


In [None]:
if TRAINING:
    # Train for 20 epochs, passing "logmel_cnn14.pth" as the weights save path, and
    # keep the per-epoch histories plus the final test loss/accuracy.
    (
        train_losses,
        train_accs,
        val_losses,
        val_accs,
        test_loss,
        test_acc,
    ) = model_trainer.train(num_epochs=20, save_weights_path="logmel_cnn14.pth")

In [14]:
if TRAINING:
    # The histories only exist when the training cell actually ran.
    plot_learning_curves(
        train_losses=train_losses,
        val_losses=val_losses,
        train_accs=train_accs,
        val_accs=val_accs,
    )

In [15]:
# Restore the fine-tuned weights into the trainer's model.
# map_location=device lets a checkpoint saved on a GPU load on a CPU-only machine,
# matching the CPU fallback used when `device` was chosen.
model_trainer.model.load_state_dict(
    torch.load("logmel_cnn14.pth", map_location=device)
)

<All keys matched successfully>

In [16]:
# Evaluate the restored model with no explicit loader (presumably the test_data given
# to the Trainer — confirm). Prints the loss/accuracy and returns them as a tuple.
model_trainer.test()

Test Loss: 0.3285, Test Acc: 0.9175


(0.32851228475570676, 0.9175)

In [17]:
# Loader over the sea-wave-contaminated test set, with the same settings as the clean test loader.
test_loader_noisy = DataLoader(test_data_noisy, **test_loader_kwargs)

In [18]:
# Same evaluation on the contaminated loader, for comparison with the clean-test figures.
model_trainer.test(test_loader_noisy)

Test Loss: 1.5115, Test Acc: 0.6075


(1.5115180492401123, 0.6075)

In [19]:
# Take the trained network out of the trainer for the interpretation steps;
# the bare `model` expression displays its layer structure.
# NOTE(review): train/eval mode is never set explicitly in this notebook — confirm that
# Trainer.test() / Predict switch the model to eval mode before interpreting.
model = model_trainer.model
model

TransferCnn14(
  (base): Cnn14(
    (bn0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv_block1): ConvBlock(
      (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block2): ConvBlock(
      (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (conv_block3): ConvBlock(
      (conv1): Conv2d(128, 

___

In [20]:
# Clean test split for interpretation. No feature_extractor is attached here
# (presumably Predict applies feature_extractor_predict to the raw audio — confirm).
test_data_predict = ESC50dataset(
    root_dir=root_dir,
    sr=sr,
    folds=test_folds,
    normalize="peak",
)
# One sample per batch, in dataset order.
test_loader_predict = DataLoader(
    test_data_predict,
    batch_size=1,
    shuffle=False,
)

In [21]:
# Contaminated test split for interpretation: same folds, built with
# "samples/sea_waves.wav" as the contaminating audio and no feature_extractor attached.
test_data_noisy_predict = ESC50contaminated(
    root_dir=root_dir,
    sr=sr,
    folds=test_folds,
    normalize="peak",
    path_to_contaminating_audio="samples/sea_waves.wav",
)
# One sample per batch, in dataset order.
test_loader_noisy_predict = DataLoader(
    test_data_noisy_predict,
    batch_size=1,
    shuffle=False,
)

In [22]:
# Two Predict pipelines over the same model and extractor, differing only in the
# interpretation method.
predict_saliency = Predict(
    model,
    feature_extractor_predict,
    interp_method_cls=SaliencyInterpreter,
    interp_method_kwargs={},
    device=device,
)
# Grad-CAM is pointed at the second convolution of the base network's conv_block6.
predict_gradcam = Predict(
    model,
    feature_extractor_predict,
    interp_method_cls=GradCAMInterpreter,
    interp_method_kwargs={"target_layers": [model.base.conv_block6.conv2]},
    device=device,
)

In [23]:
# Saliency metrics for every clean test sample; the CSV name is passed by keyword,
# as in the Grad-CAM cells.
results_saliency = predict_saliency.predict_set(
    test_loader_predict,
    results_csv_name="saliency_clean.csv",
)

Results saved as results/saliency_clean.csv


In [28]:
# Peek at the first per-sample rows of the saliency results on clean audio.
results_saliency.head(20)

Unnamed: 0,FF,AI,AD,AG,FidIn,SPS,COMP,is_correct
0,0.80776,0.0,98.176369,0.0,0.0,0.767475,9.172155,True
1,0.005461,0.0,22.82156,0.0,0.0,0.887693,8.472346,False
2,0.4555,0.0,88.769051,0.0,0.0,0.865012,8.645321,False
3,0.449588,0.0,93.007477,0.0,0.0,0.917708,8.103765,False
4,0.747692,0.0,96.724388,0.0,0.0,0.778283,9.126101,True
5,0.454478,0.0,94.733269,0.0,0.0,0.933724,7.831524,True
6,0.498992,0.0,98.004044,0.0,0.0,0.780516,9.126462,True
7,0.283658,0.0,99.451981,0.0,0.0,0.766367,9.18906,True
8,0.583571,0.0,98.57505,0.0,0.0,0.787672,9.091069,True
9,0.337773,0.0,82.542656,0.0,0.0,0.918231,8.095888,True


In [34]:
# Split the clean-audio saliency results by prediction correctness and drop the
# flag column from each part.
results_saliency_correct = (
    results_saliency
    .loc[lambda frame: frame["is_correct"] == True]
    .drop(columns=["is_correct"])
)
results_saliency_incorrect = (
    results_saliency
    .loc[lambda frame: frame["is_correct"] == False]
    .drop(columns=["is_correct"])
)

In [35]:
# Mean and std of each metric over all clean test samples (saliency).
results_saliency.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.679097,0.247218
AI,0.0,0.0
AD,93.147911,11.334028
AG,0.0,0.0
FidIn,0.105,0.306937
SPS,0.783191,0.040571
COMP,9.078915,0.260822


In [37]:
# Mean and std of each metric over correctly classified clean samples (saliency).
results_saliency_correct.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.705438,0.234382
AI,0.0,0.0
AD,94.086906,10.028381
AG,0.0,0.0
FidIn,0.106267,0.3086
SPS,0.782409,0.039272
COMP,9.084406,0.25268


In [38]:
# Mean and std of each metric over misclassified clean samples (saliency).
results_saliency_incorrect.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.386149,0.193058
AI,0.0,0.0
AD,82.705055,18.135958
AG,0.0,0.0
FidIn,0.090909,0.291937
SPS,0.791887,0.052908
COMP,9.017852,0.337312


In [39]:
# Grad-CAM metrics for every clean test sample; results_csv_name is the file name
# used for the saved results (reported as results/gradcam_clean.csv).
results_gradcam = predict_gradcam.predict_set(test_loader_predict, results_csv_name='gradcam_clean.csv')

Results saved as results/gradcam_clean.csv


In [40]:
# Peek at the first per-sample rows of the Grad-CAM results on clean audio.
results_gradcam.head(20)

Unnamed: 0,FF,AI,AD,AG,FidIn,SPS,COMP,is_correct
0,0.589368,0.0,96.262665,0.0,0.0,0.490846,9.969327,True
1,3e-06,0.0,70.103493,0.0,0.0,0.0,0.0,False
2,0.150639,0.0,89.506302,0.0,0.0,0.388541,10.11932,False
3,-6e-06,0.0,95.054153,0.0,0.0,0.93457,7.924671,False
4,0.52649,0.0,97.205322,0.0,0.0,0.496131,9.925839,True
5,0.371264,0.0,96.574631,0.0,0.0,0.858366,8.590769,True
6,-0.025351,0.0,93.879356,0.0,0.0,0.871608,8.588024,True
7,0.558507,0.0,48.779617,0.0,1.0,0.659537,9.537663,True
8,0.132404,0.0,94.643013,0.0,0.0,0.657255,9.541927,True
9,0.128087,0.0,94.487473,0.0,0.0,0.863488,8.560882,True


In [41]:
# Split the clean-audio Grad-CAM results by prediction correctness and drop the
# flag column from each part.
results_gradcam_correct = (
    results_gradcam
    .loc[lambda frame: frame["is_correct"] == True]
    .drop(columns=["is_correct"])
)
results_gradcam_incorrect = (
    results_gradcam
    .loc[lambda frame: frame["is_correct"] == False]
    .drop(columns=["is_correct"])
)

In [42]:
# Mean and std of each metric over all clean test samples (Grad-CAM).
results_gradcam.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.166925,0.235898
AI,1.25,11.124157
AD,69.282387,32.452545
AG,0.574071,6.282491
FidIn,0.46,0.499022
SPS,0.556569,0.244833
COMP,9.018057,2.236557


In [45]:
# Mean and std of each metric over correctly classified clean samples (Grad-CAM).
results_gradcam_correct.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.177787,0.241062
AI,0.817439,9.016495
AD,68.352875,32.887878
AG,0.315236,4.736407
FidIn,0.490463,0.500592
SPS,0.556951,0.238237
COMP,9.122614,2.065748


In [46]:
# Mean and std of each metric over misclassified clean samples (Grad-CAM).
results_gradcam_incorrect.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.046123,0.113186
AI,6.060606,24.230585
AD,79.619629,25.310709
AG,3.452624,15.041299
FidIn,0.121212,0.331434
SPS,0.552322,0.313431
COMP,7.855265,3.470435


In [47]:
# Saliency metrics for every contaminated test sample; the CSV name is passed by
# keyword, as in the Grad-CAM cells.
results_saliency_noisy = predict_saliency.predict_set(
    test_loader_noisy_predict,
    results_csv_name="saliency_seawaves_noise.csv",
)

Results saved as results/saliency_seawaves_noise.csv


In [48]:
# Peek at the first per-sample rows of the saliency results on contaminated audio.
results_saliency_noisy.head(20)

Unnamed: 0,FF,AI,AD,AG,FidIn,SPS,COMP,is_correct
0,0.677376,0.0,98.455986,0.0,0.0,0.756289,9.22126,True
1,0.186556,0.0,96.291763,0.0,0.0,0.840325,8.777564,False
2,0.292564,0.0,73.714478,0.0,0.0,0.770208,9.153724,False
3,0.459484,0.0,62.574375,0.0,1.0,0.755196,9.230047,False
4,0.350592,0.0,97.397995,0.0,0.0,0.774659,9.145867,True
5,0.411859,0.0,50.366318,0.0,1.0,0.758979,9.21381,False
6,0.244123,0.0,97.781609,0.0,0.0,0.754797,9.23072,True
7,0.621036,0.0,99.718758,0.0,0.0,0.784123,9.103641,True
8,0.285813,0.0,98.273903,0.0,0.0,0.764121,9.196259,True
9,0.24517,0.0,31.022114,0.0,1.0,0.762534,9.199641,False


In [49]:
# Split the contaminated-audio saliency results by prediction correctness and drop
# the flag column from each part.
results_saliency_noisy_correct = (
    results_saliency_noisy
    .loc[lambda frame: frame["is_correct"] == True]
    .drop(columns=["is_correct"])
)
results_saliency_noisy_incorrect = (
    results_saliency_noisy
    .loc[lambda frame: frame["is_correct"] == False]
    .drop(columns=["is_correct"])
)

In [50]:
# Mean and std of each metric over all contaminated test samples (saliency).
results_saliency_noisy.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.594035,0.242298
AI,1.0,9.962336
AD,82.418381,20.119408
AG,0.055945,0.782229
FidIn,0.3025,0.459916
SPS,0.764249,0.014918
COMP,9.187343,0.071187


In [51]:
# Mean and std of each metric over correctly classified contaminated samples (saliency).
results_saliency_noisy_correct.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.688705,0.237735
AI,0.411523,6.415003
AD,92.064278,12.99189
AG,0.020857,0.325128
FidIn,0.144033,0.351848
SPS,0.768996,0.014975
COMP,9.166152,0.073107


In [52]:
# Mean and std of each metric over misclassified contaminated samples (saliency).
results_saliency_noisy_incorrect.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.447508,0.164056
AI,1.910828,13.734374
AD,67.488716,20.105562
AG,0.110253,1.181582
FidIn,0.547771,0.499305
SPS,0.756902,0.011485
COMP,9.220142,0.053742


In [53]:
# Grad-CAM metrics for every contaminated test sample; results_csv_name is the file
# name used for the saved results (reported as results/gradcam_seawaves_noise.csv).
results_gradcam_noisy = predict_gradcam.predict_set(test_loader_noisy_predict, results_csv_name='gradcam_seawaves_noise.csv')

Results saved as results/gradcam_seawaves_noise.csv


In [54]:
# Peek at the first per-sample rows of the Grad-CAM results on contaminated audio.
results_gradcam_noisy.head(20)

Unnamed: 0,FF,AI,AD,AG,FidIn,SPS,COMP,is_correct
0,0.4763373,0.0,97.802315,0.0,0.0,0.64794,9.56375,True
1,-4.012883e-05,0.0,91.222382,0.0,0.0,0.0,0.0,False
2,-0.02343625,0.0,83.155807,0.0,0.0,0.321874,10.106059,False
3,0.3930274,0.0,58.660583,0.0,1.0,0.277605,10.195515,False
4,0.2607068,0.0,97.560791,0.0,0.0,0.585009,9.762238,True
5,0.3950974,0.0,61.802387,0.0,0.0,0.1706,10.292536,False
6,-1.361966e-05,0.0,95.387383,0.0,0.0,0.0,0.0,True
7,0.125954,0.0,63.356876,0.0,1.0,0.751976,9.198015,True
8,-0.1393061,0.0,96.658997,0.0,0.0,0.771719,9.074892,True
9,0.221966,0.0,59.104073,0.0,0.0,0.225913,10.218206,False


In [55]:
# Split the contaminated-audio Grad-CAM results by prediction correctness and drop
# the flag column from each part.
results_gradcam_noisy_correct = (
    results_gradcam_noisy
    .loc[lambda frame: frame["is_correct"] == True]
    .drop(columns=["is_correct"])
)
results_gradcam_noisy_incorrect = (
    results_gradcam_noisy
    .loc[lambda frame: frame["is_correct"] == False]
    .drop(columns=["is_correct"])
)

In [56]:
# Mean and std of each metric over all contaminated test samples (Grad-CAM).
results_gradcam_noisy.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.165331,0.232419
AI,3.25,17.754593
AD,69.801628,30.41662
AG,0.62607,4.165147
FidIn,0.3775,0.485369
SPS,0.495021,0.224219
COMP,9.250613,2.185575


In [57]:
# Mean and std of each metric over correctly classified contaminated samples (Grad-CAM).
results_gradcam_noisy_correct.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.144274,0.235666
AI,2.880658,16.760778
AD,69.933769,33.136501
AG,0.529575,3.816155
FidIn,0.423868,0.49519
SPS,0.545768,0.218164
COMP,9.237803,1.989545


In [58]:
# Mean and std of each metric over misclassified contaminated samples (Grad-CAM).
results_gradcam_noisy_incorrect.describe().loc[["mean", "std"]].T

Unnamed: 0,mean,std
FF,0.197921,0.224159
AI,3.821656,19.233219
AD,69.597115,25.746498
AG,0.775423,4.66301
FidIn,0.305732,0.462191
SPS,0.416475,0.211024
COMP,9.270441,2.465025
