In [53]:
import librosa
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
import random
from IPython.display import Audio, display

import sys
sys.path.append('../ASC_AED_JoinTask/Datasets/')
sys.path.append('../ASC_AED_JoinTask/models/')
import datasets
import models
import audio_utils

# --- Audio front-end configuration shared by every cell below ---------------
SAMPLE_RATE = 32000        # sampling rate handed to librosa as `sr`

# STFT / mel-spectrogram framing
N_FFT = 1024               # FFT size (`n_fft`)
HOP_LEN = 500              # hop between frames, in samples (`hop_length`)
N_MELS = 64                # number of mel bands (`n_mels`)

# Frequency range covered by the mel axis
FMIN = 0
FMAX = SAMPLE_RATE // 2    # Nyquist frequency of SAMPLE_RATE, i.e. 16000

In [12]:
# Frequency of every STFT row produced with n_fft=N_FFT at SAMPLE_RATE.
fft_freq = librosa.fft_frequencies(sr=SAMPLE_RATE, n_fft=N_FFT)

# Mel-spaced frequency grid used by ablate_features to turn a mel-band index
# into a frequency interval. FMIN is passed explicitly so the grid follows the
# configured range instead of silently relying on librosa's default lower bound.
# NOTE(review): this asks for N_MELS points between FMIN and FMAX; librosa's
# mel filterbank is normally built from n_mels + 2 points, so these values may
# not coincide with the filter centres used by melspectrogram - confirm.
mel_freq = librosa.mel_frequencies(n_mels=N_MELS, fmin=FMIN, fmax=FMAX)

In [42]:
model_path = '../ASC_AED_JoinTask/models/state_dicts/ASC_scapper_Cnn9_64mel_10epochs.pt'

# Resolve the target device before touching the checkpoint so the weights can
# be mapped straight onto it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Arguments are forwarded unchanged to the project model class.
# NOTE(review): presumably 10 is the number of scene classes and 'logsoftmax'
# the output activation - confirm against models.Cnn_9layers_AvgPooling.
model = models.Cnn_9layers_AvgPooling(10, 'logsoftmax')

# map_location lets a checkpoint that was saved from a GPU load on a CPU-only
# machine; without it torch.load tries to restore tensors on their original
# device and fails when CUDA is unavailable.
model.load_state_dict(torch.load(model_path, map_location=device))
model = model.to(device)

In [41]:
# Per-file predictions written out for the test split by the model loaded above.
scapper_test_csv = '../ASC_AED_JoinTask/predictions/scapper_preds/ASC_scapper_Cnn9_64mel_10epochs.csv'
scapper_df = pd.read_csv(scapper_test_csv)

# Distinct scene labels, in order of first appearance in the CSV.
# NOTE(review): run_loop indexes this list with the model's argmax, so the order
# is assumed to match the class indices used at training time - confirm.
scene_labels = scapper_df['acoustic_scene_label'].unique()
scapper_scenes = list(scene_labels)

In [45]:
def generate_log_mel(wav, SAMPLE_RATE):
    """Return the log-scaled mel spectrogram of a waveform.

    Parameters
    ----------
    wav : np.ndarray
        Audio samples, passed to librosa as ``y``.
    SAMPLE_RATE : int
        Sampling rate of ``wav``; passed to librosa as ``sr``.

    Returns
    -------
    np.ndarray
        Mel power spectrogram (``N_MELS`` bands, ``N_FFT``-point frames every
        ``HOP_LEN`` samples) converted to dB with the clip's own maximum as
        the reference.
    """
    power_spec = librosa.feature.melspectrogram(
        y=wav,
        sr=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP_LEN,
        n_mels=N_MELS,
    )
    return librosa.power_to_db(power_spec, ref=np.max)

def show_log_mel(wav, SAMPLE_RATE):
    """Plot the log-mel spectrogram of a waveform.

    Parameters
    ----------
    wav : np.ndarray
        Audio samples.
    SAMPLE_RATE : int
        Sampling rate of ``wav``.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``.
    """
    # `librosa.display` is a submodule; a bare `import librosa` does not load it
    # on every librosa release, in which case `librosa.display.specshow` raises
    # AttributeError. Importing it here keeps the function self-contained.
    import librosa.display

    log_mel = generate_log_mel(wav, SAMPLE_RATE)

    # Framing and frequency limits are passed so the axes are labelled with the
    # same settings used to compute the spectrogram.
    librosa.display.specshow(
        log_mel,
        sr=SAMPLE_RATE,
        x_axis='time',
        y_axis='mel',
        win_length=N_FFT,
        hop_length=HOP_LEN,
        fmax=FMAX,
        fmin=FMIN,
    )
    plt.show()

def ablate_features(index, audio, band_width=4):
    """Silence one group of mel bands in ``audio`` and resynthesise the waveform.

    The signal is transformed with an ``N_FFT``-point STFT, every frequency row
    whose frequency lies between ``mel_freq[index * band_width]`` and
    ``mel_freq[index * band_width + band_width - 1]`` (both inclusive) is set
    to zero, and the result is inverted back to the time domain.

    Parameters
    ----------
    index : int
        Which group of ``band_width`` consecutive ``mel_freq`` entries to remove.
    audio : np.ndarray
        Input waveform.
    band_width : int, optional
        Number of consecutive ``mel_freq`` entries per group. Defaults to 4,
        the grouping this notebook has always used.

    Returns
    -------
    np.ndarray
        Waveform with the selected band removed, the same number of samples
        long as ``audio``.

    Raises
    ------
    IndexError
        If the requested group reaches past the end of ``mel_freq``.
    """
    audio_stft = librosa.stft(audio, n_fft=N_FFT)

    low_index = index * band_width
    high_index = low_index + band_width - 1

    # One boolean mask over the STFT rows replaces the per-bin Python loop.
    band_mask = (fft_freq >= mel_freq[low_index]) & (fft_freq <= mel_freq[high_index])
    audio_stft[band_mask, :] = 0

    # Pin the output to the input length: istft otherwise returns a whole number
    # of hops and can drop the trailing samples, which would shift the frame
    # count of any spectrogram computed from the result.
    return librosa.istft(audio_stft, n_fft=N_FFT, length=np.shape(audio)[-1])

def run_loop(audio_fname, model, n_bands=16):
    """Classify a clip once per ablated frequency band.

    For every band index in ``range(n_bands)`` the clip is passed through
    ``ablate_features``, converted to a log-mel spectrogram and fed to
    ``model``; the highest-scoring class is looked up in ``scapper_scenes``.

    Parameters
    ----------
    audio_fname : str
        Path of the audio file to load (resampled to ``SAMPLE_RATE``).
    model : torch.nn.Module
        Classifier, already placed on ``device``.
    n_bands : int, optional
        Number of band indices to ablate, one at a time. Defaults to 16, the
        value previously hard-coded in the loop.

    Returns
    -------
    list
        One predicted scene label per ablated band, in band order.
    """
    # The sample rate returned by librosa equals SAMPLE_RATE, so it is discarded.
    audio, _ = librosa.load(audio_fname, sr=SAMPLE_RATE)

    # Evaluation mode and gradient-free execution are loop invariant, so they
    # are set up once instead of on every iteration.
    model.eval()

    preds_list = []
    with torch.no_grad():
        for band in range(n_bands):
            y_new = ablate_features(band, audio)
            # unsqueeze(0) adds a leading axis of size 1 before the model call.
            log_mel = torch.from_numpy(generate_log_mel(y_new, SAMPLE_RATE)).unsqueeze(0)

            scores = model(log_mel.to(device))
            # sigmoid is monotonic, so applying it before argmax cannot change
            # the winner; it only risks collapsing near-equal scores into a tie.
            class_idx = int(torch.argmax(scores))
            preds_list.append(scapper_scenes[class_idx])
    return preds_list


In [51]:
# Clip used for one-off experiments with the helpers defined above; the path is
# relative to the notebook's working directory.
# NOTE(review): the directory is spelled 'sythenticSoundscenes' - confirm this
# matches the folder name on disk before "correcting" it.
sample_file = '../audioData/sythenticSoundscenes/test/tube1_0.wav'
# Disabled alternative input: an absolute, machine-specific path to a TUT Urban 2018 recording.
#sample_file = '/work/dpandya/giggityGit/audioData/TUTUrban2018/developmentDataset/TUT-urban-acoustic-scenes-2018-development/audio/bus-paris-26-911-a.wav'
# Disabled manual check: load the clip and run the band-ablation loop on it.
#wav, sr = librosa.load(sample_file, sr=SAMPLE_RATE)
#run_loop(sample_file, model)


In [52]:
# Keep only the rows where the stored prediction equals the ground-truth scene.
is_correct = scapper_df['acoustic_scene_label'] == scapper_df['preds']
true_preds = scapper_df[is_correct]

# Empty accumulators: one independent list per scene label, plus the list of
# file names that get sampled.
feat_ablation_preds = {scene: [] for scene in scapper_scenes}
random_files = []



Unnamed: 0,audio_fileNames,label_fileNames,acoustic_scene_label,events_label_list,preds
0,bus10_0.wav,bus10_0.jams,bus,"['cough', 'phone', 'keys', 'phone', 'cough', '...",bus
1,bus10_1.wav,bus10_1.jams,bus,"['keys', 'clearthroat', 'laughter', 'keys', 'p...",bus
2,bus10_2.wav,bus10_2.jams,bus,"['laughter', 'keys', 'phone', 'phone', 'cough'...",bus
3,bus10_3.wav,bus10_3.jams,bus,"['phone', 'keys', 'phone', 'laughter', 'speech...",bus
4,bus10_4.wav,bus10_4.jams,bus,"['laughter', 'keys', 'phone', 'cough', 'cleart...",bus


In [161]:

# NOTE(review): `lib_freq`, `mel_F` and `wav_stft` are not defined in any cell
# visible in this notebook, and this cell's execution count (161) comes from an
# earlier session - on a fresh kernel it will raise NameError. The logic mirrors
# ablate_features with the band limits fixed to entries 4 and 7 of the mel grid;
# presumably `lib_freq`/`mel_F` correspond to `fft_freq`/`mel_freq` - confirm,
# then replace this cell with a call to ablate_features or delete it.

# Boolean masks selecting rows at or above entry 4 and at or below entry 7.
l0 = lib_freq>=mel_F[4]
l1 = lib_freq<=mel_F[7]

# Zero every STFT row that satisfies both masks (mutates `wav_stft` in place).
for i in range(len(l0)):
    if l0[i] & l1[i]:
        wav_stft[i,:] = np.complex64(0+0j)

# Back to the time domain with the modified spectrum.
y_new = librosa.istft(wav_stft, n_fft=N_FFT)