In [3]:
import panns_inference
import librosa
from panns_inference import AudioTagging, SoundEventDetection, labels
import numpy as np


model_path00 = 'models/Cnn14_mAP=0.431.pth'
model_path01 = 'models/Cnn14_DecisionLevelMax.pth'

audio_path = '../audioData/TUTUrban2018/developmentDataset/TUT-urban-acoustic-scenes-2018-development/audio/airport-barcelona-0-2-a.wav'

def get_event_list(clipwise_output):

    # With the slicing, we are reversing the sorted index to get the descending order of event probs.
    sorted_indexes = np.argsort(clipwise_output)[::-1]
    event_list = []

    # Creating a list of events with more than 0.05 prob
    for i in range(0, len(sorted_indexes)):
        event_prob = clipwise_output[sorted_indexes[i]]
        if event_prob > 0.05:
            event_list.append(np.array(labels)[sorted_indexes[i]])
    
    return event_list
    

def get_panns_inference(audio_file_name, model_path):

    (audio, _) = librosa.core.load(audio_file_name, sr=32000, mono=True)
    audio = audio[None, :]
    at = AudioTagging(checkpoint_path=model_path, device='cuda')
    (clipwise_output, embedding) = at.inference(audio)

    event_list = get_event_list(clipwise_output[0])
    return event_list

In [4]:
import pandas as pd
import os

tut_train_csv = 'Datasets/TUT18_train.csv'
tut_train_df = pd.read_csv(tut_train_csv)
tut_audio_dir = '../audioData/TUTUrban2018/developmentDataset/TUT-urban-acoustic-scenes-2018-development/'

event_label_preds = []

for i in range(0,len(tut_train_df)):
    audio_fname = os.path.join(tut_audio_dir, tut_train_df['files'][i])
    event_label_preds.append(get_panns_inference(audio_fname, model_path00))
    print(i)

Checkpoint path: models/Cnn14_mAP=0.431.pth


--2024-04-04 14:52:25--  https://zenodo.org/record/3987831/files/Cnn14_mAP%3D0.431.pth?download=1
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.103.159, 188.184.98.238, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/3987831/files/Cnn14_mAP%3D0.431.pth [following]
--2024-04-04 14:52:25--  https://zenodo.org/records/3987831/files/Cnn14_mAP%3D0.431.pth
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 327428481 (312M) [application/octet-stream]
Saving to: 'models/Cnn14_mAP=0.431.pth'

     0K .......... .......... .......... .......... ..........  0% 1.24M 4m12s
    50K .......... .......... .......... .......... ..........  0% 4.59M 2m40s
   100K .......... .......... .......... .......... ..........  0% 61.5M 1m48s
   150K .......... .......... .......... .......... ..........  0% 37.5M 83s
   200K .......... ...

GPU number: 4
0
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
1
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
2
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
3
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
4
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
5
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
6
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
7
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
8
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
9
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
10
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
11
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
12
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
13
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
14
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
15
Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4
16
Checkpoint path: 

In [5]:
tut_train_df['panns_inference_events_list'] = event_label_preds

tut_train_df

Unnamed: 0,files,labels,panns_inference_events_list
0,audio/airport-barcelona-0-0-a.wav,airport,"[Speech, Clip-clop, Animal, Horse, Outside, ur..."
1,audio/airport-barcelona-0-1-a.wav,airport,"[Speech, Run, Outside, urban or manmade, Vehic..."
2,audio/airport-barcelona-0-2-a.wav,airport,"[Speech, Run, Outside, urban or manmade, Vehic..."
3,audio/airport-barcelona-0-3-a.wav,airport,"[Speech, Clip-clop, Horse, Animal, Outside, ur..."
4,audio/airport-barcelona-0-4-a.wav,airport,"[Speech, Animal, Horse, Clip-clop, Music, Whis..."
...,...,...,...
6117,audio/tram-vienna-202-6117-a.wav,tram,"[Speech, Vehicle, Car, Field recording, Outsid..."
6118,audio/tram-vienna-202-6118-a.wav,tram,"[Speech, Silence, Vehicle, Animal, Outside, ru..."
6119,audio/tram-vienna-202-6119-a.wav,tram,"[Speech, Vehicle, Silence, Car, Outside, urban..."
6120,audio/tram-vienna-202-6120-a.wav,tram,"[Speech, Vehicle, Animal, Boat, Water vehicle,..."


In [7]:
tut_train_df.to_csv('predictions/TUT18_train_panns_eventsInference.csv', index=False)

In [61]:
get_panns_inference(audio_path, model_path00)

Checkpoint path: models/Cnn14_mAP=0.431.pth
GPU number: 4


['Speech',
 'Run',
 'Outside, urban or manmade',
 'Vehicle',
 'Clip-clop',
 'Male speech, man speaking',
 'Horse',
 'Animal']

In [50]:
sorted_indexes = np.argsort(clipwise_output[0])[::-1]
#np.array(labels)[sorte]
#sorted_indexes
#np.array(labels)[sorted_indexes[0]], clipwise_output[0][sorted_indexes[0]]
prob_list = []
for i in range(0,len(sorted_indexes)):
    if (clipwise_output[0][sorted_indexes[i]]) > 0.05:
        print(clipwise_output[0][sorted_indexes[i]])

0.7926502
0.2956216
0.1845699
0.14236389
0.08920072
0.06224627
0.058570817
0.050119467


In [59]:
import torch
import torchaudio
import os
import numpy as np
import pandas as pd
import json

DIR = '../audioData/sythenticSoundscenes/test/'
dir_tut = '../audioData/TUTUrban2018/developmentDataset/TUT-urban-acoustic-scenes-2018-development/'

In [60]:
tut_csv = os.path.join(dir_tut,'train.csv')
tut_df = pd.read_csv(tut_csv)
tut_df

Unnamed: 0,files,labels
0,audio/airport-barcelona-0-0-a.wav,airport
1,audio/airport-barcelona-0-1-a.wav,airport
2,audio/airport-barcelona-0-2-a.wav,airport
3,audio/airport-barcelona-0-3-a.wav,airport
4,audio/airport-barcelona-0-4-a.wav,airport
...,...,...
6117,audio/tram-vienna-202-6117-a.wav,tram
6118,audio/tram-vienna-202-6118-a.wav,tram
6119,audio/tram-vienna-202-6119-a.wav,tram
6120,audio/tram-vienna-202-6120-a.wav,tram


In [None]:
all_files = os.listdir('../audioData/sythenticSoundscenes/test/')
aud_file_list = []

for file in all_files:
    if file.split('.')[-1] == 'wav':
        aud_file_list.append(file)



In [None]:
lab_file_list = []
for file in all_files:
    if file.split('.')[-1] == 'jams':
        lab_file_list.append(file)

In [None]:
from torch.utils.data import Dataset

class scraperDataset(Dataset):

    def __init__(self, dataset_csv, data_dir, only_scene=False):

        self.dataset_csv = dataset_csv
        self.data_directory = data_dir
        self.only_scene = only_scene
        self.dataframe = pd.read_csv(dataset_csv)

        self.audio_files = self.dataframe['audio_fileNames']
        self.label_files = self.dataframe['label_fileNames']
        self.scene_labels = self.dataframe['acoustic_scene_label']
        self.events_label_list = self.dataframe['events_label_list']

    def __len__(self):
        return len(self.audio_files)
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Load Audio file
        audio_file = os.path.join(self.data_directory, self.audio_files[idx])
        audio_data, sr = torchaudio.load(audio_file)

        if self.only_scene:
            sample = {'audio':audio_data, 'scene_label':self.scene_labels[idx]}
        else:
            sample = {'audio':audio_data, 'scene_label':self.scene_labels[idx], 'event_list':self.events_label_list[idx]}
        
        return sample


In [None]:
file_count = pd.DataFrame()
file_count['audio_fileNames'] = aud_file_list
file_count['label_fileNames'] = lab_file_list


In [None]:
sam_file = '../audioData/sythenticSoundscenes/test/restaurant8_downshift_0.wav'

def get_label_data_json(audio_filename):
    dir_name, fname = os.path.split(audio_filename)
    lab_fname = os.path.join(dir_name, (fname.split('.')[0] + '.jams'))
    with open(lab_fname, 'r') as f:
        label_data_json = json.load(f)

    return label_data_json

def get_justLabels(audio_filename, event_labels=True):
    scene_label = []
    event_label = []
    label_data_json = get_label_data_json(audio_filename)
    
    for events in label_data_json['annotations'][0]['data']:
        event_label.append(events['value']['label'])
    scene_label.append(event_label.pop(0))
    #print(scene_label, event_label)
    if event_labels:
        return scene_label, event_label
    else:
        return scene_label

def get_labels_with_timestamps(audio_filename):
    events_dict = {}
    label_data_json = get_label_data_json(audio_filename)

    for events in label_data_json['annotations'][0]['data']:
        start_time = events['value']['event_time']
        duration = events['value']['event_duration']
        events_dict[events['value']['label']] = [start_time, duration]

    return events_dict

get_justLabels(sam_file, event_labels=False)

In [None]:
activity_labels_list = []
for file in aud_file_list:
    activity_labels_list.append(get_justLabels(os.path.join(DIR,file), event_labels=False)[0])

In [None]:
import re

pattern = re.compile(r'\d+')

for i in range(0,len(activity_labels_list)):
    activity_labels_list[i] = pattern.sub('', activity_labels_list[i])

activity_labels_list[-1]

In [None]:
file_count['acoustic_scene_label'] = activity_labels_list
file_count

In [None]:
event_label_list = []
for file in aud_file_list:
    event_label_list.append(get_justLabels(os.path.join(DIR, file), True)[1])

len(event_label_list)

In [None]:
file_count['events_label_list'] = event_label_list
file_count

In [None]:
file_count.to_csv('Datasets/scrapper_dataset.csv', index=False)

In [None]:
file_count['acoustic_scene_label'].unique()

In [None]:
sam_file = '../audioData/sythenticSoundscenes/test/bus1_0.jams'

with open(sam_file, 'r') as f:
    data = json.load(f)


#data = json.dumps(data)
for i in data['annotations'][0]['data']:
    print(i['value']['label'])

In [None]:
data