In [1]:
import torch
import torch.nn as nn
from torchsummary import summary
import torch.utils.data as data_utils
import numpy as np
import pandas as pd
import os, sys
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile 
from scipy.io.wavfile import write
from scipy.signal import hilbert, chirp
import wave
import pylab as pl
import librosa
import librosa.display
import pickle
import yaml

print("how many gpu: ", torch.cuda.device_count())
#####################################feature extraction#####################
def extract_mel(data,fs):
    data = data / (np.max(np.abs(data))) 
    S = librosa.feature.melspectrogram(y=data, sr=fs, n_mels=128,  n_fft=2048, hop_length=1024)#,fmin = 100,fmax=16000)
    return librosa.power_to_db(S)

PATH= 'applications/data/TUT-rare-sound-events-2017-development/generated_data/mixtures_devtrain_0367e094f3f5c81ef017d128ebff4a3c/'
files = [file for file in os.listdir(PATH+"audio/") if (file.endswith(".wav") and "gunshot" in file)]

featslist_dev=[]
for i in range(500):
    wave_data, fs = librosa.load(PATH+"audio/"+files[i], sr=None)
    feats = extract_mel(wave_data[:-1],fs)
    featslist_dev.append(feats)
y = yaml.load(open(PATH+"meta/mixture_recipes_devtrain_gunshot.yaml"))
ylist_dev = np.empty(shape=(500,1,1292))
for i in range(len(y)):
    #print(i)
    loc = files.index(y[i]['mixture_audio_filename'])
    try:
        start_frame = int(y[i]['event_start_in_mixture_seconds']*1292/30)
        end_frame = int((y[i]['event_start_in_mixture_seconds']+y[i]['event_length_seconds'])*1292/30)
        ylist_dev[loc] = 0
        ylist_dev[loc][0][start_frame:end_frame] = 1
    except KeyError:
        ylist_dev[loc] = 0
        
PATH= 'applications/data/TUT-rare-sound-events-2017-development/generated_data/mixtures_devtest_0367e094f3f5c81ef017d128ebff4a3c/'
files = [file for file in os.listdir(PATH+"audio/") if (file.endswith(".wav") and "gunshot" in file)]        
featslist_test=[]
for i in range(len(files)):
    wave_data, fs = librosa.load(PATH+"audio/"+files[i], sr=None)
    feats = extract_mel(wave_data[:-1],fs)
    featslist_test.append(feats)
y = yaml.load(open(PATH+"meta/mixture_recipes_devtest_gunshot.yaml"))
ylist_test = np.empty(shape=(len(files),1,1292))
for i in range(len(y)):
    #print(i)
    loc = files.index(y[i]['mixture_audio_filename'])
    try:
        start_frame = int(y[i]['event_start_in_mixture_seconds']*1292/30)
        end_frame = int((y[i]['event_start_in_mixture_seconds']+y[i]['event_length_seconds'])*1292/30)
        ylist_test[loc] = 0
        ylist_test[loc][0][start_frame:end_frame] = 1
    except KeyError:
        ylist_test[loc] = 0
print("training feature extraction completed.")

how many gpu:  4
training feature extraction completed.


In [2]:
############################net#######################
class CRNN(nn.Module):    
    def __init__(self):
        super(CRNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=128, kernel_size=(32,))
        self.maxpool1 = nn.MaxPool1d(kernel_size=1, stride=97)
        self.lstm = nn.LSTM(input_size=128, hidden_size=128, num_layers=2, dropout=0.3)
        #batch_size=5
        #self.h0 = torch.randn(2, 256, 128)
        #self.c0 = torch.randn(2, 256, 128)
        self.fc = nn.Linear(128, 1)
        
    def forward(self, input):       
        x_all = []
        for slice_idx in range(input.shape[3]):
            #print(input[:,:,:,slice_idx])
            x = self.conv1(input[:,:,:,slice_idx])            
            x = self.maxpool1(x)
            x = x.view(input.shape[0],-1)
            x_all.append(x)  
        x_all = torch.stack(x_all)
        x_all = torch.transpose(x_all, 0, 1)
        output, (hn, cn) = self.lstm(x_all)
        output = self.fc(output)
        output = torch.sigmoid(output)
        output = output.view(-1, 1, 1292)
        #print("output.shape", output.shape)
        return output

########################### model training ############################
model = CRNN()
print(model)

tensor_x = torch.stack([torch.Tensor(i) for i in featslist_dev]) # transform to torch tensors
tensor_x = tensor_x.view(-1, 1, 128, 1292)
print("tensor_x.shape: ", tensor_x.shape)
tensor_y = torch.stack([torch.Tensor(i) for i in ylist_dev])
print("tensor_y.shape: ", tensor_y.shape)
print("\n")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.01)
cre = nn.BCELoss()

def train_and_eval(model, optimizer, epoch, BATCH_SIZE):
    train_tensor = data_utils.TensorDataset(tensor_x, tensor_y)
    train_loader = data_utils.DataLoader(train_tensor, batch_size=BATCH_SIZE, shuffle=True,drop_last =True)
    running_loss = 0.0
    for ipoch in range(epoch):
        for batch_idx, (data, target) in enumerate(train_loader):
            #print ("batch_idx = ", batch_idx)
            optimizer.zero_grad()
            output = model(data)
            target=target.float()
            #print ("output shape ", output.shape)
            #print ("target shape ", target.shape)
            loss = cre(output, target)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if batch_idx % 2 == 1:    # print every 2 mini-batches
                print('[epoch:%d, batch_idx:%d] loss: %.3f' %(ipoch, batch_idx + 1, running_loss /2))                
            running_loss = 0.0  
        eval(model, featslist_dev, ylist_dev, 0.5, 22)
        eval(model, featslist_test, ylist_test, 0.5, 22)
        torch.save(model, 'applications/data/'+str(ipoch))

########################### model evaluation ############################
def eval(model, data, ground_truth, threshold, onset_tor):
    TTa=0
    TTb=0
    TF=0
    FT=0
    FF=0
    for i in range(500):
        if (max(ground_truth[i][0])==1):
            has_gunshot = True
        else:
            has_gunshot = False
        ts = model(torch.Tensor(data[i]).view(1,1,128,1292))
        if (max(ts.squeeze())>threshold):
            pred_gunshot = True
        else:
            pred_gunshot = False
        if (np.argmax(ground_truth[i][0]>0)!=0 and abs(int(np.argmax(ts.squeeze()>threshold))-np.argmax(ground_truth[i][0]>0))<onset_tor):
            pred_onset = True
        else:
            pred_onset = False
        if pred_gunshot and has_gunshot:
            if pred_onset:
                TTa+=1
            else:
                TTb+=1
        elif (not pred_gunshot) and has_gunshot:
            TF+=1
        elif (pred_gunshot) and (not has_gunshot):
            FT+=1
        elif (not pred_gunshot) and (not has_gunshot):
            FF+=1
    #print ("detected gunshot and onset time: ", TTa)
    #print ("detected gunshot but inaccurate time: ", TTb)
    #print ("failed to detect gunshot: ", TF)
    #print ("detected fake gunshot: ", FT)
    #print ("correct calm : ", FF)
    print("Error rate = ", float(TTb+TF+FT)/(500))
    precision = float(TTa)/(TTa+FT)
    recall = float(TTa)/(TTa+TTb+TF)
    print("F-score = ", 200*precision*recall/(precision+recall+10**(-8)))

CRNN(
  (conv1): Conv1d(1, 128, kernel_size=(32,), stride=(1,))
  (maxpool1): MaxPool1d(kernel_size=1, stride=97, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(128, 128, num_layers=2, dropout=0.3)
  (fc): Linear(in_features=128, out_features=1, bias=True)
)
tensor_x.shape:  torch.Size([500, 1, 128, 1292])
tensor_y.shape:  torch.Size([500, 1, 1292])




In [None]:
torch.save(model, 'applications/data/m')
model = torch.load('applications/data/m') 
eval(model, featslist_dev, ylist_dev, 0.5, 22)

In [3]:
train_and_eval(model, optimizer, 15, 16)

[epoch:0, batch_idx:2] loss: 0.303
[epoch:0, batch_idx:4] loss: 0.225
[epoch:0, batch_idx:6] loss: 0.165
[epoch:0, batch_idx:8] loss: 0.114
[epoch:0, batch_idx:10] loss: 0.093
[epoch:0, batch_idx:12] loss: 0.085
[epoch:0, batch_idx:14] loss: 0.054
[epoch:0, batch_idx:16] loss: 0.050
[epoch:0, batch_idx:18] loss: 0.081
[epoch:0, batch_idx:20] loss: 0.070
[epoch:0, batch_idx:22] loss: 0.048
[epoch:0, batch_idx:24] loss: 0.049
[epoch:0, batch_idx:26] loss: 0.066
[epoch:0, batch_idx:28] loss: 0.084
[epoch:0, batch_idx:30] loss: 0.045
Error rate =  0.49
F-score =  29.799426527696824
Error rate =  0.51
F-score =  30.894308506106743
[epoch:1, batch_idx:2] loss: 0.073
[epoch:1, batch_idx:4] loss: 0.067
[epoch:1, batch_idx:6] loss: 0.080
[epoch:1, batch_idx:8] loss: 0.066
[epoch:1, batch_idx:10] loss: 0.082
[epoch:1, batch_idx:12] loss: 0.061
[epoch:1, batch_idx:14] loss: 0.085
[epoch:1, batch_idx:16] loss: 0.092
[epoch:1, batch_idx:18] loss: 0.042
[epoch:1, batch_idx:20] loss: 0.058
[epoch:1, 

Error rate =  0.428
F-score =  50.691243751194556
[epoch:13, batch_idx:2] loss: 0.042
[epoch:13, batch_idx:4] loss: 0.084
[epoch:13, batch_idx:6] loss: 0.055
[epoch:13, batch_idx:8] loss: 0.035
[epoch:13, batch_idx:10] loss: 0.057
[epoch:13, batch_idx:12] loss: 0.052
[epoch:13, batch_idx:14] loss: 0.044
[epoch:13, batch_idx:16] loss: 0.052
[epoch:13, batch_idx:18] loss: 0.056
[epoch:13, batch_idx:20] loss: 0.058
[epoch:13, batch_idx:22] loss: 0.057
[epoch:13, batch_idx:24] loss: 0.059
[epoch:13, batch_idx:26] loss: 0.031
[epoch:13, batch_idx:28] loss: 0.074
[epoch:13, batch_idx:30] loss: 0.052
Error rate =  0.384
F-score =  55.76036817515769
Error rate =  0.368
F-score =  62.139917195888174
[epoch:14, batch_idx:2] loss: 0.064
[epoch:14, batch_idx:4] loss: 0.073
[epoch:14, batch_idx:6] loss: 0.077
[epoch:14, batch_idx:8] loss: 0.047
[epoch:14, batch_idx:10] loss: 0.042
[epoch:14, batch_idx:12] loss: 0.059
[epoch:14, batch_idx:14] loss: 0.045
[epoch:14, batch_idx:16] loss: 0.068
[epoch:1

In [4]:
train_and_eval(model, optimizer, 15, 32)

[epoch:0, batch_idx:2] loss: 0.033
[epoch:0, batch_idx:4] loss: 0.053
[epoch:0, batch_idx:6] loss: 0.057
[epoch:0, batch_idx:8] loss: 0.060
[epoch:0, batch_idx:10] loss: 0.060
[epoch:0, batch_idx:12] loss: 0.078
[epoch:0, batch_idx:14] loss: 0.052
Error rate =  0.446
F-score =  40.214476769760445
Error rate =  0.432
F-score =  45.728642748920485
[epoch:1, batch_idx:2] loss: 0.048
[epoch:1, batch_idx:4] loss: 0.057
[epoch:1, batch_idx:6] loss: 0.061
[epoch:1, batch_idx:8] loss: 0.065
[epoch:1, batch_idx:10] loss: 0.051
[epoch:1, batch_idx:12] loss: 0.047
[epoch:1, batch_idx:14] loss: 0.036
Error rate =  0.396
F-score =  53.30188630851728
Error rate =  0.382
F-score =  59.10064190078363
[epoch:2, batch_idx:2] loss: 0.055
[epoch:2, batch_idx:4] loss: 0.057
[epoch:2, batch_idx:6] loss: 0.062
[epoch:2, batch_idx:8] loss: 0.055
[epoch:2, batch_idx:10] loss: 0.056
[epoch:2, batch_idx:12] loss: 0.060
[epoch:2, batch_idx:14] loss: 0.061
Error rate =  0.392
F-score =  54.62962914201818
Error rat

In [5]:
train_and_eval(model, optimizer, 15, 64)

[epoch:0, batch_idx:2] loss: 0.062
[epoch:0, batch_idx:4] loss: 0.053
[epoch:0, batch_idx:6] loss: 0.054
Error rate =  0.422
F-score =  44.619422120955356
Error rate =  0.41
F-score =  50.3631956480955
[epoch:1, batch_idx:2] loss: 0.054
[epoch:1, batch_idx:4] loss: 0.047
[epoch:1, batch_idx:6] loss: 0.063
Error rate =  0.44
F-score =  36.04651123073283
Error rate =  0.438
F-score =  40.326975042505325
[epoch:2, batch_idx:2] loss: 0.058
[epoch:2, batch_idx:4] loss: 0.056
[epoch:2, batch_idx:6] loss: 0.043
Error rate =  0.458
F-score =  26.83706038134512
Error rate =  0.474
F-score =  26.625386647049236
[epoch:3, batch_idx:2] loss: 0.052
[epoch:3, batch_idx:4] loss: 0.045
[epoch:3, batch_idx:6] loss: 0.058
Error rate =  0.432
F-score =  39.32584227843707
Error rate =  0.432
F-score =  42.245988861563106
[epoch:4, batch_idx:2] loss: 0.054
[epoch:4, batch_idx:4] loss: 0.048
[epoch:4, batch_idx:6] loss: 0.060
Error rate =  0.422
F-score =  44.3271763319665
Error rate =  0.428
F-score =  47.