In [2]:
import pandas as pd
import numpy as np
import torch
import torchaudio
import textgrids
import torchaudio.transforms as T

SAMPLE_RATE = 16000

In [3]:
#df['audio_files'], df['annotation_files']

class AudioUtils():

    def resample_audio(signal, org_sr):
        resampler = T.Resample(org_sr, SAMPLE_RATE)
        resampled_sig = resampler(signal)
        return resampled_sig
    
    def open(aud_fn):
        sig, sr = torchaudio.load(aud_fn)
        duration = sig.shape[1]/sr
        #print(sig.shape)
        if sr != SAMPLE_RATE:
            sig = AudioUtils.resample_audio(sig, sr)

        # Converting stereo to mono
        if sig.shape[0] == 2:
            sig = sig.mean(dim=0).unsqueeze(0)
        
        if sig.shape[1]%SAMPLE_RATE == 0:
            duration = sig.shape[1]/SAMPLE_RATE
            sig = sig.squeeze().numpy()
        else:
            sig = sig.squeeze().numpy()
            #print(sig_np.shape)
            en = (int)((np.round(duration)+1)*SAMPLE_RATE - sig.shape[0])
            sig = np.pad(sig, (0,en), mode = 'constant')
            #sig = torch.from_numpy(sig_np).unsqueeze(0)
            duration = sig.shape[0] / SAMPLE_RATE
        
        return sig, SAMPLE_RATE, duration
    
    def get_second_wise_mfcc(signal, duration):
        mfcc_list = []
        MFCC = T.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 100, "n_mels": 23, "center": False})
        for i in range(0,(int)(duration)):
            mfcc_list.append(MFCC(signal[0][i:(i+SAMPLE_RATE)].unsqueeze(0)).squeeze())

        return mfcc_list

In [4]:
class AnnotUtils():

    def get_speech_secs(fname):
        grid = textgrids.TextGrid(fname)
        speech_secs = []
        for i in grid['silences']:
            if i.text == '1':
                #speech_secs.append([(np.round(i.xmin, decimals=2)), (np.round(i.xmax, decimals=2))])
                speech_secs.append([(int)(np.round(i.xmin, decimals=2)), (int)(np.round(i.xmax, decimals=2))])
        return speech_secs
    

    def get_labs_for_secs(speech_secs, duration):
        
        labels = [i*0 for i in range(0, (int)(duration))]

        for i in range(0,len(speech_secs)):

            if speech_secs[i][0] == speech_secs[i][1]:
                labels[speech_secs[i][0]-1] = 1
            else:
                for j in range(speech_secs[i][0], speech_secs[i][1]):
                    
                    labels[j-1] = 1
                    labels[j] = 1

        return labels

In [31]:
from torch.utils.data import DataLoader, Dataset, random_split

MFCC = T.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=13, melkwargs={"n_fft": 400, "hop_length": 100, "n_mels": 23, "center": False})

class VAD_Dataset(Dataset):
    def __init__(self, df) -> None:
        #super().__init__()
        self.df = df
        self.sr = 16000

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):

        aud_part = self.df.loc[index][0]
        labels = self.df.loc[index][1]
        
        if aud_part[-2] == '_':
            n_sec = (int)(aud_part[-1])
            aud_path = aud_part[:-2]+'.wav'
        elif aud_part[-3] == '_':
            n_sec = (int)(aud_part[-2:])
            aud_path = aud_part[:-3]+'.wav'
        #print(n_sec)
        signal00, sam00, dur00 = AudioUtils.open(aud_path)
        sig00 = torch.from_numpy(signal00[n_sec*SAMPLE_RATE:((n_sec+1)*SAMPLE_RATE)]).unsqueeze(0)
        #print(sig00.shape)
        #labels = torch.Tensor(labels).to(dtype=torch.long)
        #mfcc_list = AudioUtils.get_second_wise_mfcc(sig00, dur00)
        mfcc_tens = MFCC(sig00).to(dtype=torch.float).squeeze()
        #labels = AnnotUtils.get_labs_for_secs(AnnotUtils.get_speech_secs(annot_path), dur)
        for i in range(0, len(mfcc_tens)):
            mfcc_tens[i] = torch.nn.functional.normalize(mfcc_tens[i],dim=0)
            
        mfcc_tens = torch.flatten(mfcc_tens)
        
        return mfcc_tens, labels

In [30]:
sdus = torch.randn(1,16000)
mfccs = MFCC(sdus).to(dtype=torch.float)

for i in range(0, len(mfccs)):
    mfccs[i] = torch.nn.functional.normalize(mfccs[i], dim=0)

In [32]:
df2 = pd.read_csv('secWise_labs.csv')
myds = VAD_Dataset(df=df2)

In [33]:
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

In [34]:
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(val_ds, batch_size=32, shuffle=True)

In [35]:
from sklearn.metrics import accuracy_score, f1_score

def acc_fn(lab, pred):
    pred = torch.round(pred)
    correct = (lab == pred).float()
    return correct.mean().item()

In [36]:
for batch in test_loader:
    inpus, labs = batch
    print(inpus.shape, labs.shape, labs.dtype)

torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([32, 2041]) torch.Size([32]) torch.int64
torch.Size([

In [38]:
import torch.nn as nn

sampl = torch.randn(32,2041)
#sampl = torch.flatten(sampl)
print(sampl.shape)

h1 = nn.Linear(2041, 1024)
ac = nn.ReLU()
h2 = nn.Linear(1024,128)
h3 = nn.Linear(128,1)
si = nn.Sigmoid()

x = ac(h1(sampl))
x = ac(h2(x))
x = ac(h3(x))
x.dtype, si(x).dtype, si(x.squeeze()), x.shape

torch.Size([32, 2041])


(torch.float32,
 torch.float32,
 tensor([0.5366, 0.5117, 0.5537, 0.5059, 0.5368, 0.5480, 0.5326, 0.5449, 0.5217,
         0.5294, 0.5724, 0.5275, 0.5254, 0.5051, 0.5179, 0.5208, 0.5138, 0.5419,
         0.5000, 0.5262, 0.5000, 0.5027, 0.5182, 0.5362, 0.5000, 0.5080, 0.5192,
         0.5234, 0.5473, 0.5269, 0.5268, 0.5067], grad_fn=<SigmoidBackward0>),
 torch.Size([32, 1]))

In [113]:
sampl = torch.randn(1,1,13,81)

conv1 = nn.Conv2d(1, 16, kernel_size=(3, 3), padding=1)
relu1 = nn.ReLU()
maxpool1 = nn.MaxPool2d(kernel_size=(2, 2))

conv2 = nn.Conv2d(16,32,kernel_size=(3), padding=1)
relu2 = nn.ReLU()
maxpool2 = nn.MaxPool2d(kernel_size=(2, 2))

x = maxpool1(relu1(conv1(sampl)))
x = maxpool2(relu2(conv2(x)))
x = x.view(x.size(0), -1)
print(x.shape)


print(type(torch.round(x)))

torch.Size([1, 1920])
<class 'torch.Tensor'>


In [48]:
import torch.nn as nn

class SpeechDetectionCNN00(nn.Module):
    def __init__(self):
        super(SpeechDetectionCNN00, self).__init__()
        self.h1 = nn.Linear(2041, 1024)
        self.ac = nn.ReLU()
        self.h2 = nn.Linear(1024,256)
        self.h3 = nn.Linear(256,128)
        self.h4 = nn.Linear(128,1)
        self.si = nn.Sigmoid()

    def forward(self, x):
        
        x = self.h1(x)
        x = self.ac(x)
        
        x = self.h2(x)
        x = self.ac(x)
        
        x = self.h3(x)
        x = self.ac(x)

        x = self.h4(x)
        x = self.si(x)
        #x = self.ac(x)
        
        #x = torch.sigmoid(x)
        return x

In [49]:
torch.cuda.empty_cache()
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = SpeechDetectionCNN00().to(device)

#criterion = nn.BCEWithLogitsLoss()
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

In [50]:
import statistics

from sklearn.metrics import accuracy_score
torch.manual_seed(13)
torch.cuda.manual_seed(13)

epochs = 20

for epoch in range(epochs):

    acc = []

    for batch in train_loader:
        
        inputs, labels = batch
        inputs = inputs.to(device, dtype=torch.float)
        #labels = labels.to(device, dtype=torch.int64)
        labels = labels.to(device)

        model.train()

        optimizer.zero_grad()
        outputs = torch.round(model(inputs).squeeze())
        #print(outputs.type(), labels.type())
        
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
    
        acc.append(acc_fn(labels.float(), outputs.float()))    

    #print(targets.dtype, preds.dtype)
    accuracie = statistics.mean(acc)    

    print(f'Epoch [{epoch+1}/{epochs}], Accuracy: {accuracie}, Loss: {loss.item()}')
    

Epoch [1/20], Accuracy: 0.6273232509895247, Loss: 42.85714340209961
Epoch [2/20], Accuracy: 0.6273232509895247, Loss: 42.85714340209961
Epoch [3/20], Accuracy: 0.6281432217481185, Loss: 28.571430206298828
Epoch [4/20], Accuracy: 0.6277332366729269, Loss: 35.71428680419922
Epoch [5/20], Accuracy: 0.6281432217481185, Loss: 28.571430206298828
Epoch [6/20], Accuracy: 0.6273232509895247, Loss: 42.85714340209961
Epoch [7/20], Accuracy: 0.628963193114923, Loss: 14.285715103149414
Epoch [8/20], Accuracy: 0.6265032802309308, Loss: 57.142860412597656
Epoch [9/20], Accuracy: 0.6281432217481185, Loss: 28.571430206298828
Epoch [10/20], Accuracy: 0.6273232509895247, Loss: 42.85714340209961
Epoch [11/20], Accuracy: 0.6293731781901145, Loss: 7.142857551574707
Epoch [12/20], Accuracy: 0.6285532074315208, Loss: 21.428571701049805
Epoch [13/20], Accuracy: 0.6281432217481185, Loss: 28.571430206298828
Epoch [14/20], Accuracy: 0.6269132653061225, Loss: 50.000003814697266
Epoch [15/20], Accuracy: 0.626503280

In [51]:
from sklearn.metrics import accuracy_score

preds = []
targets = []

for batch in test_loader:

    inputs, labels = batch
    
    inputs = inputs.to(device, dtype=torch.float)
    labels = labels.to(device, dtype=torch.float)
    
    
    model.eval()
    with torch.inference_mode():
        y_preds = torch.round((model(inputs)).squeeze())
        

    targets.extend(labels.cpu().numpy())
    preds.extend(y_preds.cpu().numpy())
    

In [56]:
preds

[1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0

In [54]:
acc = accuracy_score(targets, preds)
print(acc)

0.658974358974359


In [55]:
from sklearn.metrics import f1_score

f1_score = f1_score(targets, preds)
f1_score

0.794435857805255

In [98]:
ex = torch.randn(1,13,81).to(device).unsqueeze(0)

model.eval()
with torch.inference_mode():
    out = model(ex)

torch.round(ex)

tensor([[[[-1., -1.,  0.,  ...,  1.,  0.,  0.],
          [ 0., -1.,  0.,  ...,  0.,  2.,  0.],
          [ 2., -1., -1.,  ..., -1., -1., -0.],
          ...,
          [-0., -0., -1.,  ...,  0.,  1., -1.],
          [-1.,  1., -0.,  ...,  0., -1.,  2.],
          [ 1.,  1.,  0.,  ...,  1., -1., -1.]]]], device='cuda:0')

In [26]:
import torch.nn as nn

samm = np.random.rand(13,81)
samm = torch.from_numpy(samm).to(torch.float)
samm.dtype

torch.float32

In [38]:
m = nn.Conv2d(1, 3, (3), stride=2, padding=(1,1))
ac_layer = nn.ReLU()
norm_la = nn.BatchNorm2d(3)
pool_la = nn.MaxPool2d(4, stride=1)

t_t = ac_layer(m(samm.unsqueeze(0)))
t_t = norm_la(t_t.unsqueeze(0))
t_t = pool_la(t_t)

t_t.shape

torch.Size([1, 3, 4, 38])

In [None]:
def rando(path):
    if path[-2] == '_':
        n_sec = (int)(path[-1])
        aud_path = path[:-2]+'.wav'
    elif path[-3] == '_':
        n_sec = (int)(path[-2:])
        aud_path = path[:-3]+'.wav'
    signal00, sam00, dur00 = AudioUtils.open(aud_path)
    sig00 = torch.from_numpy(signal00[n_sec*SAMPLE_RATE:((n_sec+1)*SAMPLE_RATE)]).unsqueeze(0)
    return sig00.shape

for i in df2['audio_list_secs']:
    if rando(i) != torch.Size([1, 16000]):
        print('Ehhh', i)
