In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchaudio
import torchaudio.transforms as T
import torchaudio.functional as F
import textgrids
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

SAMPLE_RATE = 16000

In [4]:
def get_feats(signal):

    features = []

    MFCC = T.MFCC(SAMPLE_RATE, n_mfcc=13)
    MelSpectrogram = T.MelSpectrogram(SAMPLE_RATE)
    SpectralCentroid = T.SpectralCentroid(SAMPLE_RATE)

    mfcc_feats = MFCC(signal).squeeze()
    melspec_feats = MelSpectrogram(signal).squeeze()
    spectral_centroid_feats = SpectralCentroid(signal).squeeze()
    
    for i in range(0, mfcc_feats.shape[0]):
        features.append(mfcc_feats[i].mean())
        #print(mfcc_feats[i].mean())

    for j in range(0, melspec_feats.shape[0]):
        features.append(melspec_feats[j].mean())
    
    features.extend(spectral_centroid_feats)

    return features

In [123]:
ssa = torch.randn(1,16000)

torch.nn.functional.normalize(torch.Tensor(get_feats(ssa)).squeeze(),dim=0).shape

torch.Size([222])

In [5]:
class AudioUtils():

    def resample_audio(signal, org_sr):
        resampler = T.Resample(org_sr, SAMPLE_RATE)
        resampled_sig = resampler(signal)
        return resampled_sig
    
    def open(aud_fn):
        sig, sr = torchaudio.load(aud_fn)
        duration = sig.shape[1]/sr
        #print(sig.shape)
        if sr != SAMPLE_RATE:
            sig = AudioUtils.resample_audio(sig, sr)

        # Converting stereo to mono
        if sig.shape[0] == 2:
            sig = sig.mean(dim=0).unsqueeze(0)
        
        if sig.shape[1]%SAMPLE_RATE == 0:
            duration = sig.shape[1]/SAMPLE_RATE
            sig = sig.squeeze().numpy()
        else:
            sig = sig.squeeze().numpy()
            #print(sig_np.shape)
            en = (int)((np.round(duration)+1)*SAMPLE_RATE - sig.shape[0])
            sig = np.pad(sig, (0,en), mode = 'constant')
            #sig = torch.from_numpy(sig_np).unsqueeze(0)
            duration = sig.shape[0] / SAMPLE_RATE
        
        return sig, SAMPLE_RATE, duration
    
    def get_second_wise_mfcc(signal, duration):
        mfcc_list = []
        MFCC = T.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=13)
        for i in range(0,(int)(duration)):
            mfcc_list.append(MFCC(signal[0][i:(i+SAMPLE_RATE)].unsqueeze(0)).squeeze())

        return mfcc_list

In [6]:
class AnnotUtils():

    def get_speech_secs(fname):
        grid = textgrids.TextGrid(fname)
        speech_secs = []
        for i in grid['silences']:
            if i.text == '1':
                #speech_secs.append([(np.round(i.xmin, decimals=2)), (np.round(i.xmax, decimals=2))])
                speech_secs.append([(int)(np.round(i.xmin, decimals=2)), (int)(np.round(i.xmax, decimals=2))])
        return speech_secs
    

    def get_labs_for_secs(speech_secs, duration):
        
        labels = [i*0 for i in range(0, (int)(duration))]

        for i in range(0,len(speech_secs)):

            if speech_secs[i][0] == speech_secs[i][1]:
                labels[speech_secs[i][0]-1] = 1
            else:
                for j in range(speech_secs[i][0], speech_secs[i][1]):
                    
                    labels[j-1] = 1
                    labels[j] = 1

        return labels

In [7]:
df2 = pd.read_csv('secWise_labs.csv')

In [18]:
MFCC = T.MFCC(sample_rate=SAMPLE_RATE, n_mfcc=13,melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23, "center": False})

class VAD_Dataset(Dataset):
    def __init__(self, df) -> None:
        #super().__init__()
        self.df = df
        self.sr = 16000

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):

        aud_part = self.df.loc[index][0]
        labels = self.df.loc[index][1]
        
        if aud_part[-2] == '_':
            n_sec = (int)(aud_part[-1])
            aud_path = aud_part[:-2]+'.wav'
        elif aud_part[-3] == '_':
            n_sec = (int)(aud_part[-2:])
            aud_path = aud_part[:-3]+'.wav'
        
        signal00, sam00, dur00 = AudioUtils.open(aud_path)
        sig00 = torch.from_numpy(signal00[n_sec*SAMPLE_RATE:((n_sec+1)*SAMPLE_RATE)]).unsqueeze(0)
        feats = get_feats(sig00)
        
        #mfcc_tens = MFCC(sig00).to(dtype=torch.float)
        #labels = AnnotUtils.get_labs_for_secs(AnnotUtils.get_speech_secs(annot_path), dur)
        feats = torch.Tensor(feats)
        #mfcc_tens = torch.flatten(mfcc_tens)
        
        return feats, labels

In [19]:
dataset = VAD_Dataset(df=df2)

In [59]:
oo = [1,2,6,8,5]
oo = torch.Tensor(oo)

oo

tensor([1., 2., 6., 8., 5.])

In [20]:
num_items = len(dataset)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(dataset, [num_train, num_val])

In [21]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

train_loader = DataLoader(
    train_ds, 
    batch_size=32, 
    shuffle=True,
    )
test_loader = DataLoader(
    val_ds, 
    batch_size=32, 
    shuffle=True
    )

In [23]:
for inputs, labels in test_loader:
    print(inputs.shape, labels.dtype)

torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([32, 222]) torch.int64
torch.Size([12, 222]) torch.int64


In [25]:
inputs[0]

tensor([-7.1260e+02, -5.1048e+00,  4.2926e-01, -3.6796e+00, -8.1637e+00,
        -3.3296e+00, -5.8317e+00, -2.5638e+00, -4.5651e+00, -5.3132e-01,
        -3.1790e+00, -3.2634e-01, -3.6515e+00,  0.0000e+00,  1.2397e-05,
         6.6746e-05,  0.0000e+00,  2.3214e-05,  2.6933e-05,  0.0000e+00,
         2.6091e-05,  2.8571e-06,  4.8581e-06,  5.5949e-06,  1.0981e-06,
         6.9628e-06,  0.0000e+00,  4.1414e-06,  4.1027e-07,  3.0381e-06,
         8.6939e-07,  2.5001e-06,  9.2541e-07,  2.1684e-06,  6.8253e-07,
         1.9716e-06,  4.0196e-07,  2.5458e-06,  7.0445e-07,  1.8971e-06,
         1.3292e-06,  1.0244e-06,  1.6156e-06,  6.5729e-07,  1.0984e-06,
         1.0285e-06,  6.6421e-07,  9.9493e-07,  1.1352e-06,  8.2644e-07,
         8.9646e-07,  1.3203e-06,  7.3814e-07,  7.2106e-07,  7.0390e-07,
         1.0820e-06,  8.0269e-07,  7.2209e-07,  7.2177e-07,  7.6267e-07,
         8.4216e-07,  8.8391e-07,  7.5846e-07,  6.2674e-07,  6.2656e-07,
         7.1407e-07,  7.4889e-07,  7.5021e-07,  7.5

In [26]:
def acc_fn(lab, pred):
    pred = torch.round(pred)
    correct = (lab == pred).float()
    return correct.mean().item()

In [17]:
ssoo = torch.randn(32,222)

lin1 = nn.Linear(222, 111)
ac1 = nn.ReLU()
lin2 = nn.Linear(111, 50)
ac2 = nn.ReLU()
lin3 = nn.Linear(50, 1)
sig = nn.Sigmoid()

x = lin3(ac2(lin2(ac1(lin1(ssoo)))))
x = sig(x)

print((x.squeeze()))

tensor([0.5161, 0.4964, 0.4924, 0.4932, 0.4706, 0.4831, 0.5080, 0.4907, 0.4903,
        0.4869, 0.4872, 0.5047, 0.5026, 0.5171, 0.5092, 0.4767, 0.4666, 0.4721,
        0.4947, 0.5098, 0.5021, 0.5025, 0.4994, 0.5217, 0.4911, 0.4733, 0.4889,
        0.4744, 0.4906, 0.4843, 0.4944, 0.4890], grad_fn=<SqueezeBackward0>)


In [27]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

class FFNNwithFeats(nn.Module):
    def __init__(self) -> None:
        super(FFNNwithFeats, self).__init__()

        self.lin1 = nn.Linear(222, 111)
        self.ac1 = nn.ReLU()
        self.bn1 = nn.BatchNorm1d(111)

        self.lin2 = nn.Linear(111, 50)
        self.ac2 = nn.ReLU()
        self.bn2 = nn.BatchNorm1d(50)

        self.lin3 = nn.Linear(50, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = self.bn1(self.ac1(self.lin1(x)))
        x = self.bn2(self.ac2(self.lin2(x)))
        x = self.sig(x)
        return x


In [131]:
criterion = nn.BCELoss()
model = FFNNwithFeats()
rand = torch.randn(5,222)

model.eval()
with torch.inference_mode():
    out = model(rand)

print(out, out.shape, out.dtype, labels[0:5], labels[0:5].dtype)

loss = criterion(out.squeeze(), labels[0:5].float())
loss

tensor([[0.5063],
        [0.5229],
        [0.5292],
        [0.5230],
        [0.5234]]) torch.Size([5, 1]) torch.float32 tensor([1, 1, 1, 0, 0]) torch.int64


tensor(0.6894)

In [28]:
#torch.cuda.empty_cache()
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

model00 = FFNNwithFeats().to(device)

criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model00.parameters(), lr=0.001)
#optimizer = optim.Adam(model00.parameters(), lr=0.001)

In [29]:
import statistics

torch.manual_seed(13)
torch.cuda.manual_seed(13)

epochs = 20

for epoch in range(epochs):

    acc = []

    for batch in train_loader:

        inputs, labels = batch

        inputs = inputs.to(device)
        labels = labels.to(device)

        model00.train()
        #outputs = torch.round(model00(inputs))
        outputs = model00(inputs)
        print(outputs.squeeze(), labels.float())
        loss = criterion(outputs.squeeze(), labels.float())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        acc.append(acc_fn(labels.float(), outputs.float()))

    accuracie = statistics.mean(acc)

    print(f'Epoch [{epoch+1}/{epochs}], Accuracy: {accuracie}, Loss: {loss}')



tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], grad_fn=<SqueezeBackward0>) tensor([1., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 0.,
        0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1.])


ValueError: Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 50])) is deprecated. Please ensure they have the same size.

In [28]:
sampl = torch.randn(32,1,13,81)

l1 = nn.Conv2d(1,32,kernel_size=(3,3), padding=(1,1))
relu = nn.ReLU()
maxp1 = nn.MaxPool2d(kernel_size=(2,2))

# --------------------------------- #

l2 = nn.Conv2d(32, 64, kernel_size=(3,3), padding=(1,1))
maxp2 = nn.MaxPool2d(kernel_size=(2,2))

l3 = nn.Conv2d(64, 128, kernel_size=(3,3), padding=(1,1))
maxp3 = nn.MaxPool2d(kernel_size=(3,3))

linear = nn.Linear(768, 1)
sigmoid = nn.Sigmoid()

x = maxp1(relu(l1(sampl)))
x = maxp2(relu(l2(x)))
x = maxp3(relu(l3(x)))
x = x.view(x.size(0), -1)
x = sigmoid(linear(x))

print(x.shape)

#x = sigmoid(linear(x)).squeeze()

# (torch.Size([32, 13, 4, 27]), torch.Size([32, 1404]))

x.squeeze().shape

torch.Size([32, 1])


torch.Size([32])

In [29]:
torch.manual_seed(13)
torch.cuda.manual_seed(13)

class ConvVAD(nn.Module):
    def __init__(self) -> None:
        super(ConvVAD, self).__init__()

        self.l1 = nn.Conv2d(1,32,kernel_size=(3,3), padding=(1,1))
        self.relu = nn.ReLU()
        self.maxp1 = nn.MaxPool2d(kernel_size=(2,2))

        # --------------------------------- #

        self.l2 = nn.Conv2d(32, 64, kernel_size=(3,3), padding=(1,1))
        self.maxp2 = nn.MaxPool2d(kernel_size=(2,2))

        # --------------------------------- #
        
        self.l3 = nn.Conv2d(64, 128, kernel_size=(3,3), padding=(1,1))
        self.maxp3 = nn.MaxPool2d(kernel_size=(3,3))

        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        
        x = self.maxp1(self.relu(self.l1(x)))
        x = self.maxp2(self.relu(self.l2(x)))
        x = self.maxp3(self.relu(self.l3(x)))
        x = x.view(x.size(0), -1)
        x = self.sigmoid(self.linear(x)).squeeze()

        return x

In [33]:
torch.cuda.empty_cache()
device = 'cuda:1' if torch.cuda.is_available() else 'cpu'

model00 = ConvVAD().to(device)

criterion = nn.BCELoss()
#optimizer = optim.SGD(model00.parameters(), lr=0.001)
optimizer = optim.Adam(model00.parameters(), lr=0.001)

In [34]:
import statistics

torch.manual_seed(13)
torch.cuda.manual_seed(13)

epochs = 20

for epoch in range(epochs):

    acc = []

    for batch in train_loader:

        inputs, labels = batch

        inputs = inputs.to(device)
        labels = labels.to(device, dtype=torch.float)

        model00.train()
        outputs = torch.round(model00(inputs))
        loss = criterion(outputs.float(), labels.float())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        acc.append(acc_fn(labels.float(), outputs.float()))

    accuracie = statistics.mean(acc)

    print(f'Epoch [{epoch+1}/{epochs}], Accuracy: {accuracie}, Loss: {loss}')


Epoch [1/20], Accuracy: 0.360604956441996, Loss: 78.5714340209961
Epoch [2/20], Accuracy: 0.3622448979591837, Loss: 50.000003814697266
Epoch [3/20], Accuracy: 0.3622448979591837, Loss: 50.000003814697266
Epoch [4/20], Accuracy: 0.3614249272005899, Loss: 64.28572082519531
Epoch [5/20], Accuracy: 0.3597849854553232, Loss: 92.85714721679688
Epoch [6/20], Accuracy: 0.3634748544011797, Loss: 28.571430206298828
Epoch [7/20], Accuracy: 0.3614249272005899, Loss: 64.28572082519531
Epoch [8/20], Accuracy: 0.360604956441996, Loss: 78.5714340209961
Epoch [9/20], Accuracy: 0.36101494182129296, Loss: 71.42857360839844
Epoch [10/20], Accuracy: 0.360604956441996, Loss: 78.5714340209961
Epoch [11/20], Accuracy: 0.360604956441996, Loss: 78.5714340209961
Epoch [12/20], Accuracy: 0.3622448979591837, Loss: 50.000003814697266
Epoch [13/20], Accuracy: 0.3622448979591837, Loss: 50.000003814697266
Epoch [14/20], Accuracy: 0.3622448979591837, Loss: 50.000003814697266
Epoch [15/20], Accuracy: 0.36101494182129296

In [35]:
test_acc = []
for batch in test_loader:

    
    inputs, labels = batch

    inputs = inputs.to(device)
    labels = labels.to(device, dtype=torch.float)

    model00.eval()
    with torch.inference_mode():
        y_preds = torch.round(model00(inputs))

    test_acc.append(acc_fn(labels.float(), y_preds.float()))

print(statistics.mean(test_acc))

0.3591666668653488
