In [1]:
from collections import Counter, defaultdict
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import pandas as pd
import torchaudio
import torch
import os
import torch.nn as nn
from torch import optim
from torch.nn import Embedding, Linear, LSTM, Module
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm

In [17]:
train_metadata_path = '/home/kesav/Documents/kesav/research/code_files/LibriPhrase/metadata/train.xlsx'
test_metadata_path = '/home/kesav/Documents/kesav/research/code_files/LibriPhrase/metadata/test_at.xlsx'
audio_path = '/home/kesav/Documents/kesav/research/code_files/LibriPhrase/database/LibriPhrase_diffspk_all'
model_path='/home/kesav/Documents/kesav/research/code_files/KWS-Baseline/try2/models/'
sampling_rate=16000
no_of_samples=31840

In [18]:
device=('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using {device} for device')

Using cuda for device


In [19]:
train_metadata = pd.read_excel(train_metadata_path)
test_metadata =  pd.read_excel(test_metadata_path)

In [21]:
len(set(train_metadata['anchor_text']))

310

In [22]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(train_metadata['anchor_text'])

train_metadata['label_encoded'] = label_encoder.transform(train_metadata['anchor_text'])

In [33]:
train_metadata = train_metadata[['anchor', 'anchor_text','label_encoded']]
leng=sorted(train_metadata['label_encoded'].unique())
leng

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [7]:
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=sampling_rate,
    win_length=400,
    hop_length=160,
    n_mels=40
)

In [8]:
class Dataset():
    def __init__(self,metadata, audio_dir,transformation, sampling_rate, no_of_samples, device):
        self.metadata = metadata
        
        #audio
        self.audio_dir = audio_dir
        self.device=device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = sampling_rate
        self.num_of_samples = no_of_samples
        
         
    def __len__(self):
        return len(self.metadata)
    
    def __getitem__(self, index):
        temp=[]
        audio_path = self._get_anchor_audio_path(index)
        label = self._get_label(index)
        
        #extracting audio features by applying spectrogram
        signal, sr = torchaudio.load(audio_path)
        
        audio_feat= signal.to(self.device)
        
        audio_feat = self._resample_if_necessary(audio_feat, sr)
        
        audio_feat = self._mix_down_if_necessary(audio_feat)
        
        audio_feat = self.right_pad_if_necessary(audio_feat)
        
        audio_feat = self.transformation(audio_feat)
        
        # audio_feat=audio_feat.squeeze(0)
        
        # audio_feat=audio_feat.transpose(1,2)
        
        return audio_feat, label

    def _get_anchor_audio_path(self, index):
        sub_path = f"{self.metadata.iloc[index, 0]}" 
        path = os.path.join(self.audio_dir, sub_path)
        return path
    
    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal
    
    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    
    def right_pad_if_necessary(self, signal):
        length_signal = signal.shape[1]
        if length_signal < self.num_of_samples:
            num_missing_samples = self.num_of_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
    
    def _get_label(self, index):
        return self.metadata.iloc[index, 2]
    

In [9]:
train_data=Dataset(train_metadata, audio_path, mel_spectrogram, sampling_rate, no_of_samples, device)

In [10]:
signal, label = train_data[6]
print('audio_signal', signal.shape)
print(label)

audio_signal torch.Size([1, 40, 200])
madame


In [11]:
from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):

    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1,
                out_channels=16,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16,
                out_channels=32,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32,
                out_channels=64,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64,
                out_channels=128,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 4* 14, 310)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        # print(x.shape)
        x = self.flatten(x)
        # print(x.shape)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions


if __name__ == "__main__":
    cnn = CNNNetwork()
    summary(cnn.cuda(), (1, 40, 200))



Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 16, 21, 101]         --
|    └─Conv2d: 2-1                       [-1, 16, 42, 202]         160
|    └─ReLU: 2-2                         [-1, 16, 42, 202]         --
|    └─MaxPool2d: 2-3                    [-1, 16, 21, 101]         --
├─Sequential: 1-2                        [-1, 32, 11, 51]          --
|    └─Conv2d: 2-4                       [-1, 32, 23, 103]         4,640
|    └─ReLU: 2-5                         [-1, 32, 23, 103]         --
|    └─MaxPool2d: 2-6                    [-1, 32, 11, 51]          --
├─Sequential: 1-3                        [-1, 64, 6, 26]           --
|    └─Conv2d: 2-7                       [-1, 64, 13, 53]          18,496
|    └─ReLU: 2-8                         [-1, 64, 13, 53]          --
|    └─MaxPool2d: 2-9                    [-1, 64, 6, 26]           --
├─Sequential: 1-4                        [-1, 128, 4, 14]          --
|    └─

In [12]:
BATCH_SIZE = 128
EPOCHS = 10
LEARNING_RATE = 0.001

In [13]:
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)

In [14]:
cnn = CNNNetwork().to(device)
print(cnn)

CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=7168, out_features=2, bias=True)
  (softmax): Softmax(dim=1)
)


In [15]:

# def train(model, train_loader, criterion, optimizer, device, num_epochs):
    
#     for epoch in range(num_epochs):
#         for batch_idx, (audio_feat, targets) in enumerate(tqdm(train_loader)):
            
#             # Get data to cuda if possible
#             audio_feat = audio_feat.to(device=device)
#             targets = targets.to(device=device)
            
        
#             scores=model(audio_feat)
          
#             loss = criterion(scores, targets)
#             # print(loss)
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#         print(f"Epoch:{epoch} loss is {loss.item()}") 

In [16]:
def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")

In [17]:
def train(model, data_loader, loss_fn, optimiser, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

In [18]:
# initialise loss funtion + optimiser
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(),
                                lr=LEARNING_RATE)

# train model
train(cnn, train_loader, loss_fn, optimizer, device, EPOCHS)

Epoch 1


AttributeError: 'tuple' object has no attribute 'to'

In [34]:
!pip install soundata

Collecting soundata
  Downloading soundata-0.1.2-py3-none-any.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting librosa>=0.8.0 (from soundata)
  Using cached librosa-0.10.0.post2-py3-none-any.whl (253 kB)
Collecting numpy<=1.20,>=1.16 (from soundata)
  Downloading numpy-1.20.0-cp39-cp39-manylinux2010_x86_64.whl (15.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.4/15.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting jams (from soundata)
  Downloading jams-0.3.4.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting audioread>=2.1.9 (from librosa>=0.8.0->soundata)
  Using cached audioread-3.0.0-py3-none-any.whl
INFO: pip is looking at multiple versions of librosa to determine whic

In [2]:
import soundata

dataset = soundata.initialize('urbansound8k')
dataset.download()  # download the dataset
dataset.validate()  # validate that all the expected files are there

example_clip = dataset.choice_clip()  # choose a random example clip
print(example_clip)  # see the available data


INFO: Downloading ['all'] to /home/kesav/sound_datasets/urbansound8k
INFO: [all] downloading UrbanSound8K.tar.gz
 86%|████████▌ | 4.83G/5.61G [14:22:46<2:19:43, 100kB/s] 


KeyboardInterrupt: 

In [4]:
import pandas as pd
df=pd.read_csv('/home/kesav/Downloads/UrbanSound8K.csv')

In [5]:
df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
