In [3]:
import torch

vggish_model = torch.hub.load('harritaylor/torchvggish', 'vggish')
vggish_model.eval()

Using cache found in /Users/shashi/.cache/torch/hub/harritaylor_torchvggish_master
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish-10086976.pth" to /Users/shashi/.cache/torch/hub/checkpoints/vggish-10086976.pth
100%|██████████| 275M/275M [00:31<00:00, 9.20MB/s] 
Downloading: "https://github.com/harritaylor/torchvggish/releases/download/v0.1/vggish_pca_params-970ea276.pth" to /Users/shashi/.cache/torch/hub/checkpoints/vggish_pca_params-970ea276.pth
100%|██████████| 177k/177k [00:00<00:00, 3.31MB/s]


VGGish(
  (features): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): ReLU(inplace=True)
    (13): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (14): ReLU(inplace=True)
    (15): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False

In [10]:
import urllib
url, filename = ("http://soundbible.com/grab.php?id=1698&type=wav", "bus_chatter.wav")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)

In [29]:
vggish_model.forward(filename).shape

torch.Size([19, 128])

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchaudio
import pandas as pd
import os
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [16]:
import glob

times_info = {}
for file in glob.glob("./vis-data-256/*_times.txt"):
    with open(file, "r") as f:
        data = f.readlines()
        data = [line.strip().split() for line in data]
        data = [(float(line[0]), line[1]) for line in data if line[1] != "None"]
        times_info[file.split("/")[-1].split("_")[0]] = data

train_file = "./vis-data-256/train.txt"
train_info = []

with open(train_file, "r") as f:
    data = f.readlines()
    data = [line.strip().split() for line in data]
    print("Number of training samples: ", len(data))
    for d in data:
        if d[0] not in train_info:
            train_info.append(d[0])

test_file = "./vis-data-256/test.txt"
with open(test_file, "r") as f:
    data = f.readlines()
    data = [line.strip().split() for line in data]
    print("Number of testing samples: ", len(data))

#Focus only on one frame in a video
times_info_updated_train = {}
all_labels = []
train_data = []

for key, value in times_info.items():
    if key in train_info:
        if key not in times_info_updated_train:
            if len(value) > 1:
                times_info_updated_train[key] = value[0]
                all_labels.append(value[1][1])
                train_data.append((key, value[1][1]))

times_info_updated_test = {}
test_data = []

for key, value in times_info.items():
    if key not in train_info:
        if key not in times_info_updated_test:
            if len(value) > 1:
                times_info_updated_test[key] = value[0]
                test_data.append((key, value[1][1]))
                
len(times_info_updated_train), len(times_info_updated_test) 


Number of training samples:  733
Number of testing samples:  244


(729, 243)

In [17]:
len(train_data), len(test_data)

(729, 243)

In [12]:
label_encoder = LabelEncoder()
integer_labels = label_encoder.fit_transform(all_labels)

tensor_labels = torch.from_numpy(integer_labels)

In [23]:
class CustomDataset(Dataset):
    def __init__(self, data, audio_dir):
        self.data = data
        self.audio_dir = audio_dir

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        audio_file_path = os.path.join(self.audio_dir, self.data[idx][0] + ".wav")
        waveform, sample_rate = torchaudio.load(audio_file_path)
        label = torch.tensor(label_encoder.transform([self.data[idx][1]]), dtype=torch.long)
        return audio_file_path, label
    
train_dataset = CustomDataset(data=train_data, audio_dir='./train_audio')
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)

test_dataset = CustomDataset(data=test_data, audio_dir='./test_audio')
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [25]:
class AudioClassifier(nn.Module):
    def __init__(self, num_classes):
        super(AudioClassifier, self).__init__()
        self.pooling = nn.AdaptiveAvgPool2d((1, 128))
        self.fc = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pooling(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

In [28]:
model = AudioClassifier(num_classes=50)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
vggish_model = torch.hub.load('harritaylor/torchvggish', 'vggish')
vggish_model.eval()

for epoch in tqdm(range(50)):
    for batch in train_dataloader:
        optimizer.zero_grad()
        outputs = []
        
        for audio_file_path, label in zip(batch[0], batch[1]):
            x = vggish_model.forward(audio_file_path)
            x = x.unsqueeze(0)
            output = model(x)
            outputs.append(output)
        outputs = torch.cat(outputs) 
        labels = batch[1].squeeze()
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()

Using cache found in /Users/shashi/.cache/torch/hub/harritaylor_torchvggish_master
100%|██████████| 50/50 [1:45:31<00:00, 126.62s/it]


In [30]:
correct = 0
total = 0

model.eval()  
with torch.no_grad():  
    for batch in test_dataloader:
        outputs = []
        labels = []
        for audio_file_path, label in zip(batch[0], batch[1]):
            x = vggish_model.forward(audio_file_path)
            x = x.unsqueeze(0)
            output = model(x)
            outputs.append(output)
            labels.append(label)

        outputs = torch.cat(outputs) 
        labels = torch.stack(labels).squeeze()
        _, predicted = torch.max(outputs, 1)  
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy =  correct / total
print('Accuracy:', accuracy*100)

Accuracy: 10.699588477366255
