In [1]:
import pandas as pd
import math
import sys
import os
import ast
import logging
import boto3
import random 

from scipy.io.wavfile import read as read_wav
import numpy as np
import torchaudio
import librosa

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
class AccentDataset(Dataset):
    def __init__(self,
                 dataset_path="/work/data/cv-corpus-7.0-2021-07-21/en/clips",
                 entries_path = "/work/data/cv-corpus-7.0-2021-07-21/en/train.tsv",
                 chosen_accents = ["us", "england", "indian", "canada",  "australia"],
                 audio_size=150000,
                 purpose="train",
                 train_valid_split=0.9,
                 **_):
        self.dataset_path = dataset_path
        self.audio_size = audio_size

        

        self.entries = pd.read_csv(f"{entries_path}", sep = None, engine = 'python')#.query(f"dataset == '{data_type}'")
        self.entries = self.entries[pd.notna(self.entries['accent'])]

        max_elements = min(self.entries["accent"].value_counts()[chosen_accents])
        balanced_entries = pd.DataFrame()
        for acc in chosen_accents:
            if purpose == "train":
                balanced_entries = balanced_entries.append(self.entries.query(f"accent == '{acc}'").iloc[:int(max_elements*train_valid_split)])
            elif purpose == "valid":
                balanced_entries = balanced_entries.append(self.entries.query(f"accent == '{acc}'").iloc[int(max_elements*train_valid_split):max_elements])

        self.entries = balanced_entries
        acc_map = self.entries["accent"].unique()
        self.entries = self.entries.sample(frac=1)
        ls = []
        for i in range(len(self.entries["accent"])):
            ls.append(np.where(self.entries["accent"].iloc[i] == acc_map)[0][0])
        self.entries['encoded_label'] = torch.nn.functional.one_hot(torch.tensor(ls)).split(1)

        self.bundle = torchaudio.pipelines.WAV2VEC2_BASE
        self.model = self.bundle.get_model().to("cuda:0")
        #self.entries["filepath"] = [os.path.join(dataset_path,"".join([os.path.basename(self.entries["filename"][i][:-3]),"mp3"])) for i in range(len(self.entries))]
        #self.entries["filepath"] = [os.path.join("/work/data/cv-corpus-7.0-2021-07-21/en/clips",self.entries["path"][i]) for i in range(len(self.entries))]
        self.cuda0 = torch.device('cuda:0')

    def __getitem__(self, index):
        entry = self.entries.iloc[index]


        waveform, sample_rate = torchaudio.load(os.path.join(self.dataset_path,entry["path"]))
        waveform = waveform.to("cuda:0")
        if sample_rate != self.bundle.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.bundle.sample_rate)

        if waveform.shape[1]< self.audio_size:
            waveform = F.pad(waveform, (0,self.audio_size - waveform.shape[1]), "constant", 0)
            start = 0
        else:
            start = random.choice(range(0,waveform.shape[1]-self.audio_size))
        waveform = waveform[:,start:start+self.audio_size]
        
        with torch.inference_mode():
            features, _ = self.model.extract_features(waveform)

        return torch.cat(features).to(self.cuda0), entry["encoded_label"].to(self.cuda0)[0]

    def __len__(self):
        return len(self.entries)

In [3]:
#s3 = boto3.resource('s3',aws_access_key_id="AKIAVFT6EWR3ZJ6J7RN4",
#    aws_secret_access_key="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")
#s3.Bucket("sagemaker-studio-355673093239-n7bauqajc2k").download_file("results/wav2vec2_fairseq_base_ls960.pth", 
#'/home/ubuntu/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth')


In [4]:
torch.cuda.empty_cache()

In [5]:
dataset = AccentDataset()

FileNotFoundError: [Errno 2] No such file or directory: '/work/data/cv-corpus-7.0-2021-07-21/en/train.tsv'

In [38]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=12, out_channels=640, kernel_size=8)
        self.fc1 = nn.Linear(9216, 640)
        self.rnn = nn.LSTM(
            input_size=640,
            hidden_size=480,
            num_layers=1,
            batch_first=True,
            bidirectional=True
        )
        self.dropout = nn.Dropout(0.1)
        self.fc2 = nn.Linear(960*468, 128)
        self.fc3 = nn.Linear(128, 5)

    def forward(self, x):
        x = x.permute(0,2,1,3).flatten(2)
        x = F.relu(self.fc1(x))
        x, _ = self.rnn(x)
        x = self.dropout(x)
        x = x.flatten(1)
        x = F.relu(self.fc2(x))
        x = F.log_softmax(self.fc3(x))
        return x

#     def forward(self, x):
# #         x = x.permute(0,2,1,3)#.flatten(2)
#         x = F.relu(self.conv1(x).flatten(2))
#         x, _ = self.rnn(x)
#         x = self.dropout(x)
#         x = x.flatten(1)
#         x = F.relu(self.fc2(x))
#         x = F.log_softmax(self.fc3(x))
#         return x


net = Net()

In [39]:
sample = torch.rand(size=(1,12, 468, 768))
net(sample)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (468x9216 and 350821x640)

In [24]:
torch.cuda.empty_cache()

In [25]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [26]:
from torch.utils.data import DataLoader

loader = DataLoader(dataset,32)
losses = []
for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(iter(loader), 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        losses.append(loss)
        print(loss)

        # print statistics
        running_loss += loss.item()
        if i % 1000 == 0:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
            torch.save({
            'iter': i,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            }, f"checkpoints/model_e{epoch}_i{i}.pt")

  x = F.log_softmax(self.fc3(x))


tensor(1.6142, device='cuda:0', grad_fn=<DivBackward1>)
[1,     1] loss: 0.001
tensor(23.6677, device='cuda:0', grad_fn=<DivBackward1>)
tensor(11.2179, device='cuda:0', grad_fn=<DivBackward1>)
tensor(25.0864, device='cuda:0', grad_fn=<DivBackward1>)
tensor(20.9494, device='cuda:0', grad_fn=<DivBackward1>)
tensor(12.1525, device='cuda:0', grad_fn=<DivBackward1>)
tensor(17.4682, device='cuda:0', grad_fn=<DivBackward1>)
tensor(12.4533, device='cuda:0', grad_fn=<DivBackward1>)
tensor(4.5464, device='cuda:0', grad_fn=<DivBackward1>)
tensor(2.5964, device='cuda:0', grad_fn=<DivBackward1>)
tensor(2.3200, device='cuda:0', grad_fn=<DivBackward1>)
tensor(2.1865, device='cuda:0', grad_fn=<DivBackward1>)
tensor(2.0004, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.6296, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.6797, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.8676, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.6819, device='cuda:0', grad_fn=<DivBackward1>)
tensor(1.5339, dev

KeyboardInterrupt: 

In [27]:
torch.save({
        'iter': i,
        'model_state_dict': net.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'losses': losses,
        }, f"checkpoints/model_e{epoch}_i{i}.pt")

In [15]:
def get_embeddings(net,x):
    x = x.permute(0,2,1,3).flatten(2)
    x = F.relu(net.fc1(x))
    x, _ = net.rnn(x)
    x = x.flatten(1)
    x = F.relu(net.fc2(x))
    return x

In [36]:
get_embeddings(net,dataset[2][0].reshape((1,12, 468, 768)))

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  9.0974,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  2.7690,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  

In [33]:
import torchmetrics

acc = torchmetrics.Accuracy().to('cuda:0')
for i in range(len(dataset)):
    print(torch.argmax(net(dataset[0][0].reshape((1,12, 468, 768)))))

  x = F.log_softmax(self.fc3(x))


tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
tensor(3, device='cuda:0')
t

KeyboardInterrupt: 

In [32]:
dataset.entries

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment,encoded_label
124538,ed07f16faa58a669b58de0207a11eb954d0076aa7b8c1c...,common_voice_en_19638562.mp3,"His areas of specialty include epistemology, K...",3,0,teens,male,us,en,,"[[tensor(1), tensor(0), tensor(0), tensor(0), ..."
389642,2f561cb0b97147db51bc0ad72c0fab191a48ce2aab48a4...,common_voice_en_21090775.mp3,The lyrics are largely based on Milton Ager's ...,2,0,twenties,male,canada,en,,"[[tensor(0), tensor(0), tensor(0), tensor(1), ..."
630395,8449194e8b74b81d761d7a3786cdd5d099aff4f33737c7...,common_voice_en_22937498.mp3,The sounds that are used today were formed in ...,2,0,twenties,female,australia,en,,"[[tensor(0), tensor(0), tensor(0), tensor(0), ..."
202405,4281670244e0d04d6bf3c10df6181337e8b00e3a0725e5...,common_voice_en_19667270.mp3,"It consisted of three counties - Washington, G...",2,0,twenties,male,england,en,,"[[tensor(0), tensor(1), tensor(0), tensor(0), ..."
343778,f12a988e9d251485294edec8740dfec779b45127c1bb06...,common_voice_en_21942909.mp3,"Before moving into writing, he worked as an at...",2,0,twenties,male,indian,en,,"[[tensor(0), tensor(0), tensor(1), tensor(0), ..."
...,...,...,...,...,...,...,...,...,...,...,...
321223,68caafdf147c42aa78002abb9c2ac5b61f36b8c8ec4773...,common_voice_en_23664990.mp3,"Nevertheless, Bahloo is sometimes seen walking...",2,0,twenties,male,england,en,,"[[tensor(0), tensor(1), tensor(0), tensor(0), ..."
173182,dcda0922112645666327ffc15a63c2908424fa970ddbed...,common_voice_en_22024639.mp3,"Certain functions of the Council, however, rem...",2,0,seventies,female,us,en,,"[[tensor(1), tensor(0), tensor(0), tensor(0), ..."
584365,cbbf0b4896d532ed8bfad4d3a82e356c699d62cc1f88fa...,common_voice_en_21638312.mp3,While it is less efficient than the general al...,2,0,twenties,male,australia,en,,"[[tensor(0), tensor(0), tensor(0), tensor(0), ..."
160927,73ddd9cb79e56d3edfa1902dac26767323ad23e4418538...,common_voice_en_20994723.mp3,"Susan knows where the gold is, but can't speak.",2,0,twenties,female,england,en,,"[[tensor(0), tensor(1), tensor(0), tensor(0), ..."
