In [3]:
import pandas as pd
import math
import sys
import os
import ast
import logging
import boto3
import random 

from scipy.io.wavfile import read as read_wav
import numpy as np
import torchaudio
import librosa

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [17]:
class AccentDataset(Dataset):
    def __init__(self,
                 dataset_path="/work/data/cv-corpus-7.0-2021-07-21/en/clips",
                 entries_path = "/work/data/cv-corpus-7.0-2021-07-21/en/train.tsv",
                 chosen_accents = ["us", "england", "indian", "canada",  "australia"],
                 audio_size=150000,
                 purpose="train",
                 train_valid_split=0.9,
                 **_):
        self.dataset_path = dataset_path
        self.audio_size = audio_size

        

        self.entries = pd.read_csv(f"{entries_path}", sep = None, engine = 'python')#.query(f"dataset == '{data_type}'")
        self.entries = self.entries[pd.notna(self.entries['accent'])]

        max_elements = min(self.entries["accent"].value_counts()[chosen_accents])
        balanced_entries = pd.DataFrame()
        for acc in chosen_accents:
            if purpose == "train":
                balanced_entries = balanced_entries.append(self.entries.query(f"accent == '{acc}'").iloc[:int(max_elements*train_valid_split)])
            elif purpose == "valid":
                balanced_entries = balanced_entries.append(self.entries.query(f"accent == '{acc}'").iloc[int(max_elements*train_valid_split):max_elements])

        self.entries = balanced_entries
        acc_map = self.entries["accent"].unique()
        self.entries = self.entries.sample(frac=1)
        ls = []
        for i in range(len(self.entries["accent"])):
            ls.append(np.where(self.entries["accent"].iloc[i] == acc_map)[0][0])
        self.entries['encoded_label'] = torch.nn.functional.one_hot(torch.tensor(ls)).split(1)

        self.bundle = torchaudio.pipelines.WAV2VEC2_BASE
        self.model = self.bundle.get_model().to("cuda:0")
        #self.entries["filepath"] = [os.path.join(dataset_path,"".join([os.path.basename(self.entries["filename"][i][:-3]),"mp3"])) for i in range(len(self.entries))]
        #self.entries["filepath"] = [os.path.join("/work/data/cv-corpus-7.0-2021-07-21/en/clips",self.entries["path"][i]) for i in range(len(self.entries))]
        self.cuda0 = torch.device('cuda:0')

    def __getitem__(self, index):
        entry = self.entries.iloc[index]


        waveform, sample_rate = torchaudio.load(os.path.join(self.dataset_path,entry["path"]))
        waveform = waveform.to("cuda:0")
        if sample_rate != self.bundle.sample_rate:
            waveform = torchaudio.functional.resample(waveform, sample_rate, self.bundle.sample_rate)

        if waveform.shape[1]< self.audio_size:
            waveform = F.pad(waveform, (0,self.audio_size - waveform.shape[1]), "constant", 0)
            start = 0
        else:
            start = random.choice(range(0,waveform.shape[1]-self.audio_size))
        waveform = waveform[:,start:start+self.audio_size]
        
        with torch.inference_mode():
            features, _ = self.model.extract_features(waveform)

        return torch.cat(features).to(self.cuda0), entry["encoded_label"].to(self.cuda0)[0]

    def __len__(self):
        return len(self.entries)

In [5]:
#s3 = boto3.resource('s3',aws_access_key_id="AKIAVFT6EWR3ZJ6J7RN4",
#    aws_secret_access_key="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")
#s3.Bucket("sagemaker-studio-355673093239-n7bauqajc2k").download_file("results/wav2vec2_fairseq_base_ls960.pth", 
#'/home/ubuntu/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960.pth')


In [101]:
torch.cuda.empty_cache()

In [3]:
dataset = AccentDataset()

In [4]:
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(9216, 4200)
        self.rnn = nn.LSTM(
            input_size=9216,
            hidden_size=480,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )
        self.fc2 = nn.Linear(960, 128)
        self.fc3 = nn.Linear(128, 17)

    def forward(self, x):
        x = x.permute(0,2,1,3).flatten(2) # flatten all dimensions except batch
        #x = F.relu(self.fc1(x))
        x, _ = self.rnn(x)
        x = F.relu(self.fc2(x))
        x = F.log_softmax(self.fc3(x))
        return x


net = Net().to("cuda:0")

In [56]:
#torch.cuda.empty_cache()

In [57]:
dataset[0][0].permute(1,0,2).flatten(1).shape

torch.Size([468, 9216])

In [None]:
from transformers import Wav2Vec2ForSequenceClassification

In [5]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001, momentum=0.9)

In [13]:
from torch.utils.data import DataLoader

loader = DataLoader(dataset,24)
losses = []
for epoch in range(5):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(iter(loader), 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        labels = torch.tensor(labels,dtype=torch.long)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss)

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
            torch.save({
            'iter': i,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            }, f"model_e{epoch}_i{i}.pt")

  labels = torch.tensor(labels,dtype=torch.long)
  x = F.log_softmax(self.fc3(x))


[1,  2000] loss: 6.148
[1,  4000] loss: 6.148


KeyboardInterrupt: 